In [1]:
import numpy as np
import pandas as pd
import re
import spacy
import nltk
from nltk.corpus import stopwords

In [2]:
# Read the negative-sentiment tweets CSV directly from its GitHub raw URL
TWEETS_CSV_URL = 'https://raw.githubusercontent.com/Deepakvk18/Twitter-Web-Scraping/main/negative_sentiment_tweets.csv'
tweets = pd.read_csv(TWEETS_CSV_URL)

# Preview the first rows to check the file parsed as expected
tweets.head()

Unnamed: 0,tweets,labels,Sentiment
0,@IvorMectin1 He clearly hates white people. Mi...,no_poverty,1
1,"@vinodkapri New India, hide d poor =poverty va...",no_poverty,1
2,"If not, join us today.\nIf you encounter any C...",no_poverty,1
3,"""All the Loadshedding, Unemployment, R500-bill...",no_poverty,1
4,"""Ideals of Swami Vivekananda?""\n99% poverty in...",no_poverty,1


In [3]:
# (rows, columns) of the raw tweets frame
tweets.shape

(71888, 3)

In [4]:
# Load the 'en_core_web_sm' spaCy pipeline once; preprocess() reuses this
# module-level object for every tweet instead of reloading it per call.
nlp = spacy.load('en_core_web_sm')

# Text Pre-processing
def preprocess(text):
    """Clean one tweet and return its lemmatised content words.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only when it consists purely of alphabetic characters and is not
    punctuation, not URL-like, not a stopword and not an @mention. The lemma
    of every kept token is joined with single spaces.

    Tokens such as '#', digits, emoji or phone numbers are dropped because
    they are not alphabetic. NOTE(review): whether the word of a hashtag
    survives depends on how the tokenizer splits '#word' - confirm if
    hashtags matter downstream.

    Parameters
    ----------
    text : str
        Raw tweet text. A non-string value (e.g. ``None`` or the float NaN
        that stands in for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        Space-separated lemmas of the kept tokens, or ``''`` when the input
        is missing, blank, or contains no token that passes the filter.
    """
    # Guard: missing values are not text, and a blank string cannot contain
    # an alphabetic token, so skip the pipeline and return the empty result.
    if not isinstance(text, str) or not text.strip():
        return ''

    doc = nlp(text)

    # Lemmas of the tokens that pass every filter, in document order
    filtered_tokens = [
        token.lemma_.strip()
        for token in doc
        if token.is_alpha                                        # letters only
        and not token.is_punct                                   # punctuation
        and not token.like_url                                   # links
        and not token.is_stop                                    # stopwords
        and not re.match('.*@[A-Za-z0-9_]+', token.text)         # @mentions
    ]

    return ' '.join(filtered_tokens)

In [5]:
# Show one tweet before and after cleaning.
# random_state pins the draw so the same tweet (and therefore the same
# output) appears on every Restart & Run All; change it to inspect another.
sample_tweet = tweets['tweets'].sample(1, random_state=42).iloc[0]

# Visual separator printed above and below the section title
divider = "============================================================================================================="

print(sample_tweet)
print(divider)
print("After Preprocessing")
print(divider)
print(preprocess(sample_tweet))

@BSNLCorporate @BSNL_HP @TRAI @DoT_India @AshwiniVaishnaw @RajinderGargBJP  BSNL 01893-233027 telecom services is faulty again after numerous request and complains no resolution provided . Bsnl is scamming consumer with pathetic services. HELP !@forum_consumer @CMOFFICEHP
After Preprocessing
BSNL telecom service faulty numerous request complain resolution provide Bsnl scamme consumer pathetic service HELP


In [6]:
%%time
processed_tweets = tweets['tweets'].apply(preprocess)
processed_tweets[:5]

CPU times: user 13min 19s, sys: 603 ms, total: 13min 20s
Wall time: 13min 21s


0    clearly hate white people million India live c...
1    New India hide d poor poverty vanish burn d ra...
2    join today encounter childlabour victim contac...
3    Loadshedding unemployment billion PPE Poverty ...
4    ideal Swami Vivekananda poverty India tieup co...
Name: tweets, dtype: object

In [7]:
# Pair each cleaned tweet with its original label
cleaned_columns = {'tweets': processed_tweets, 'label': tweets['labels']}
processed_df = pd.DataFrame(cleaned_columns)
print(processed_df.shape)

# Cleaning can collapse distinct raw tweets into the same text; keep only the
# first occurrence of each (tweet, label) pair and report the new size.
processed_df = processed_df.drop_duplicates()
print(processed_df.shape)

(71888, 2)
(57674, 2)


In [8]:
# Write the cleaned tweets and labels to a CSV in the working directory;
# index=False leaves the DataFrame index out of the file.
processed_df.to_csv('preprocessed_tweets.csv', index=False)