In [11]:
import pandas as pd
import seaborn
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import spacy
import string
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer
spacy.load('en')
parser = English()

In [12]:
STOPWORDS = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS)) 
SYMBOLCHARS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","''"]

In [13]:
OffensiveLangDF = pd.read_csv('../Data/Offensive Language Dataset/Cleaned_labeled_data.csv')
spamSmsDF = pd.read_csv('../Data/SMS Spam Dataset/Cleaned_SMSSpamCollection.csv')
politicalDF = pd.read_csv('../Data/Indian Political Tweets Dataset/cleaned-tweets.csv')

In [14]:
OffensiveLangDF.head()

Unnamed: 0,class,category,text
0,1,OffensiveLang,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
1,1,OffensiveLang,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
2,1,OffensiveLang,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
3,1,OffensiveLang,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
4,1,OffensiveLang,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."


In [15]:
spamSmsDF.head()

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
politicalDF.head()

Unnamed: 0,category,text
0,NOTPOL,Bumping dj sefs mixtape nowww this is my music...
1,NOTPOL,#ieroween THE STORY OF IEROWEEN! THE VIDEO ->>...
2,NOTPOL,trick or treating at the mall today; ZOO! last...
3,NOTPOL,@Ussk81 PMSL!!! I try not to stare but I can't...
4,NOTPOL,@Sc0rpi0n676 btw - is there a remote chance i ...


In [17]:
def tokenizeText(textData):

    textData = textData.strip().replace("\n", " ").replace("\r", " ")
    textData = textData.lower()
    tokens = parser(textData)

    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    
    # Remove Stop Words
    tokens = [tok for tok in tokens if tok.lower() not in STOPWORDS]
    # Remove Symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLCHARS]
    # Remove words with less than 3 characters
    tokens = [tok for tok in tokens if len(tok) >= 3]
    # Remove Non-Alphabetic Characters
    tokens = [tok for tok in tokens if tok.isalpha()]
    
    # Stemming of Words
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]
    
    tokens = list(set(tokens))
    textData = ' '.join(tokens[:])
    return textData

In [18]:
OffensiveLangDF['text'] = hateSpeechDF['text'].apply(lambda x:tokenizeText(x))
OffensiveLangDF['text']

0               bad tyga place dwn cold cuffin hoe boy dat
1                         start confu fuck shit bitch dawg
2                                         tranni look like
3                          true faker told hear shit bitch
4                  blow somebodi fuck faith claim hoe shit
                               ...                        
23348                         cute yaya avi tho idea sleep
23349              cute new friend yea darki lol kno allow
23350             know bird gummi earli coff say worm morn
23351      broke redneck drove gone crazi heart wrong babi
23352    ntac combin yellow dahlia beauti eileen orang ...
Name: text, Length: 23353, dtype: object

In [19]:
spamSmsDF['text'] = spamSmsDF['text'].apply(lambda x:tokenizeText(x))
spamSmsDF['text']

0       bugi wat crazi great cine jurong avail world b...
1                                        oni lar wif joke
2       free entri final txt appli cup win tkt comp wk...
3                                       dun earli hor say
4                                  usf live goe think nah
                              ...                        
5567    time pound rate easi tri prize nation minut cl...
5568                                     home go esplanad
5569                                    piti mood suggest
5570       week interest buy guy act free bitch gave like
5571                                            true rofl
Name: text, Length: 5572, dtype: object

In [20]:
politicalDF['text'] = politicalDF['text'].apply(lambda x:tokenizeText(x))
politicalDF['text']

0               bump nowww mixtap new music skooooool sef
1                              ieroween frank video stori
2           over treat ran mall year trick zoo today left
3                   stare compuls tri view help pmsl like
4                                   remot btw later chanc
                              ...                        
6055    nepal excel busi india forum prime minist pres...
6056                sister yogi adityanath begin newindia
6057    mamataoffici profit pay appeas india certain t...
6058    jihadi plight feel india remind imagin muslim ...
6059    voter women reform rjd support tmc ncp better ...
Name: text, Length: 6060, dtype: object