In [1]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
import seaborn as sns
from typing_extensions import dataclass_transform
from sklearn.preprocessing import LabelEncoder
from google.colab import files

In [2]:
!pip install nltk



In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, RegexpStemmer, WordNetLemmatizer
from textblob import TextBlob

In [4]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
"""
DataFrame structure for the program
Columns:
Label( 0-Neutral(non-hate); 1-Hate )
Tweet
Tweet_P (Tweet after preprocessing)

"""

'\nDataFrame structure for the program\nColumns:\nLabel( 0-Neutral(non-hate); 1-Hate )\nTweet\nTweet_P (Tweet after preprocessing)\n\n'

In [6]:
#Remove stop word
stop_words = set(stopwords.words('english'))
def remove_stopwords(tweet):
  words = word_tokenize(tweet)
  filtered_words = [word for word in words if word not in stop_words]
  return ' '.join(filtered_words)

In [7]:
#Applying Stemming
p_stem = PorterStemmer()
def p_stemming(tweet):
  words = word_tokenize(tweet)
  stemmed_words = [p_stem.stem(word) for word in words]
  return ' '.join(stemmed_words)

r_stem = RegexpStemmer('ing$|s$|e$|able$', min=4)
def r_stemming(tweet):
  words = word_tokenize(tweet)
  stemmed_words = [r_stem.stem(word) for word in words]
  return ' '.join(stemmed_words)

In [8]:
#Applying Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatizing(tweet):
  words = word_tokenize(tweet)
  lemmatized_word = [lemmatizer.lemmatize(word) for word in words]
  return ' '.join(lemmatized_word)

""""Preprocessing for data from source 1"""

In [9]:
Data_1 = pd.read_csv('/content/Data_Source_1.csv')

In [10]:
Data_1.head(5)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [11]:
Data_1.columns

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')

In [12]:
Data_1 = Data_1.drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])

In [13]:
Data_1.rename(columns={'class': 'Label', 'tweet': 'Tweet'}, inplace=True)

In [14]:
#class label: 0 - hate speech 1 - offensive language 2 - neither
#Convert to required labels
Data_1.loc[Data_1['Label']==0, 'Label'] = 1
Data_1.loc[Data_1['Label']==2, 'Label'] = 0

In [15]:
print(Data_1.isna().sum())
print(Data_1.shape)

Label    0
Tweet    0
dtype: int64
(24783, 2)


In [16]:
#Casefold to convert to lower case
Data_1['Tweet_P'] = Data_1.Tweet.str.casefold()

In [17]:
#Remove character except letter or space
remove = string.printable.replace(string.ascii_letters, '').replace(' ', '')
table = str.maketrans(remove, ' '*len(remove))
Data_1['Tweet_P'] = Data_1.Tweet_P.str.translate(table)

In [18]:
Data_1['Tweet_P'] = Data_1['Tweet_P'].apply(remove_stopwords)
Data_1['Tweet_P'] = Data_1['Tweet_P'].apply(p_stemming)
Data_1['Tweet_P'] = Data_1['Tweet_P'].apply(r_stemming)
Data_1['Tweet_P'] = Data_1['Tweet_P'].apply(lemmatizing)

In [19]:
# Download preprocessed file for further use
DataP_1=pd.DataFrame(Data_1)
DataP_1.to_csv('DataP_1.csv', index=False)
DataP_1.head()
files.download('DataP_1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
print(DataP_1.isna().sum())
print(DataP_1.shape)

Label      0
Tweet      0
Tweet_P    0
dtype: int64
(24783, 3)


In [21]:
DataP_1.sample(5)

Unnamed: 0,Label,Tweet,Tweet_P
18825,1,RT @dahliafoxx: sick of bitches bitching about...,rt dahliafoxx sick bitch bitch bitch
11515,1,If Kubiak was a player they'd pump him with pa...,kubiak player pump painkil tell stop pussi
8842,0,Doug Martin trash,doug martin trash
24517,1,"the microwave at work is broken, so my ghetto ...",microwav work broken ghetto as put hamburg pat...
2009,1,&lt;&lt;&lt;&lt;&lt;&lt;when bitches be doing ...,lt lt lt lt lt lt bitch much twitter


"""Preprocessing of data from source 2"""

In [22]:
Data_2 = pd.read_csv('/content/Data_Source_2.csv')

In [23]:
Data_2.head(5)

Unnamed: 0.1,Unnamed: 0,id,text,label,type,model_wrong,db.model_preds,status,round,split,annotator
0,1,10000,It should be legal for a husband to rape his wife,hate,notgiven,True,0.97457,dynabench entry,1,train,gnzuctwed3
1,2,10001,dalits are lowlife scum,hate,notgiven,False,0.08233,dynabench entry,1,test,cagnluiznm
2,3,10002,dalits are lowlives,hate,notgiven,True,0.92319,dynabench entry,1,train,cagnluiznm
3,4,10003,It was a better world when women dare not ques...,hate,notgiven,True,0.99006,dynabench entry,1,test,gnzuctwed3
4,5,10004,I don't work this hard so that those immigrant...,hate,notgiven,True,0.98836,dynabench entry,1,train,cagnluiznm


In [24]:
Data_2 = Data_2.drop(columns=['id', 'Unnamed: 0', 'type', 'model_wrong', 'db.model_preds', 'status', 'round', 'split', 'annotator'])
Data_2.rename(columns={'label': 'Label', 'text': 'Tweet'}, inplace=True)

In [25]:
#Convert to required labels
Data_2.loc[Data_2['Label']=='hate', 'Label'] = 1
Data_2.loc[Data_2['Label']=='nothate', 'Label'] = 0
Data_2 = Data_2.reindex(columns=['Label', 'Tweet'])
Data_2.sample(5)

Unnamed: 0,Label,Tweet
3479,0,My golden retriever bitch is having puppies
33164,0,i still believe chimps are fragile animals tha...
21703,0,Sometimes it is nice to just sit back and take...
24756,0,i love the idea that muslims should be able to...
30102,0,White identity is a degenerate concept fabrica...


In [26]:
Data_2['Tweet_P'] = Data_2.Tweet.str.casefold()
remove = string.printable.replace(string.ascii_letters, '').replace(' ', '')
table = str.maketrans(remove, ' '*len(remove))
Data_2['Tweet_P'] = Data_2.Tweet_P.str.translate(table)
Data_2['Tweet_P'] = Data_2['Tweet_P'].apply(remove_stopwords)
Data_2['Tweet_P'] = Data_2['Tweet_P'].apply(p_stemming)
Data_2['Tweet_P'] = Data_2['Tweet_P'].apply(r_stemming)
Data_2['Tweet_P'] = Data_2['Tweet_P'].apply(lemmatizing)

In [27]:
DataP_2=pd.DataFrame(Data_2)
DataP_2.to_csv('DataP_2.csv', index=False)
DataP_2.head()
files.download('DataP_2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
print(DataP_2.isna().sum())
print(DataP_2.shape)

Label      0
Tweet      0
Tweet_P    0
dtype: int64
(40623, 3)


In [29]:
DataP_2.sample(5)

Unnamed: 0,Label,Tweet,Tweet_P
1462,1,"to hindus: I hope our paths never meet, like e...",hindu hop path never meet lik ever
18378,0,I love to be rude to people who call you lezza,lov rud peopl call lezza
39332,1,I am not racist but I wouldn't want a white wife,racist want whit wif
4186,0,Mike Pence comes off as such an insufferable l...,mik penc com insuffer liar
31070,0,When a broad is too hard to get a YT so she ha...,broad hard get yt mak roll tarpap


"""Preprocessing of data from source 2"""

In [30]:
Data_3 = pd.read_csv('/content/Data_Source_3.csv')

In [31]:
Data_3.dropna()
Data_3.head(5)

Unnamed: 0,Category,CounterSpeech,commentText,date,hasReplies,id,likes,numberOfReplies,timestamp,user,Community,replies
0,Default,False,she is one of the last brave white women with ...,5 years ago,False,Ugw-_mfnTo9SUniGGTN4AaABAg,0,0.0,1354698838535,Ik hou van surfen,jews,[]
1,Default,False,AMEN PASTOR!!!!!!!!,2 years ago,False,UgiMOgO3u4akkngCoAEC,0,0.0,1449290856420,julius smith,lgbt,[]
2,28,True,What kind of God is it that Hates people & Cur...,4 years ago,False,UgzemCMbg-ri1ri7sCt4AaABAg,0,0.0,1386234965293,gary wersley,jews,[]
3,8,True,Question - Which comment is racist and why?\n\...,3 years ago,False,UghFhzrURF1svHgCoAEC,0,0.0,1420475245589,Les Ordway,jews,[]
4,8,True,That Israel fail that is nothing new. That Isr...,5 years ago,False,Ugzr50sQdH0Ictuex214AaABAg,0,0.0,1354699034596,HamalachMichael,jews,[]


In [32]:
Data_3.columns

Index(['Category', 'CounterSpeech', 'commentText', 'date', 'hasReplies', 'id',
       'likes', 'numberOfReplies', 'timestamp', 'user', 'Community',
       'replies'],
      dtype='object')

In [33]:
Data_3 = Data_3.drop(columns=['Category', 'date', 'hasReplies', 'id', 'likes', 'numberOfReplies', 'timestamp', 'user', 'Community', 'replies'])

In [34]:
Data_3.rename(columns={'CounterSpeech': 'Label', 'commentText': 'Tweet'}, inplace=True)

In [35]:
#To convert it in required label
encoder = LabelEncoder()
Data_3['Label'] = encoder.fit_transform(Data_3['Label'])
{class_:index for index, class_ in enumerate (encoder.classes_)}

{False: 0, True: 1}

In [36]:
#To rearrange the wrong labels
Data_3.loc[Data_3['Label']==0, 'Label'] = 2
Data_3.loc[Data_3['Label']==1, 'Label'] = 0
Data_3.loc[Data_3['Label']==2, 'Label'] = 1

In [38]:
Data_3['Tweet_P'] = Data_3.Tweet.str.casefold()
remove = string.printable.replace(string.ascii_letters, '').replace(' ', '')
table = str.maketrans(remove, ' '*len(remove))
Data_3['Tweet_P'] = Data_3.Tweet_P.str.translate(table)

In [39]:
#To convert in required data type
Data_3['Tweet_P']=Data_3['Tweet_P'].apply(str)
Data_3['Tweet']=Data_3['Tweet'].apply(str)

In [40]:
Data_3['Tweet_P'] = Data_3['Tweet_P'].apply(remove_stopwords)
Data_3['Tweet_P'] = Data_3['Tweet_P'].apply(p_stemming)
Data_3['Tweet_P'] = Data_3['Tweet_P'].apply(r_stemming)
Data_3['Tweet_P'] = Data_3['Tweet_P'].apply(lemmatizing)

In [41]:
print(Data_3.isna().sum())
print(Data_3.shape)

Label      0
Tweet      0
Tweet_P    0
dtype: int64
(13924, 3)


In [53]:
DataP_3.sample(5)

Unnamed: 0,Label,Tweet,Tweet_P
793,0,so sad how white people act,sad whit peopl act
13389,0,To be honest im all for the death penalty why ...,honest im death penalti wast tax payer dollar ...
12901,0,Ashamed that this is our world,asham world
1583,1,derka! derka! Muhammed jihad,derka derka muham jihad
12825,0,As a white man i can say that people like this...,whit man say peopl lik punish ther end reason ...


In [43]:
DataP_3=pd.DataFrame(Data_3)
DataP_3.to_csv('DataP_3.csv', index=False)
DataP_3.head()
files.download('DataP_3.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

""""Preprocessing for data from source R"""

In [44]:
Data_R = pd.read_csv('/content/Data_Source_R.csv')

In [45]:
Data_R.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [46]:
print(Data_R.columns)
Data_R = Data_R.drop(columns=['id'])
Data_R.rename(columns={'label': 'Label', 'tweet': 'Tweet'}, inplace=True)
print(Data_R.isna().sum())
print(Data_R.shape)

Index(['id', 'label', 'tweet'], dtype='object')
Label    0
Tweet    0
dtype: int64
(31962, 2)


In [47]:
Data_R['Tweet_P'] = Data_R.Tweet.str.casefold()
remove = string.printable.replace(string.ascii_letters, '').replace(' ', '')
table = str.maketrans(remove, ' '*len(remove))
Data_R['Tweet_P'] = Data_R.Tweet_P.str.translate(table)
Data_R['Tweet_P'] = Data_R['Tweet_P'].apply(remove_stopwords)
Data_R['Tweet_P'] = Data_R['Tweet_P'].apply(p_stemming)
Data_R['Tweet_P'] = Data_R['Tweet_P'].apply(r_stemming)
Data_R['Tweet_P'] = Data_R['Tweet_P'].apply(lemmatizing)

In [48]:
DataP_R=pd.DataFrame(Data_R)
DataP_R.to_csv('DataP_R.csv', index=False)
DataP_R.head()
files.download('DataP_R.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
print(DataP_R.isna().sum())
print(DataP_R.shape)

Label      0
Tweet      0
Tweet_P    0
dtype: int64
(31962, 3)


In [50]:
DataP_R.sample(5)

Unnamed: 0,Label,Tweet,Tweet_P
16398,0,@user ðbihday #virginboy,user ðbihday virginboy
2325,0,#model i love u take with u all the time in ...,model lov u tak u tim urð± ðððð ð...
17616,0,ð download daytodaygk mobile app for free,ð download daytodaygk mobil app fre
30146,0,today's garden love,today garden lov
10305,1,adveisments naked fucking whores,advei nak fuck whor
