This notebook combines several existing hate speech datasets into one large dataset that is roughly balanced and contains enough data for generating word embeddings
## Import libraries

In [1]:
import gensim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

from utilities.data_preprocessors import simple_preprocess, preprocess, re_encode_tweet_targets, re_encode_ethos_targets
from utilities.data_loaders import load_data, load_binary_data, load_multi_label_data
from utilities.data_visualizers import view_sentence

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mig\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load datasets: hate-offensive-speech, slur-corpus, ethos, reddit

In [2]:
# Read the two raw corpora from disk: the hate/offensive tweets and the slur corpus.
tweets_path = './data/hate-speech-and-offensive-language/hate-offensive-speech.csv'
slurs_path = './data/slur-corpus/kurrek.2020.slur-corpus.csv'

tweets = load_data(tweets_path)
slurs = load_data(slurs_path, 'slur-corpus')

# Show the tweets frame as this cell's output.
tweets

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1
...,...,...
25291,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,1
25292,"you've gone and broke the wrong heart baby, an...",2
25294,young buck wanna eat!!.. dat nigguh like I ain...,1
25295,youu got wild bitches tellin you lies,1


In [3]:
# Report the dtype of the target column first, then of the text column.
for column in ('class', 'tweet'):
    print(tweets[column].dtype)

int64
object


In [4]:
# Display the slur-corpus frame as loaded (columns: body, gold_label).
slurs

Unnamed: 0,body,gold_label
0,Fuck that I dont wanna watch tranny porn.,DEG
1,Opie just wanted to have a good time. The same...,DEG
2,Fuck that faggot Fallon. Fucking sissy boy act...,DEG
3,BbBB...b.b..b.bb but OP's a faggot,DEG
4,Who even uses the word tranny except for trans...,NDG
...,...,...
39998,*As an alcoholic who on occasion has one too m...,DEG
39999,"It's not inaccurate, it's indoctrination of yo...",DEG
40000,What did you expect?? when they banned /r/nigg...,NDG
40001,That was hilariously bad. They used the insult...,NDG


## Explore dataset


In [5]:
# List the distinct target values of each dataset (tweets first, then slurs).
for frame, target in ((tweets, 'class'), (slurs, 'gold_label')):
    print(frame[target].unique())

[2 1 0]
['DEG' 'NDG' 'HOM' 'CMP' 'APR' nan]


In [6]:
# Show how many rows fall into each target value, per dataset (tweets first, then slurs).
for frame, target in ((tweets, 'class'), (slurs, 'gold_label')):
    print(frame[target].value_counts())

1    19190
2     4163
0     1430
Name: class, dtype: int64
DEG    20530
NDG    16727
HOM     1998
APR      553
CMP      189
Name: gold_label, dtype: int64


## Clean data with nans

In [7]:
# Positional (0-based) row numbers whose gold_label is missing.
label_is_missing = slurs['gold_label'].isna().to_numpy()
nan_idxs = np.flatnonzero(label_is_missing)
nan_idxs

array([ 5733,  5734, 22333, 22334, 31815, 31816], dtype=int64)

In [8]:
# Inspect the rows with a missing gold_label; nan_idxs holds positions, hence .iloc.
slurs.iloc[nan_idxs]

Unnamed: 0,body,gold_label
5733,Nigger nigger nigger nigger nigger nigger.,
5734,,
22333,Mosquito,
22334,,
31815,They advertised posting it to r/atheism in the...,
31816,,


#### Some rows with NaN in gold_label still have significant slur words in the comment, so set their gold_label to DEG

In [9]:
# Cast both columns to str so every cell has a uniform Python type.
# Side effect: missing values (NaN) become the literal string 'nan', which is
# what the later clean-up step compares against.
slurs['body'] = slurs['body'].astype('str')
slurs['gold_label'] = slurs['gold_label'].astype('str')

# manually change gold_label of relevant comments to one of the classes DEG, NDG, HOM, CMP, and APR
# 5733 and 31815 are row *positions* (they came from np.where), while .loc is
# label-based, so translate positions to index labels first. This is identical
# for a 0..n-1 index and stays correct if the index is ever anything else.
relabel_positions = [5733, 31815]
slurs.loc[slurs.index[relabel_positions], 'gold_label'] = 'DEG'

In [10]:
# Sanity check: after the astype('str') cast, a formerly missing body is a str.
type(slurs.loc[5734, 'body'])

str

In [11]:
# Sanity check: after the astype('str') cast, a formerly missing gold_label is a str.
type(slurs.loc[5734, 'gold_label'])

str

In [12]:
# Re-display the originally-missing rows to confirm which ones were relabelled to DEG.
slurs.iloc[nan_idxs]

Unnamed: 0,body,gold_label
5733,Nigger nigger nigger nigger nigger nigger.,DEG
5734,,
22333,Mosquito,
22334,,
31815,They advertised posting it to r/atheism in the...,DEG
31816,,


#### Now that the NaN-labelled rows containing significant slurs have been relabelled, drop the remaining rows that have NaN in either the comment or the gold_label

In [13]:
# identify rows whose label or comment is the string 'nan' (a missing value that was cast to str)
label_missing = slurs['gold_label'].eq('nan')
body_missing = slurs['body'].eq('nan')
nan_rows = np.flatnonzero((label_missing | body_missing).to_numpy())
nan_rows

array([ 5734, 22333, 22334, 31816], dtype=int64)

In [14]:
# drop rows with nan in comment or label
# nan_rows contains row *positions* (from np.where) but DataFrame.drop expects
# index *labels*; map positions to labels so the right rows are removed even
# when the index is not a plain 0..n-1 range.
slurs.drop(slurs.index[nan_rows], inplace=True)
# Renumber the index 0..n-1 so later positional and label lookups agree again.
slurs.reset_index(drop=True, inplace=True)

In [15]:
# Verify the clean-up: this should display an empty frame.
# Both columns were cast to str earlier, so a missing value is the literal
# string 'nan' and isnull() alone can never flag it. Check both forms, in both
# columns, so this verification can actually fail if a row was missed.
still_missing = (
    slurs['gold_label'].isnull() | slurs['gold_label'].eq('nan')
    | slurs['body'].isnull() | slurs['body'].eq('nan')
)
nan_idxs_new = np.where(still_missing)[0]
slurs.iloc[nan_idxs_new]

Unnamed: 0,body,gold_label


In [16]:
# Re-check the label set and per-label counts after the clean-up.
gold_labels = slurs['gold_label']
print(gold_labels.unique())
print(gold_labels.value_counts())

['DEG' 'NDG' 'HOM' 'CMP' 'APR']
DEG    20532
NDG    16727
HOM     1998
APR      553
CMP      189
Name: gold_label, dtype: int64


In [17]:
# Display the cleaned slur frame (index was reset after dropping rows).
slurs

Unnamed: 0,body,gold_label
0,Fuck that I dont wanna watch tranny porn.,DEG
1,Opie just wanted to have a good time. The same...,DEG
2,Fuck that faggot Fallon. Fucking sissy boy act...,DEG
3,BbBB...b.b..b.bb but OP's a faggot,DEG
4,Who even uses the word tranny except for trans...,NDG
...,...,...
39994,*As an alcoholic who on occasion has one too m...,DEG
39995,"It's not inaccurate, it's indoctrination of yo...",DEG
39996,What did you expect?? when they banned /r/nigg...,NDG
39997,That was hilariously bad. They used the insult...,NDG


## Preprocess text
- remove trailing whitespaces
- remove non-alphanumeric characters
- lower sentences
- tokenize
- remove stop words
- lemmatize or stem word
- encode target column to numbers

<u>or use gensim.utils.simple_preprocess as callback of self.apply()</u>

In [18]:
# Apply the project helper `preprocess` (utilities.data_preprocessors) to every tweet, overwriting the column.
# NOTE(review): the helper's exact behaviour is not visible in this notebook — confirm it matches the steps listed above.
tweets['tweet'] = tweets['tweet'].apply(preprocess)

In [19]:
# Apply the project helper `simple_preprocess` (utilities.data_preprocessors) to every tweet, overwriting the column.
# NOTE(review): this runs on the output of `preprocess` from the previous cell — confirm the intended order of the two helpers.
tweets['tweet'] = tweets['tweet'].apply(simple_preprocess)

rt  as a woman you should not  complain about cleaning up your house amp as a man you should always take the trash out
rt  boy dats coldtyga dwn bad for cuffin dat hoe in the st place
rt  dawg rt  you ever fuck a bitch and she start to cry you be confused as shit
rt   she look like a tranny
rt  the shit you hear about me might be true or it might be faker than the bitch who told it to ya
the shit just blows meclaim you so faithful and down for somebody but still fucking with hoes
i can not just sit up and hate on another bitch  i got too much shit going on
cause i am  tired of you big bitches coming for us skinny girls
amp you might not get ya bitch back amp thats that
hobbies include fighting mariam bitch
keeks is a bitch she curves everyone  lol i walked into a conversation like this smh
murda gang bitch its gang land
so hoes that smoke are losers   yea  go on ig
bad bitches is the only thing that i like
bitch get up off me
bitch nigga miss me with it
bitch plz whatever
bitch who do 

In [20]:
# Apply the project helper `preprocess` (utilities.data_preprocessors) to every slur-corpus comment, overwriting the column.
slurs['body'] = slurs['body'].apply(preprocess)

In [21]:
# Print a sample of comments via the project helper `view_sentence`, called with limit=50.
# NOTE(review): the recorded output shows raw, unprocessed text even though the previous cell
# applies `preprocess` — the cells may have been executed out of order; confirm with Restart & Run All.
view_sentence(slurs['body'], limit=50)

phase:
Fuck that I dont wanna watch tranny porn. 

phase:
Opie just wanted to have a good time. The same old rant about Africans from Ant and the same old joke about tranny cock from Jim. Opie admitted and accepted his place on the show. It was the two other queens who couldn’t keep it together. Opie ✊

phase:
Fuck that faggot Fallon. Fucking sissy boy acting like an 8 year old playing slumber party games. Put Norm &amp; Louis on tour together. See how much fucking money they make together. 

phase:
BbBB...b.b..b.bb but OP's a faggot

phase:
Who even uses the word tranny except for transphobes lol

phase:
I'm just happy that anyone wants this faggot boipussy. 💓

phase:
this sub is big, /r/boottoobig is bigger But the admin running the page is a nigger 

phase:
Yeah but what about real sports like nigger rugby and vertical nigger soccer.

phase:
Faggot Jk. It looks awesome! 

phase:
That's like saying the the n word belong to hispanics. The word faggot has always applied specifically to

In [22]:
# Apply the project helper `simple_preprocess` (utilities.data_preprocessors) to every comment, overwriting the column.
# NOTE(review): this runs on the output of `preprocess` from an earlier cell — confirm the intended order of the two helpers.
slurs['body'] = slurs['body'].apply(simple_preprocess)

fuck that i dont wanna watch tranny porn
opie just wanted to have a good time the same old rant about africans from ant and the same old joke about tranny cock from jim opie admitted and accepted his place on the show it was the two other queens who couldnt keep it together opie
fuck that faggot fallon fucking sissy boy acting like an  year old playing slumber party games put norm amp louis on tour together see how much fucking money they make together
bbbbbbbbb but op  a faggot
who even uses the word tranny except for transphobes lol
i am  just happy that anyone wants this faggot boipussy
this sub is big rboottoobig is bigger but the admin running the page is a nigger
yeah but what about real sports like nigger rugby and vertical nigger soccer
faggot jk it looks awesome
that is  like saying the the n word belong to hispanics the word faggot has always applied specifically to male homosexuals and bisexuals it does not belong to her group
i would  say the red pilling happened anyway sin

## Encode target/y labels/outputs/columns of the slur dataset
- recall that it has the ff labels and its respective counts
- ['DEG' 'NDG' 'HOM' 'CMP' 'APR']
- DEG    20532
- NDG    16727
- HOM     1998
- APR      553
- CMP      189

In [23]:
# Fit a LabelEncoder on the string labels, then overwrite the column with its integer codes.
encoder = LabelEncoder()
encoder.fit(slurs['gold_label'])
slurs['gold_label'] = encoder.transform(slurs['gold_label'])

# Show the resulting codes and how many rows each one has.
encoded_labels = slurs['gold_label']
print(encoded_labels.unique())
print(encoded_labels.value_counts())

[2 4 3 1 0]
2    20532
4    16727
3     1998
0      553
1      189
Name: gold_label, dtype: int64


summary:
- DEG has label 2, NDG has label 4, HOM has label 3, APR has label 0, and CMP has label 1
- re-encode the non-derogatory label (NDG, currently 4) as 1, so that 1 now represents the non-derogatory class (merged with CMP) instead of just noise

In [24]:
# Merge the non-derogatory class (NDG) into the code currently used by CMP.
# The two codes are looked up from the fitted encoder rather than hard-coded as
# 4 and 1, so the merge stays correct if the label set (and therefore the
# encoder's code assignment) ever changes.
ndg_code, cmp_code = encoder.transform(['NDG', 'CMP'])
slurs.loc[slurs['gold_label'] == ndg_code, 'gold_label'] = cmp_code

# Confirm the remaining codes and their counts, then display the frame.
print(slurs['gold_label'].unique())
print(slurs['gold_label'].value_counts())
slurs

[2 1 3 0]
2    20532
1    16916
3     1998
0      553
Name: gold_label, dtype: int64


Unnamed: 0,body,gold_label
0,"[fuck, dont, wanna, watch, tranny, porn]",2
1,"[opie, wanted, good, time, old, rant, african,...",2
2,"[fuck, faggot, fallon, fucking, sissy, boy, ac...",2
3,"[bbbbbbbbb, op, faggot]",2
4,"[even, us, word, tranny, except, transphobes, ...",1
...,...,...
39994,"[alcoholic, occasion, one, many, beer, lying, ...",2
39995,"[inaccurate, indoctrination, woman, fuck, nigg...",2
39996,"[expect, banned, rniggers, user, never, gone, ...",1
39997,"[hilariously, bad, used, insult, tranny, episo...",1


## new label mappings are
- DEG/HATE 2
- NDG/CMP/NEITHER 1
- HOM 3
- APR/OFFENSIVE 0

## re-encode labels of hate tweet dataset
- hate class currently 0 can be lumped in with derogatory class of slur dataset, so encode to 2
- offensive class currently 1 can be lumped in with appropriative class of slur dataset, so encode to 0
- neither class currently 2 can be lumped in with non derogatory of slur dataset, so encode to 1

In [25]:
# Re-map the tweet dataset's class ids with the project helper so they line up with the slur labels.
tweets['class'] = tweets['class'].apply(re_encode_tweet_targets)

# Show the re-encoded codes and their counts, then display the frame.
tweet_classes = tweets['class']
print(tweet_classes.unique())
print(tweet_classes.value_counts())
tweets

[1 0 2]
0    19190
1     4163
2     1430
Name: class, dtype: int64


Unnamed: 0,tweet,class
0,"[woman, complain, cleaning, house, man, always...",1
1,"[boy, dat, coldtyga, dwn, bad, cuffin, dat, ho...",0
2,"[dawg, ever, fuck, bitch, start, cry, confused...",0
3,"[look, like, tranny]",0
4,"[shit, hear, might, true, might, faker, bitch,...",0
...,...,...
25291,"[muthafin, lie, right, tl, trash, mine, bible,...",0
25292,"[gone, broke, wrong, heart, baby, drove, redne...",1
25294,"[young, buck, wanna, eat, dat, nigguh, like, a...",0
25295,"[youu, got, wild, bitch, tellin, lie]",0


in summary classes are now to be the ff:
- NEITHER 1
- OFFENSIVE 0
- HATE 2
- HOMONYM 3

# Ethos Dataset Exploration
- the subsequent cells do not include ethos_multi (the multi-labeled ethos dataset) because it contains the same comments as ethos_binary; the only difference is that ethos_multi adds further features about each comment beyond whether it "isHate" or not, such as whether the comment is about sexuality, religion, race, etc.
- comments and isHate column values on both binary and multi datasets are the same

In [26]:
# Load the binary ethos dataset with the project helper `load_binary_data` (utilities.data_loaders), using its defaults.
# NOTE(review): the displayed frame has token lists in 'comment', so the loader presumably preprocesses the text too — confirm.
ethos_binary = load_binary_data()

                                               comment  isHate
0            You should know women's sports are a joke     1.0
1      You look like Sloth with deeper Down’s syndrome     1.0
2    You look like Russian and speak like Indian. B...     1.0
3                 Women deserve to be abused, I guess.     1.0
4    Women are made for making babies and cooking d...     1.0
..                                                 ...     ...
993   From the midnight sun where the hot springs blow     0.0
994                        Don't say I'm not your type     0.0
995   And therefore never send to know for whom the...     0.0
996                      And I can't stand another day     0.0
997   All values, unless otherwise stated, are in U...     0.0

[998 rows x 2 columns]
X trains: ["You should know women's sports are a joke"
 'You look like Sloth with deeper Down’s syndrome'
 'You look like Russian and speak like Indian. Both are disgusting go kill yourself'
 'Women deserve to be abused,

## re encode columns of isHate
- re-encode the isHate values: 1 (hate/derogatory or offensive/appropriative) becomes (2 or 0), and 0 (non-derogatory) becomes 1

In [27]:
# Re-map the isHate values with the project helper `re_encode_ethos_targets`, overwriting the column.
ethos_binary['isHate'] = ethos_binary['isHate'].apply(re_encode_ethos_targets)
# Display the frame to check the new label values.
ethos_binary

Unnamed: 0,comment,isHate
0,"[you, should, know, woman, sport, are, joke]",2
1,"[you, look, like, sloth, with, deeper, down, s...",2
2,"[you, look, like, russian, and, speak, like, i...",2
3,"[woman, deserv, to, be, abus, guess]",2
4,"[woman, are, made, for, make, babi, and, cook,...",2
...,...,...
993,"[from, the, midnight, sun, where, the, hot, sp...",1
994,"[do, not, say, am, not, your, type]",1
995,"[and, therefor, never, send, to, know, for, wh...",1
996,"[and, cannot, stand, anoth, day]",1


## rename columns of slur, tweet, and ethos (binary) datasets and combine

In [28]:
# Give all three frames the same column names ('comment' for text, 'label' for target)
# so that they can be stacked into one frame.
column_renames = (
    (tweets, {'tweet': 'comment', 'class': 'label'}),
    (slurs, {'body': 'comment', 'gold_label': 'label'}),
    (ethos_binary, {'isHate': 'label'}),
)
for frame, mapping in column_renames:
    frame.rename(columns=mapping, inplace=True)

In [29]:
# Display the tweets frame after the column rename.
tweets

Unnamed: 0,comment,label
0,"[woman, complain, cleaning, house, man, always...",1
1,"[boy, dat, coldtyga, dwn, bad, cuffin, dat, ho...",0
2,"[dawg, ever, fuck, bitch, start, cry, confused...",0
3,"[look, like, tranny]",0
4,"[shit, hear, might, true, might, faker, bitch,...",0
...,...,...
25291,"[muthafin, lie, right, tl, trash, mine, bible,...",0
25292,"[gone, broke, wrong, heart, baby, drove, redne...",1
25294,"[young, buck, wanna, eat, dat, nigguh, like, a...",0
25295,"[youu, got, wild, bitch, tellin, lie]",0


In [30]:
# Display the slurs frame after the column rename.
slurs

Unnamed: 0,comment,label
0,"[fuck, dont, wanna, watch, tranny, porn]",2
1,"[opie, wanted, good, time, old, rant, african,...",2
2,"[fuck, faggot, fallon, fucking, sissy, boy, ac...",2
3,"[bbbbbbbbb, op, faggot]",2
4,"[even, us, word, tranny, except, transphobes, ...",1
...,...,...
39994,"[alcoholic, occasion, one, many, beer, lying, ...",2
39995,"[inaccurate, indoctrination, woman, fuck, nigg...",2
39996,"[expect, banned, rniggers, user, never, gone, ...",1
39997,"[hilariously, bad, used, insult, tranny, episo...",1


In [31]:
# Display the ethos frame after the column rename.
ethos_binary

Unnamed: 0,comment,label
0,"[you, should, know, woman, sport, are, joke]",2
1,"[you, look, like, sloth, with, deeper, down, s...",2
2,"[you, look, like, russian, and, speak, like, i...",2
3,"[woman, deserv, to, be, abus, guess]",2
4,"[woman, are, made, for, make, babi, and, cook,...",2
...,...,...
993,"[from, the, midnight, sun, where, the, hot, sp...",1
994,"[do, not, say, am, not, your, type]",1
995,"[and, therefor, never, send, to, know, for, wh...",1
996,"[and, cannot, stand, anoth, day]",1


In [32]:
# Stack the three frames row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
frames_to_combine = [tweets, slurs, ethos_binary]
df = pd.concat(frames_to_combine, ignore_index=True)
df

Unnamed: 0,comment,label
0,"[woman, complain, cleaning, house, man, always...",1
1,"[boy, dat, coldtyga, dwn, bad, cuffin, dat, ho...",0
2,"[dawg, ever, fuck, bitch, start, cry, confused...",0
3,"[look, like, tranny]",0
4,"[shit, hear, might, true, might, faker, bitch,...",0
...,...,...
65775,"[from, the, midnight, sun, where, the, hot, sp...",1
65776,"[do, not, say, am, not, your, type]",1
65777,"[and, therefor, never, send, to, know, for, wh...",1
65778,"[and, cannot, stand, anoth, day]",1


again in summary classes are now to be the ff:
- NEITHER 1
- OFFENSIVE 0
- HATE 2
- HOMONYM 3

In [33]:
# Summarise the labels of the combined frame: distinct codes, then rows per code.
# (The bare `df` that used to lead this cell was a no-op: only a cell's final
# expression is displayed, so it has been removed.)
print(df['label'].unique())
print(df['label'].value_counts())

[1 0 2 3]
2    22395
1    21644
0    19743
3     1998
Name: label, dtype: int64


In [34]:
# Check what type a single row of the combined frame is when selected by position.
type(df.iloc[0])

pandas.core.series.Series

## Export data to accessible .csv file

In [35]:
# Persist the combined frame to disk. to_csv is called with its defaults, so the
# row index is written as a leading unnamed column.
# NOTE(review): 'comment' appears to hold Python lists (see the frame display above);
# these would be stored as their string form — confirm the downstream reader parses them back.
df.to_csv('./data/hate-speech-data-cleaned.csv')