In [366]:
import pandas as pd
import numpy as np

In [367]:
# importing initial dataset
df_1 = pd.read_csv('../data/raw/labeled_data.csv')

The following dataset had 6 columns:

1) count = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF).

2) hate_speech = number of CF users who judged the tweet to be hate speech.

3) offensive_language = number of CF users who judged the tweet to be offensive.

4)neither = number of CF users who judged the tweet to be neither offensive nor non-offensive.

5)class = class label for majority of CF users. 0 - **hate speech** / 1 - **offensive language** / 2 - **neither**

6)tweet = content of the tweet

In [368]:
# printing out the info
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
Unnamed: 0            24783 non-null int64
count                 24783 non-null int64
hate_speech           24783 non-null int64
offensive_language    24783 non-null int64
neither               24783 non-null int64
class                 24783 non-null int64
tweet                 24783 non-null object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [369]:
# renaming the first columns as id
df_1.rename(columns={'Unnamed: 0':'id'}, inplace = True)
# increasing max length for all columns
pd.set_option('display.max_colwidth', -1)

In [370]:
# set id to index
df_1.set_index('id')

Unnamed: 0_level_0,count,hate_speech,offensive_language,neither,class,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;
...,...,...,...,...,...,...
25291,3,0,2,1,1,"you's a muthaf***in lie &#8220;@LifeAsKing: @20_Pearls @corey_emanuel right! His TL is trash &#8230;. Now, mine? Bible scriptures and hymns&#8221;"
25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, and drove me redneck crazy"
25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I aint fuckin dis up again
25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [371]:
# there are three different classes on this dataset, all of which vaguely or clearly offensive
df_1['class'].value_counts()

1    19190
2    4163 
0    1430 
Name: class, dtype: int64

In the next section I'll take a closer look at some of the comments for each class in order to see whether the count/vote system is represantitive of the actual Tweet content

In [372]:
# racist or homophobic comments
hate_speech = df.loc[df['class']==0] 

In [373]:
hate_speech.head(50)


Unnamed: 0_level_0,count,hate_speech,offensive_language,neither,positive,class,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
85,3,2,1,0,0,0,"""@Blackman38Tide: @WhaleLookyHere @HowdyDowdy11 queer"" gaywad"
90,3,3,0,0,0,0,"""@CB_Baby24: @white_thunduh alsarabsss"" hes a beaner smh you can tell hes a mexican"
111,3,3,0,0,0,0,"""@DevilGrimz: @VigxRArts you're fucking gay, blacklisted hoe"" Holding out for #TehGodClan anyway http://t.co/xUCcwoetmn"
186,3,3,0,0,0,0,"""@MarkRoundtreeJr: LMFAOOOO I HATE BLACK PEOPLE https://t.co/RNvD2nLCDR"" This is why there's black people and niggers"
204,3,2,1,0,0,0,"""@NoChillPaz: ""At least I'm not a nigger"" http://t.co/RGJa7CfoiT""\n\nLmfao"
206,3,2,1,0,0,0,"""@NotoriousBM95: @_WhitePonyJr_ Ariza is a snake and a coward"" but at least he isn't a cripple like your hero Roach lmaoo"
221,6,3,2,1,0,0,"""@RTNBA: Drakes new shoes that will be released by Nike/Jordan.... Yes, there's glitter on the shoes http://t.co/QCtPLxHEXM"" ....dudes a fag"
263,3,2,1,0,0,0,"""@TheoMaxximus: #GerrysHalloweenParty http://t.co/3ycrSrnjHc"" Halloween was yesterday stupid nigger"
317,3,2,1,0,0,0,"""@ashlingwilde: @ItsNotAdam is bored supposed to be cute, you faggot?"" Sometimes"
320,3,2,1,0,0,0,"""@bigbootybishopp: @white_thunduh lassen cc , the coach wants me"" i feel it youll get more my nig no doubt"


In [374]:
# neither offensive nor non-offensive speech
neutral = df.loc[df['class']== 2] 

In [375]:
neutral.head(50)

Unnamed: 0_level_0,count,hate_speech,offensive_language,neither,positive,class,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3,0,0,3,0,2,!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
40,3,0,1,2,0,2,""" momma said no pussy cats inside my doghouse """
63,3,0,0,3,0,2,"""@Addicted2Guys: -SimplyAddictedToGuys http://t.co/1jL4hi8ZMF"" woof woof hot scally lad"
66,3,0,1,2,0,2,"""@AllAboutManFeet: http://t.co/3gzUpfuMev"" woof woof and hot soles"
67,3,0,1,2,0,2,"""@Allyhaaaaa: Lemmie eat a Oreo &amp; do these dishes."" One oreo? Lol"
70,3,0,0,3,0,2,"""@ArizonasFinest6: Why the eggplant emoji doe?""y he say she looked like scream lmao"
75,3,0,1,2,0,2,"""@BabyAnimalPics: baby monkey bathtime http://t.co/7KPWAdLF0R""\nAwwwwe! This is soooo ADORABLE!"
116,3,0,0,3,0,2,"""@DomWorldPeace: Baseball season for the win. #Yankees"" This is where the love started"
119,3,0,0,3,0,2,"""@DunderbaIl: I'm an early bird and I'm a night owl, so I'm wise and have worms."""
120,3,0,1,2,0,2,"""@EdgarPixar: Overdosing on heavy drugs doesn't sound bad tonight."" I do that pussy shit every day."


In [376]:
neutral['count'].value_counts()

3    3919
6    184 
4    37  
9    19  
7    4   
Name: count, dtype: int64

In [377]:
# offensive but not hate speech
offensive_language = df.loc[df['class']==1] 

In [378]:
offensive_language.head(50)

Unnamed: 0_level_0,count,hate_speech,offensive_language,neither,positive,class,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,0,3,0,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
2,3,0,3,0,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
3,3,0,2,1,0,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
4,6,0,6,0,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;
5,3,1,2,0,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just blows me..claim you so faithful and down for somebody but still fucking with hoes! &#128514;&#128514;&#128514;"""
6,3,0,3,0,0,1,"!!!!!!""@__BrighterDays: I can not just sit up and HATE on another bitch .. I got too much shit going on!"""
7,3,0,3,0,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of you big bitches coming for us skinny girls!!&#8221;
8,3,0,3,0,0,1,""" &amp; you might not get ya bitch back &amp; thats that """
9,3,1,2,0,0,1,""" @rhythmixx_ :hobbies include: fighting Mariam""\n\nbitch"
10,3,0,3,0,0,1,""" Keeks is a bitch she curves everyone "" lol I walked into a conversation like this. Smh"


In [379]:
# inserting a new column in our dataframe to account for positive tweets (which we will extract in teh following section)
df_1.insert(5, 'positive' ,0)


In [380]:
df_1['positive'].value_counts()


0    24783
Name: positive, dtype: int64

In [381]:
df_1.tail()

Unnamed: 0,id,count,hate_speech,offensive_language,neither,positive,class,tweet
24778,25291,3,0,2,1,0,1,"you's a muthaf***in lie &#8220;@LifeAsKing: @20_Pearls @corey_emanuel right! His TL is trash &#8230;. Now, mine? Bible scriptures and hymns&#8221;"
24779,25292,3,0,1,2,0,2,"you've gone and broke the wrong heart baby, and drove me redneck crazy"
24780,25294,3,0,3,0,0,1,young buck wanna eat!!.. dat nigguh like I aint fuckin dis up again
24781,25295,6,0,6,0,0,1,youu got wild bitches tellin you lies
24782,25296,3,0,0,3,0,2,"~~Ruffled | Ntac Eileen Dahlia - Beautiful color combination of pink, orange, yellow &amp; white. A Coll http://t.co/H0dYEBvnZB"


At this point, we notice that our datapoints only contain either offensive language (of some form) or at least not non-offensive language labelled as neither. In order to build a model that can also successfully identify completely not offensive speech, we'll need to introduce some example of not-offensive language.

In [None]:
The following dataset has only 3 columns:

id

In [382]:
# importing a second dataset with only two types of tweets (offensive -1 / not offensive-0)
df_2 = pd.read_csv('../data/raw/train_E6oV3lV.csv')

In [383]:
# only three columns 
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [384]:
# the majority of our datapoints are not offensive
df_2['label'].value_counts()

0    29720
1    2242 
Name: label, dtype: int64

In [385]:
# given that we have 23,353 offensive tweets on the other dataset, we'll select 23,000 thousand positive tweets from df_2
df_pos = df_2.loc[df_2['label'] == 0]

In [386]:
df_pos.head(50)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦
9,10,0,@user @user welcome here ! i'm it's so #gr8 !


In [387]:
# selecting only 23000 to have a balanced dataset
df_p_23 = df_pos[:23000]

In [388]:
len(df_p_23)

23000

In [389]:
# setting the index as id
df_p_23.set_index('id')


Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
3,0,bihday your majesty
4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
5,0,factsguide: society now #motivation
...,...,...
24725,0,"yes, leeds!! #lesbiunion #girlsweekend"
24726,0,in other non tattoo related news my boy will be #crystalpalace mascot at the home game against liverpool next season ! ð´ðµ
24727,0,finally wo agaya :):)
24728,0,ðð ð #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme #followâ¦


In [390]:
# recreating the same structure of the other df
df_p_23.insert(1, 'count' ,3)
df_p_23.insert(2,'hate_speech',0)
df_p_23.insert(3,'offensive_language',0)
df_p_23.insert(4,'neither',0)
df_p_23.insert(5,'positive',3)

In [391]:
df_p_23

Unnamed: 0,id,count,hate_speech,offensive_language,neither,positive,label,tweet
0,1,3,0,0,0,3,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,3,0,0,0,3,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,3,0,0,0,3,0,bihday your majesty
3,4,3,0,0,0,3,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,3,0,0,0,3,0,factsguide: society now #motivation
...,...,...,...,...,...,...,...,...
24724,24725,3,0,0,0,3,0,"yes, leeds!! #lesbiunion #girlsweekend"
24725,24726,3,0,0,0,3,0,in other non tattoo related news my boy will be #crystalpalace mascot at the home game against liverpool next season ! ð´ðµ
24726,24727,3,0,0,0,3,0,finally wo agaya :):)
24727,24728,3,0,0,0,3,0,ðð ð #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme #followâ¦


In [392]:
# removing the existing label
df_p_23.drop(['label'], axis = 1, inplace = True)

In [393]:
# inserting label as 4 for positive tweets
df_p_23.insert(6,'class', 3)


In [394]:
df_p_23

Unnamed: 0,id,count,hate_speech,offensive_language,neither,positive,class,tweet
0,1,3,0,0,0,3,3,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,3,0,0,0,3,3,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,3,0,0,0,3,3,bihday your majesty
3,4,3,0,0,0,3,3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,3,0,0,0,3,3,factsguide: society now #motivation
...,...,...,...,...,...,...,...,...
24724,24725,3,0,0,0,3,3,"yes, leeds!! #lesbiunion #girlsweekend"
24725,24726,3,0,0,0,3,3,in other non tattoo related news my boy will be #crystalpalace mascot at the home game against liverpool next season ! ð´ðµ
24726,24727,3,0,0,0,3,3,finally wo agaya :):)
24727,24728,3,0,0,0,3,3,ðð ð #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme #followâ¦


In [395]:
# making sure all Ids are unique. Last Id from df_1 was 25926
df_p_23['id'] = df_p_23['id'] + 25926

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [396]:
# final df comprised of 23000 positive tweets
df_p_23

Unnamed: 0,id,count,hate_speech,offensive_language,neither,positive,class,tweet
0,25927,3,0,0,0,3,3,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,25928,3,0,0,0,3,3,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,25929,3,0,0,0,3,3,bihday your majesty
3,25930,3,0,0,0,3,3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,25931,3,0,0,0,3,3,factsguide: society now #motivation
...,...,...,...,...,...,...,...,...
24724,50651,3,0,0,0,3,3,"yes, leeds!! #lesbiunion #girlsweekend"
24725,50652,3,0,0,0,3,3,in other non tattoo related news my boy will be #crystalpalace mascot at the home game against liverpool next season ! ð´ðµ
24726,50653,3,0,0,0,3,3,finally wo agaya :):)
24727,50654,3,0,0,0,3,3,ðð ð #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme #followâ¦


We'll then merge the negative plus positives tweets to have our initial unstructured dataset including both negative and positives tweets

In [397]:
df = pd.concat([df_1, df_p_23], join = 'inner')

In [398]:
df.set_index('id', inplace = True)

In [399]:
# the final dataset has a good balance between positive and negatives tweets
df['positive'].value_counts()

0    24783
3    23000
Name: positive, dtype: int64

In [400]:
df

Unnamed: 0_level_0,count,hate_speech,offensive_language,neither,positive,class,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3,0,0,3,0,2,!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
1,3,0,3,0,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
2,3,0,3,0,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
3,3,0,2,1,0,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny
4,6,0,6,0,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;
...,...,...,...,...,...,...,...
50651,3,0,0,0,3,3,"yes, leeds!! #lesbiunion #girlsweekend"
50652,3,0,0,0,3,3,in other non tattoo related news my boy will be #crystalpalace mascot at the home game against liverpool next season ! ð´ðµ
50653,3,0,0,0,3,3,finally wo agaya :):)
50654,3,0,0,0,3,3,ðð ð #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme #followâ¦


In [401]:
# resshuffling the rows
df = df.sample(frac=1).reset_index(drop=True)

In [403]:
df.head(50)

Unnamed: 0,count,hate_speech,offensive_language,neither,positive,class,tweet
0,3,0,0,0,3,3,@user @user @user @user @user please for us #notlongnow #whatson #familyfun
1,3,1,2,0,0,1,"RT @SenorSteez: Man, now these hoes know &#128514;&#128176;&#128182;&#128184; #TheJigIsUp http://t.co/8W9Ag3bx1a"
2,3,0,3,0,0,1,"@politsoutherner @leahmeyer49 \nYeah, I'd want to shoot the son of a bitch myself. LOL"
3,3,0,3,0,0,1,RT @SteffyBallin: i hate when people ask &#8220;who you tryna look good for?!&#8221; bitch myself bye
4,3,0,0,0,3,3,finally my computer is back ;) .
5,3,0,3,0,0,1,RT @AntiCHILL: Just had a convo wit my ex n she reminded me y she is my ex smh idk y i even answerd for this bitch
6,3,0,0,0,3,3,ðððfriendship #friends #friendship #à¹à¸à¸·à¹à¸­à¸ #à¸¡à¸´à¸à¸£à¸ à¸²à¸ #summer #sea #foot #three #åè° #æå #æµ·è¾¹
7,3,0,3,0,0,1,Can't keep a shallow bitch if ya pockets ain't deep.
8,3,0,0,0,3,3,#boring saturday #summer #rain #candles #all_sunsets #selfies #holiday #sun #pay #girls #glasses #fun #instagood #baby #trees #b...
9,3,0,0,3,0,2,@kincade00 @mistaturk5 lmaooo just call my phone if you want them birds *Migos Voice*
