In [19]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:.0f}'.format

In [20]:
df = pd.read_csv(r'C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\unclean_combined_dataset.csv')

In [21]:
df.head(5)

Unnamed: 0,id,text,Annotation
0,596965466238619648,I just found the *perfect* rental. Why can't m...,Neither
1,575891807873040384,"@wetsprocket every time they ""discover"" anythi...",Neither
2,595259457828884480,ok time to write code bbl.\n\nmaking a new thing.,Neither
3,572344911002927104,Refined dessert! NOT #MKR,Neither
4,568132033215475712,@furt1v3ly @ClarkHat One of the best things an...,Neither


In [22]:
#text column requires some cleaning before the data can be used in a model

#1 - removing any links (links to websites / pictures)
#twitter has a link shortener where links will begin with http://t.co

print("example tweet with a link in it:\n", df.iloc[123].text)

import re
print("\n remove substrings of the 'text' column that have a http or https link...\n")
for i in range(len(df)):
    current_text = df.iloc[i].text
    text_without_link = re.sub('http://\S+|https://\S+', '', current_text)
    df['text'] = df['text'].replace(current_text, text_without_link)
    
print("the tweet text now reads:\n", df.iloc[123].text)

df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\dataset_no_links.csv", 
          index=False)

example tweet with a link in it:
 Verification: Yes, this is me. http://t.co/UQ9CEJVuAp

 remove substrings of the 'text' column that have a http or https link...

the tweet text now reads:
 Verification: Yes, this is me. 


In [23]:
#2 - removing any text that immidiately follows an @ / remove RT
#on twitter, text that immidiately follows the @ is known as a tag. this is a way to involve another
#user with the tweet, these do not relate to the sentiment of the tweet so they will be removed.

print("example tweet with an @ in it:\n", df.iloc[512].text)
print("\n remove substrings of the 'text' column that begin with an @...\n")

for i in range(len(df)):
    current_text = df.iloc[i].text
    text_without_at = re.sub('@\S+', '', current_text)
    df['text'] = df['text'].replace(current_text, text_without_at)
    
print("the tweet text now reads:\n", df.iloc[512].text)

#RT is text that tends to specify a retweet, removing as it doesnt add much to the context
print("\nexample tweet with an RT in it:\n", df.iloc[9].text, "\n")
for i in range(len(df)):
    current_text = df.iloc[i].text
    text_without_rt = re.sub(r"\bRT\b", "", current_text)
    df['text'] = df['text'].replace(current_text, text_without_rt)
    
print("the tweet text now reads:\n", df.iloc[9].text)


df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\dataset_nl_n@.csv", 
          index=False)

example tweet with an @ in it:
 @Auragasmic &lt;--Hysterical feminazi Level 100 @jtidyman @TheMitch182 @E__Strobel

 remove substrings of the 'text' column that begin with an @...

the tweet text now reads:
  &lt;--Hysterical feminazi Level 100   

example tweet with an RT in it:
 RT  Gonna be making a big announcement at the panel I am on about online harassment today :D 

the tweet text now reads:
   Gonna be making a big announcement at the panel I am on about online harassment today :D


In [24]:
#3 - punctuation cleaning 
#some symbol cleaning 
#change &amp; to and
df['text'] = df['text'].str.replace('&amp;', 'and')
# remove &lt; (<)
df['text'] = df['text'].str.replace('&lt;', '')
print("the last tweet text now reads:\n", df.iloc[512].text, "\n")

#remove punctuation
print("example tweet with some punctuation in it:\n", df.iloc[3123].text)
df['text'] = df['text'].str.replace('[^\w\s]','')
print("the tweet text now reads:\n", df.iloc[3123].text)

df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\dataset_nl_n@_np.csv", 
          index=False)

the last tweet text now reads:
  --Hysterical feminazi Level 100    

example tweet with some punctuation in it:
 It's difficult deciding to call out something as being problematic, because that gives it attention and not many people know about it.
the tweet text now reads:
 Its difficult deciding to call out something as being problematic because that gives it attention and not many people know about it


In [25]:
#4 - remove tweets which are now empty
print("here is a datapoint with an empty field")
print(df.iloc[16], "\n")
df = df.replace(r'^\s*$', np.nan, regex=True)
print("Dataset length before empty removal:", len(df))
df = df.dropna()

#reset the index since we removed some items
df = df.reset_index()
df = df.drop(['index'], axis=1)
print("Dataset length after empty removal:", len(df), "\n")
print("the datapoint has now been removed and index reformatted\n")
print(df.iloc[16])

df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\dataset_nl_n@_np_ne.csv", 
          index=False)

here is a datapoint with an empty field
id           573180997631537216
text                           
Annotation              Neither
Name: 16, dtype: object 

Dataset length before empty removal: 6946
Dataset length after empty removal: 6836 

the datapoint has now been removed and index reformatted

id                                           563378742152540160
text            And White people who bring up queerness or b...
Annotation                                              Neither
Name: 16, dtype: object


In [26]:
import nltk
#6 - lowercase / tokenize
df["text"] = df["text"].str.lower()

#tokenize text into a separate column
df['tokenized_text'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\dataset_nl_n@_np_ne_tokens.csv", 
          index=False)

df.head(5)

Unnamed: 0,id,text,Annotation,tokenized_text
0,596965466238619648,i just found the perfect rental why cant my le...,Neither,"[i, just, found, the, perfect, rental, why, ca..."
1,575891807873040384,every time they discover anything its either ...,Neither,"[every, time, they, discover, anything, its, e..."
2,595259457828884480,ok time to write code bbl\n\nmaking a new thing,Neither,"[ok, time, to, write, code, bbl, making, a, ne..."
3,572344911002927104,refined dessert not mkr,Neither,"[refined, dessert, not, mkr]"
4,568132033215475712,one of the best things anyone can do to impr...,Neither,"[one, of, the, best, things, anyone, can, do, ..."


In [27]:
#5 - stopword removal
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

stop = stopwords.words('english')

df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [item for item in x if item not in stop])

df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\dataset_nl_n@_np_ne_tokens_ns.csv", 
          index=False)

df.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jackc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,text,Annotation,tokenized_text
0,596965466238619648,i just found the perfect rental why cant my le...,Neither,"[found, perfect, rental, cant, lease, right]"
1,575891807873040384,every time they discover anything its either ...,Neither,"[every, time, discover, anything, either, inco..."
2,595259457828884480,ok time to write code bbl\n\nmaking a new thing,Neither,"[ok, time, write, code, bbl, making, new, thing]"
3,572344911002927104,refined dessert not mkr,Neither,"[refined, dessert, mkr]"
4,568132033215475712,one of the best things anyone can do to impr...,Neither,"[one, best, things, anyone, improve, understan..."


In [28]:
from nltk.stem import WordNetLemmatizer 
#lemmatize
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

df['text_lemma'] = df["tokenized_text"].apply(lemmatize_text)

df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\dataset_nl_n@_np_ne_tokens_ns_lem.csv", 
          index=False)

df.head(5)

Unnamed: 0,id,text,Annotation,tokenized_text,text_lemma
0,596965466238619648,i just found the perfect rental why cant my le...,Neither,"[found, perfect, rental, cant, lease, right]","[found, perfect, rental, cant, lease, right]"
1,575891807873040384,every time they discover anything its either ...,Neither,"[every, time, discover, anything, either, inco...","[every, time, discover, anything, either, inco..."
2,595259457828884480,ok time to write code bbl\n\nmaking a new thing,Neither,"[ok, time, write, code, bbl, making, new, thing]","[ok, time, write, code, bbl, making, new, thing]"
3,572344911002927104,refined dessert not mkr,Neither,"[refined, dessert, mkr]","[refined, dessert, mkr]"
4,568132033215475712,one of the best things anyone can do to impr...,Neither,"[one, best, things, anyone, improve, understan...","[one, best, thing, anyone, improve, understand..."


In [29]:
#create a column of pos tagged words in each tweet
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

tweets = df.text
tweet_tags = []

for t in tweets:
    tokens = nltk.word_tokenize(t)
    tags = nltk.pos_tag(tokens)
    new_text = []
    for word in tags:
        new_text.append(word[0] + "_" + word[1])
    tweet_tags.append(new_text)
    
tagged_tokens = []
for tweet in tweet_tags:
    sent = ' '.join(tweet)
    new_tokens = tokenizer.tokenize(sent)
    tagged_tokens.append(new_tokens)
    
df['pos_tags'] = tagged_tokens

df


Unnamed: 0,id,text,Annotation,tokenized_text,text_lemma,pos_tags
0,596965466238619648,i just found the perfect rental why cant my le...,Neither,"[found, perfect, rental, cant, lease, right]","[found, perfect, rental, cant, lease, right]","[i_NN, just_RB, found_VBD, the_DT, perfect_JJ,..."
1,575891807873040384,every time they discover anything its either ...,Neither,"[every, time, discover, anything, either, inco...","[every, time, discover, anything, either, inco...","[every_DT, time_NN, they_PRP, discover_VBP, an..."
2,595259457828884480,ok time to write code bbl\n\nmaking a new thing,Neither,"[ok, time, write, code, bbl, making, new, thing]","[ok, time, write, code, bbl, making, new, thing]","[ok_JJ, time_NN, to_TO, write_VB, code_NN, bbl..."
3,572344911002927104,refined dessert not mkr,Neither,"[refined, dessert, mkr]","[refined, dessert, mkr]","[refined_VBN, dessert_NN, not_RB, mkr_VB]"
4,568132033215475712,one of the best things anyone can do to impr...,Neither,"[one, best, things, anyone, improve, understan...","[one, best, thing, anyone, improve, understand...","[one_CD, of_IN, the_DT, best_JJS, things_NNS, ..."
...,...,...,...,...,...,...
6831,186961697997721984,in case you missed it a homophobes guide for d...,Homophobia,"[case, missed, homophobes, guide, dealing, gay...","[case, missed, homophobe, guide, dealing, gay,...","[in_IN, case_NN, you_PRP, missed_VBD, it_PRP, ..."
6832,186629008824348992,a homophobes guide for dealing with a gay child,Homophobia,"[homophobes, guide, dealing, gay, child]","[homophobe, guide, dealing, gay, child]","[a_DT, homophobes_JJ, guide_NN, for_IN, dealin..."
6833,185639770825830016,joey mitchell reads through our 100 real twee...,Homophobia,"[joey, mitchell, reads, 100, real, tweets, hom...","[joey, mitchell, read, 100, real, tweet, homop...","[joey_NN, mitchell_NN, reads_VBZ, through_IN, ..."
6834,182386216543600992,why i hate gay people,Homophobia,"[hate, gay, people]","[hate, gay, people]","[why_WRB, i_JJ, hate_NN, gay_NN, people_NNS]"


In [30]:
#finally, add a binary Hate column which we will use for hate/non-hate classification
df['Hate'] = 0
for i in range(len(df)):
    if (df['Annotation'][i] != 'Neither'):
        df['Hate'][i] = 1
        
df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\clean_dataset.csv", 
          index=False)
        
df.iloc[1000]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hate'][i] = 1


id                                               570731745752064000
text               yes it is legit sweden is one place any livin...
Annotation                                                   Sexism
tokenized_text    [yes, legit, sweden, one, place, living, thing...
text_lemma        [yes, legit, sweden, one, place, living, thing...
pos_tags          [yes_NNS, it_PRP, is_VBZ, legit_JJ, sweden_NN,...
Hate                                                              1
Name: 1000, dtype: object