In [25]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:.0f}'.format

In [26]:
df = pd.read_csv(r'C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\unclean_combined_dataset.csv')

In [27]:
df.head(5)

Unnamed: 0,id,text,Annotation
0,596965466238619648,I just found the *perfect* rental. Why can't m...,Neither
1,575891807873040384,"@wetsprocket every time they ""discover"" anythi...",Neither
2,595259457828884480,ok time to write code bbl.\n\nmaking a new thing.,Neither
3,572344911002927104,Refined dessert! NOT #MKR,Neither
4,568132033215475712,@furt1v3ly @ClarkHat One of the best things an...,Neither


In [28]:
#text column requires some cleaning before the data can be used in a model

#1 - removing any links (links to websites / pictures)
#twitter has a link shortener where links will begin with http://t.co

#Create new column to store the clean preprocessed text
df["clean_text"] = np.nan

print("example tweet with a link in it:\n", df.iloc[123].text)
print("example tweet with a link in it:\n", df.iloc[74].text)


import re
print("\n remove substrings of the 'text' column that have a http or https link...\n")
for i in range(len(df)):
    current_text = df.iloc[i].text
    text_without_link = re.sub('http[s]?://\S+', '', current_text)
    df['clean_text'][i] = text_without_link
    
print("the tweet text now reads:\n", df.iloc[123].clean_text)
print("the tweet text now reads:\n", df.iloc[74].clean_text)

df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\dataset_no_links.csv", 
          index=False)

example tweet with a link in it:
 Verification: Yes, this is me. http://t.co/UQ9CEJVuAp
example tweet with a link in it:
 #stopgamergate https://t.co/9ImWLPMpB9 "Anita Sarkeesian Is A Feminazi" because... uh... communism?

 remove substrings of the 'text' column that have a http or https link...



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'][i] = text_without_link
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


the tweet text now reads:
 Verification: Yes, this is me. 
the tweet text now reads:
 #stopgamergate  "Anita Sarkeesian Is A Feminazi" because... uh... communism?


In [29]:
#2 - removing any text that immidiately follows an @ / remove RT
#on twitter, text that immidiately follows the @ is known as a tag. this is a way to involve another
#user with the tweet, these do not relate to the sentiment of the tweet so they will be removed.

print("example tweet with an @ in it:\n", df.iloc[512].clean_text)
print("\n remove substrings of the 'text' column that begin with an @...\n")


for i in range(len(df)):
    current_text = df.iloc[i].clean_text
    text_without_at = re.sub('@\S+', '', current_text)
    df['clean_text'] = df['clean_text'].replace(df.iloc[i].clean_text, text_without_at)
    
print("the tweet text now reads:\n", df.iloc[512].clean_text)

# #RT is text that tends to specify a retweet, removing as it doesnt add much to the context
# print("\nexample tweet with an RT in it:\n", df.iloc[9].text, "\n")
# for i in range(len(df)):
#     current_text = df.iloc[i].text
#     text_without_rt = re.sub(r"\bRT\b", "", current_text)
#     df['clean_text'] = df['clean_text'].replace(current_text, text_without_rt)
    
# print("the tweet text now reads:\n", df.iloc[9].text)

example tweet with an @ in it:
 @Auragasmic &lt;--Hysterical feminazi Level 100 @jtidyman @TheMitch182 @E__Strobel

 remove substrings of the 'text' column that begin with an @...

the tweet text now reads:
  &lt;--Hysterical feminazi Level 100   


In [30]:
#3 - punctuation cleaning 
#some symbol cleaning 
#change &amp; to and
df['clean_text'] = df['clean_text'].str.replace('&amp;', 'and')
# remove &lt; (<)
df['clean_text'] = df['clean_text'].str.replace('&lt;', '')
print("the last tweet text now reads:\n", df.iloc[512].clean_text, "\n")

#remove punctuation
print("example tweet with some punctuation in it:\n", df.iloc[3123].clean_text)
df['clean_text'] = df['clean_text'].str.replace('[^\w\s]','')
print("the tweet text now reads:\n", df.iloc[3123].clean_text)

the last tweet text now reads:
  --Hysterical feminazi Level 100    

example tweet with some punctuation in it:
 It's difficult deciding to call out something as being problematic, because that gives it attention and not many people know about it.
the tweet text now reads:
 Its difficult deciding to call out something as being problematic because that gives it attention and not many people know about it


In [31]:
#4 remove numbers
print("tweet with numbers:\n", df.iloc[512].clean_text, "\n")
df['clean_text'] = df['clean_text'].str.replace('\d+', '')

print("tweet now reads:\n", df.iloc[512].clean_text, "\n")

tweet with numbers:
  Hysterical feminazi Level 100    

tweet now reads:
  Hysterical feminazi Level     



In [32]:
import nltk
#6 - lowercase
df["clean_text"] = df["clean_text"].str.lower()

df.head(5)

Unnamed: 0,id,text,Annotation,clean_text
0,596965466238619648,I just found the *perfect* rental. Why can't m...,Neither,i just found the perfect rental why cant my le...
1,575891807873040384,"@wetsprocket every time they ""discover"" anythi...",Neither,every time they discover anything its either ...
2,595259457828884480,ok time to write code bbl.\n\nmaking a new thing.,Neither,ok time to write code bbl\n\nmaking a new thing
3,572344911002927104,Refined dessert! NOT #MKR,Neither,refined dessert not mkr
4,568132033215475712,@furt1v3ly @ClarkHat One of the best things an...,Neither,one of the best things anyone can do to impr...


In [33]:
#5 - stopword removal
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

stop = stopwords.words('english')

df['clean_text_sl'] = df['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

df.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jackc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,text,Annotation,clean_text,clean_text_sl
0,596965466238619648,I just found the *perfect* rental. Why can't m...,Neither,i just found the perfect rental why cant my le...,found perfect rental cant lease right
1,575891807873040384,"@wetsprocket every time they ""discover"" anythi...",Neither,every time they discover anything its either ...,every time discover anything either incorrect ...
2,595259457828884480,ok time to write code bbl.\n\nmaking a new thing.,Neither,ok time to write code bbl\n\nmaking a new thing,ok time write code bbl making new thing
3,572344911002927104,Refined dessert! NOT #MKR,Neither,refined dessert not mkr,refined dessert mkr
4,568132033215475712,@furt1v3ly @ClarkHat One of the best things an...,Neither,one of the best things anyone can do to impr...,one best things anyone improve understanding g...


In [34]:
from nltk.stem import WordNetLemmatizer 
#lemmatize
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text.split()]

df['clean_text_sl'] = df["clean_text_sl"].apply(lemmatize_text)
df['clean_text_sl'] = df['clean_text_sl'].apply(lambda x: ' '.join(map(str, x)))

df.head(5)

Unnamed: 0,id,text,Annotation,clean_text,clean_text_sl
0,596965466238619648,I just found the *perfect* rental. Why can't m...,Neither,i just found the perfect rental why cant my le...,found perfect rental cant lease right
1,575891807873040384,"@wetsprocket every time they ""discover"" anythi...",Neither,every time they discover anything its either ...,every time discover anything either incorrect ...
2,595259457828884480,ok time to write code bbl.\n\nmaking a new thing.,Neither,ok time to write code bbl\n\nmaking a new thing,ok time write code bbl making new thing
3,572344911002927104,Refined dessert! NOT #MKR,Neither,refined dessert not mkr,refined dessert mkr
4,568132033215475712,@furt1v3ly @ClarkHat One of the best things an...,Neither,one of the best things anyone can do to impr...,one best thing anyone improve understanding gr...


In [35]:
#6 - remove tweets which are now empty
print("here is a datapoint with an empty field")
print(df.iloc[16], "\n")
df = df.replace(r'^\s*$', np.nan, regex=True)
print("Dataset length before empty removal:", len(df))
df = df.dropna()

#reset the index since we removed some items
df = df.reset_index()
df = df.drop(['index'], axis=1)
print("Dataset length after empty removal:", len(df), "\n")
print("the datapoint has now been removed and index reformatted\n")
print(df.iloc[16])

here is a datapoint with an empty field
id                                 573180997631537216
text             @GentlerMouse http://t.co/qYgaRbGQDP
Annotation                                    Neither
clean_text                                           
clean_text_sl                                        
Name: 16, dtype: object 

Dataset length before empty removal: 6946
Dataset length after empty removal: 6815 

the datapoint has now been removed and index reformatted

id                                              563378742152540160
text             RT @thetrudz: And White people who bring up qu...
Annotation                                                 Neither
clean_text       rt  and white people who bring up queerness or...
clean_text_sl    rt white people bring queerness survivor way s...
Name: 16, dtype: object


In [36]:
#finally, add a binary Hate column which we will use for hate/non-hate classification
df['Hate'] = 0
for i in range(len(df)):
    if (df['Annotation'][i] != 'Neither'):
        df['Hate'][i] = 1
        
df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\clean_dataset.csv", 
          index=False)
        
df.iloc[1000]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hate'][i] = 1


id                                              575605236464795648
text             @cbleiker #GermanProfessor gives meaning to te...
Annotation                                                  Sexism
clean_text        germanprofessor gives meaning to term feminaz...
clean_text_sl    germanprofessor give meaning term feminazidoes...
Hate                                                             1
Name: 1000, dtype: object

In [37]:
df.to_csv(r"C:\Users\jackc\Semester2\Project\Identifying-Hate-Speech-Categories-On-Social-Media\Data\clean_dataset.csv", 
          index=False)

df.head(5)

Unnamed: 0,id,text,Annotation,clean_text,clean_text_sl,Hate
0,596965466238619648,I just found the *perfect* rental. Why can't m...,Neither,i just found the perfect rental why cant my le...,found perfect rental cant lease right,0
1,575891807873040384,"@wetsprocket every time they ""discover"" anythi...",Neither,every time they discover anything its either ...,every time discover anything either incorrect ...,0
2,595259457828884480,ok time to write code bbl.\n\nmaking a new thing.,Neither,ok time to write code bbl\n\nmaking a new thing,ok time write code bbl making new thing,0
3,572344911002927104,Refined dessert! NOT #MKR,Neither,refined dessert not mkr,refined dessert mkr,0
4,568132033215475712,@furt1v3ly @ClarkHat One of the best things an...,Neither,one of the best things anyone can do to impr...,one best thing anyone improve understanding gr...,0
