# HTSPC- Hate Speech Classification on social media

In [1]:
import numpy as np
import pandas as pd

In [2]:
#Read train data
train = pd.read_csv('1fe720be-90e4-4e06-9b52-9de93e0ea937_train.csv')

In [3]:
train.head()
# print(train)

Unnamed: 0,text,labels
0,@realDonaldTrump This is one of the worst time...,0
1,How about the crowd in Oval in today's #AUSvIN...,1
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0
3,#etsy shop: Benedict Donald so called presiden...,1
4,@realDonaldTrump Good build a wall around Arka...,0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5266 entries, 0 to 5265
Data columns (total 2 columns):
text      5266 non-null object
labels    5266 non-null int64
dtypes: int64(1), object(1)
memory usage: 82.4+ KB


In [5]:
#Read test data
test = pd.read_csv('final_test.csv')

In [6]:
test.head()

Unnamed: 0.1,Unnamed: 0,text_id,text
0,0,hasoc_en_902,West Bengal Doctor Crisis: Protesting doctors ...
1,1,hasoc_en_416,68.5 million people have been forced to leave ...
2,2,hasoc_en_207,"You came, you saw .... we will look after the ..."
3,3,hasoc_en_595,We'll get Brexit delivered by October 31st. ...
4,4,hasoc_en_568,Fuck you. Go back to the dark ages you cow @IB...


In [7]:
test.drop(columns='Unnamed: 0',inplace=True)
test.drop(columns='text_id',inplace=True)

In [8]:
test.head()

Unnamed: 0,text
0,West Bengal Doctor Crisis: Protesting doctors ...
1,68.5 million people have been forced to leave ...
2,"You came, you saw .... we will look after the ..."
3,We'll get Brexit delivered by October 31st. ...
4,Fuck you. Go back to the dark ages you cow @IB...


In [9]:

#We the labels(0/1) which signifies 'Hate speech(1)' or 'Not a hate speech(0) in of type int64 which we 
#convert to type category. We use 'astype' to cast the datatype

train['labels'] = train['labels'].astype('category')

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5266 entries, 0 to 5265
Data columns (total 2 columns):
text      5266 non-null object
labels    5266 non-null category
dtypes: category(1), object(1)
memory usage: 46.5+ KB


# Processing the comments/tweets

In [11]:
from nltk.stem import WordNetLemmatizer
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [12]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

train["text_w/o_url"] = train["text"].apply(lambda text: remove_urls(text))
test["text_w/o_url"] = test["text"].apply(lambda text: remove_urls(text))

In [13]:
train.head()

Unnamed: 0,text,labels,text_w/o_url
0,@realDonaldTrump This is one of the worst time...,0,@realDonaldTrump This is one of the worst time...
1,How about the crowd in Oval in today's #AUSvIN...,1,How about the crowd in Oval in today's #AUSvIN...
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,@skroskz @shossy2 @JoeBiden Biden &amp; his so...
3,#etsy shop: Benedict Donald so called presiden...,1,#etsy shop: Benedict Donald so called presiden...
4,@realDonaldTrump Good build a wall around Arka...,0,@realDonaldTrump Good build a wall around Arka...


In [14]:
print(test.head())

                                                text  \
0  West Bengal Doctor Crisis: Protesting doctors ...   
1  68.5 million people have been forced to leave ...   
2  You came, you saw .... we will look after the ...   
3  We'll get Brexit delivered by October 31st.   ...   
4  Fuck you. Go back to the dark ages you cow @IB...   

                                        text_w/o_url  
0  West Bengal Doctor Crisis: Protesting doctors ...  
1  68.5 million people have been forced to leave ...  
2  You came, you saw .... we will look after the ...  
3  We'll get Brexit delivered by October 31st.   ...  
4  Fuck you. Go back to the dark ages you cow @IB...  


##  Step  to remove all the special characters like '$,#,&,etc'

In [16]:
#Follow the below commands in case you havent downloaded nltk yet, in case you have downloaded you dont need this cell.
##Steps/commands :

#1) import nltk
#2) nltk.download('wordnet')


In [15]:
train['text_lem'] = [''.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]',' ',text)) for text in lis]) for lis in train['text_w/o_url']]
test['text_lem'] = [''.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]',' ',text)) for text in lis]) for lis in test['text_w/o_url']]

In [16]:
print('We can see the text after the removal of special characters under text_lem coloumn')
train.head()

We can see the text after the removal of special characters under text_lem coloumn


Unnamed: 0,text,labels,text_w/o_url,text_lem
0,@realDonaldTrump This is one of the worst time...,0,@realDonaldTrump This is one of the worst time...,realDonaldTrump This is one of the worst time...
1,How about the crowd in Oval in today's #AUSvIN...,1,How about the crowd in Oval in today's #AUSvIN...,How about the crowd in Oval in today s AUSvIN...
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,skroskz shossy JoeBiden Biden amp his so...
3,#etsy shop: Benedict Donald so called presiden...,1,#etsy shop: Benedict Donald so called presiden...,etsy shop Benedict Donald so called presiden...
4,@realDonaldTrump Good build a wall around Arka...,0,@realDonaldTrump Good build a wall around Arka...,realDonaldTrump Good build a wall around Arka...


In [17]:
print(test.head())

                                                text  \
0  West Bengal Doctor Crisis: Protesting doctors ...   
1  68.5 million people have been forced to leave ...   
2  You came, you saw .... we will look after the ...   
3  We'll get Brexit delivered by October 31st.   ...   
4  Fuck you. Go back to the dark ages you cow @IB...   

                                        text_w/o_url  \
0  West Bengal Doctor Crisis: Protesting doctors ...   
1  68.5 million people have been forced to leave ...   
2  You came, you saw .... we will look after the ...   
3  We'll get Brexit delivered by October 31st.   ...   
4  Fuck you. Go back to the dark ages you cow @IB...   

                                            text_lem  
0  West Bengal Doctor Crisis  Protesting doctors ...  
1       million people have been forced to leave ...  
2  You came  you saw      we will look after the ...  
3  We ll get Brexit delivered by October   st    ...  
4  Fuck you  Go back to the dark ages you cow  IB..

## Step for spelling correction

In [20]:
from spellchecker import SpellChecker

In [21]:
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
            
        else:
            corrected_text.append(word)
           
    return " ".join(corrected_text)


test["text_spell"] = test["text_lem"].apply(lambda text: correct_spellings(text))


In [22]:
train["text_spell"] = train["text_lem"].apply(lambda text: correct_spellings(text))

In [23]:
# print(train.head())

In [24]:
# print(test.head())

## Step for removing urls

In [25]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

train["text_w/o_url"] = train["text_lem"].apply(lambda text: remove_urls(text))
test["text_w/o_url"] = test["text_lem"].apply(lambda text: remove_urls(text))

In [26]:
# train.head()

## HTML tags removal step

###  This step deals with removing the html tags but as removing these performed bad on our data,we arent using this pre-processing method

In [25]:
# def remove_html(text):
#     html_pattern = re.compile('<.*?>')
#     return html_pattern.sub(r'', text)


# train["text_w/o_html"] = train["text_w/o_url"].apply(lambda text: remove_html(text))
# test["text_w/o_html"] = test["text_w/o_url"].apply(lambda text: remove_html(text))
# train.head()

## Step  to remove the stop words

In [26]:
#In case you havent downloaded 'stopwards from nltk' yet use these below commands in case you already have downloaded 
#then you can skip this cell

#Steps/Commands :

# import nltk
# nltk.download('stopwords')

In [18]:

print('The following are the stop words in English Language :')
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))


The following are the stop words in English Language :


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [19]:

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

train["text_w/o_stop"] = train["text_lem"].apply(lambda text: remove_stopwords(text))

test["text_w/o_stop"] = test["text_lem"].apply(lambda text: remove_stopwords(text))


In [20]:
train.head()

Unnamed: 0,text,labels,text_w/o_url,text_lem,text_w/o_stop
0,@realDonaldTrump This is one of the worst time...,0,@realDonaldTrump This is one of the worst time...,realDonaldTrump This is one of the worst time...,realDonaldTrump This one worst times American ...
1,How about the crowd in Oval in today's #AUSvIN...,1,How about the crowd in Oval in today's #AUSvIN...,How about the crowd in Oval in today s AUSvIN...,How crowd Oval today AUSvIND holding Balidan b...
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,skroskz shossy JoeBiden Biden amp his so...,skroskz shossy JoeBiden Biden amp son Hunter t...
3,#etsy shop: Benedict Donald so called presiden...,1,#etsy shop: Benedict Donald so called presiden...,etsy shop Benedict Donald so called presiden...,etsy shop Benedict Donald called president tra...
4,@realDonaldTrump Good build a wall around Arka...,0,@realDonaldTrump Good build a wall around Arka...,realDonaldTrump Good build a wall around Arka...,realDonaldTrump Good build wall around Arkansa...


In [21]:
print(test.head())

                                                text  \
0  West Bengal Doctor Crisis: Protesting doctors ...   
1  68.5 million people have been forced to leave ...   
2  You came, you saw .... we will look after the ...   
3  We'll get Brexit delivered by October 31st.   ...   
4  Fuck you. Go back to the dark ages you cow @IB...   

                                        text_w/o_url  \
0  West Bengal Doctor Crisis: Protesting doctors ...   
1  68.5 million people have been forced to leave ...   
2  You came, you saw .... we will look after the ...   
3  We'll get Brexit delivered by October 31st.   ...   
4  Fuck you. Go back to the dark ages you cow @IB...   

                                            text_lem  \
0  West Bengal Doctor Crisis  Protesting doctors ...   
1       million people have been forced to leave ...   
2  You came  you saw      we will look after the ...   
3  We ll get Brexit delivered by October   st    ...   
4  Fuck you  Go back to the dark ages you cow 

## Step to convert emojis to words

In [25]:


# def convert_emojis(text):
#     for emot in UNICODE_EMO:
#         text = re.sub(r'('+emot+')', "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
#     return text

# train["text_emo"] = train["text_w/o_stop"].apply(lambda text: remove_stopwords(text))

# test["text_emo"] = test["text_w/o_stop"].apply(lambda text: remove_stopwords(text))

In [26]:
train.head()

Unnamed: 0,text,labels,text_w/o_url,text_lem,text_w/o_stop
0,@realDonaldTrump This is one of the worst time...,0,@realDonaldTrump This is one of the worst time...,realDonaldTrump This is one of the worst time...,realDonaldTrump This one worst times American ...
1,How about the crowd in Oval in today's #AUSvIN...,1,How about the crowd in Oval in today's #AUSvIN...,How about the crowd in Oval in today s AUSvIN...,How crowd Oval today AUSvIND holding Balidan b...
2,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,0,@skroskz @shossy2 @JoeBiden Biden &amp; his so...,skroskz shossy JoeBiden Biden amp his so...,skroskz shossy JoeBiden Biden amp son Hunter t...
3,#etsy shop: Benedict Donald so called presiden...,1,#etsy shop: Benedict Donald so called presiden...,etsy shop Benedict Donald so called presiden...,etsy shop Benedict Donald called president tra...
4,@realDonaldTrump Good build a wall around Arka...,0,@realDonaldTrump Good build a wall around Arka...,realDonaldTrump Good build a wall around Arka...,realDonaldTrump Good build wall around Arkansa...


In [27]:
print(test.head())

                                                text  \
0  West Bengal Doctor Crisis: Protesting doctors ...   
1  68.5 million people have been forced to leave ...   
2  You came, you saw .... we will look after the ...   
3  We'll get Brexit delivered by October 31st.   ...   
4  Fuck you. Go back to the dark ages you cow @IB...   

                                        text_w/o_url  \
0  West Bengal Doctor Crisis: Protesting doctors ...   
1  68.5 million people have been forced to leave ...   
2  You came, you saw .... we will look after the ...   
3  We'll get Brexit delivered by October 31st.   ...   
4  Fuck you. Go back to the dark ages you cow @IB...   

                                            text_lem  \
0  West Bengal Doctor Crisis  Protesting doctors ...   
1       million people have been forced to leave ...   
2  You came  you saw      we will look after the ...   
3  We ll get Brexit delivered by October   st    ...   
4  Fuck you  Go back to the dark ages you cow 

In [22]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(train['text_lem'],train['labels'])

In [23]:
vect = TfidfVectorizer().fit(X_train)


In [24]:
vect_transformed_X_train = vect.transform(X_train)
vect_transformed_X_test = vect.transform(X_test)

# Classifing the data using the following algorithms

## Using SVM

In [25]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [26]:
modelSVC = SVC(C=500).fit(vect_transformed_X_train,y_train)



In [27]:
predictionsSVC = modelSVC.predict(vect_transformed_X_test)
# sum(predictionsSVC==1),len(y_test),
print('The F1 score using SVm with C=100 is :')
print(f1_score(y_test,predictionsSVC))
print('The accuracy using SVM with C=100 is :')
print(accuracy_score(y_test,predictionsSVC))

The F1 score using SVm with C=100 is :
0.7698783910196446
The accuracy using SVM with C=100 is :
0.6264236902050114


## Using Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
modelLR = LogisticRegression().fit(vect_transformed_X_train,y_train)



In [29]:
predictionsLR = modelLR.predict(vect_transformed_X_test)
print('The F1 score using LR  is :')
print(f1_score(y_test,predictionsLR))
print('The accuracy using LR is :')
print(accuracy_score(y_test,predictionsLR))

The F1 score using LR  is :
0.7793176972281448
The accuracy using LR is :
0.6856492027334852


## Using Naive Bayes

In [30]:
from sklearn.naive_bayes import MultinomialNB

In [31]:
modelNB = MultinomialNB(alpha=1).fit(vect_transformed_X_train,y_train)

In [32]:
predictionsNB = modelNB.predict(vect_transformed_X_test)
print('The F1 score using NB  is :')
print(f1_score(y_test,predictionsNB))
print('The accuracy using NB  is :')
print(accuracy_score(y_test,predictionsNB))

The F1 score using NB  is :
0.7735941320293398
The accuracy using NB  is :
0.6484434320425209


# Using RandomForest

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
modelRF = RandomForestClassifier(n_estimators=127).fit(vect_transformed_X_train,y_train)

In [37]:
predictionsRF = modelRF.predict(vect_transformed_X_test)
f1_score(y_test,predictionsRF)

0.7562028047464942

# For test data

In [33]:
vect = TfidfVectorizer().fit(train['text_lem'])
vect_transformed_train = vect.transform(train['text_lem'])
vect_transformed_test = vect.transform(test['text_lem'])

In [34]:
FinalModel = LogisticRegression().fit(vect_transformed_train,train['labels'])



In [35]:
FN = MultinomialNB(alpha=0.9).fit(vect_transformed_train,train['labels'])

In [36]:
predictions = FN.predict(vect_transformed_test)

In [37]:
submission = pd.DataFrame({'labels':predictions})

In [38]:
file_name = 'Final_submission.csv'
submission.to_csv(file_name,index=True)

In [39]:
submission.head()

Unnamed: 0,labels
0,1
1,1
2,1
3,1
4,0


In [40]:
sum(predictions==1)

1068

In [44]:
hello=pd.read_csv('Final_submission.csv')

In [47]:
sum(hello['labels']==1)

1068

In [50]:
hello.shape

(1153, 2)