In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [2]:
df = pd.read_csv('TwitterHate.csv')

In [3]:
df = df.drop(columns='id')
df['tweet'] = df.tweet.str.lower()
df['tweet'] = df['tweet'].str.replace('@\w+','')
df['tweet'] = df['tweet'].str.replace('www.+com','')
df['tweet'] = df['tweet'].str.replace('\&amp|rt|etc','')
df['tweet'] = df['tweet'].str.replace('#','')
tweets = df['tweet'].tolist()

In [16]:
df.sample(n=10)

Unnamed: 0,label,tweet
20582,0,my coffee is telling me to be today. smiley ...
30129,0,the welsh in is what a tournament should be l...
19775,0,use the power of your mind to heal your body!!...
31624,0,looking really good trisha so happy for you
12084,0,great insights on trusted professions in emea ...
29556,0,u hv amazing voice hina aapi allah apkoo hme...
27319,0,be happy ðâï¸anchored recoveryâï¸ð...
28726,0,"hey emiliewin3, you may want to check for lat..."
14538,0,"â nzd/usd post-rbnz rally almost reversed, ..."
12087,0,seriously glad i don't live in minnesota and m...


In [5]:
temp = []
for tweet in tweets:
    text = nltk.word_tokenize(tweet)
    text = [i for i in text if (len(i) > 1) and (i not in stopwords.words('english'))]
    temp.append(' '.join(text))

In [7]:
from collections import Counter

Counter(' '.join(temp).split()).most_common(15)

[("'s", 3217),
 ('love', 2663),
 ('...', 2314),
 ('day', 2237),
 ("n't", 2228),
 ('happy', 1653),
 ('time', 1123),
 ('life', 1107),
 ('â\x80¦', 1088),
 ('today', 1061),
 ('like', 1044),
 ("'m", 1020),
 ('new', 985),
 ('``', 969),
 ('positive', 928)]

#### The most common words in the corpus are shown above. The sentiment of most of the words above is positive due the class imbalance present in data

In [8]:
x = temp
y = df['label'].tolist()

In [9]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.15)

In [10]:
vec = TfidfVectorizer()
x_train = vec.fit_transform(x_train)
x_test = vec.transform(x_test)

In [11]:
model = LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression()

In [12]:
y_pred = model.predict(x_test)

In [13]:
accuracy_score(y_test,y_pred)

0.9505735140771637

In [14]:
pd.DataFrame(classification_report(y_test,y_pred,output_dict=True)).drop(columns='accuracy')

Unnamed: 0,0,1,macro avg,weighted avg
precision,0.951599,0.904762,0.928181,0.948454
recall,0.997764,0.295031,0.646398,0.950574
f1-score,0.974135,0.444965,0.70955,0.9386
support,4473.0,322.0,4795.0,4795.0


#### recall for Hateful comments is 29% which is very low. Will try assigning class weights

In [20]:
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [21]:
adj_model = LogisticRegression(class_weight={0:0.075,1:1})

In [22]:
adj_model.fit(x_train,y_train)

LogisticRegression(class_weight={0: 0.075, 1: 1})

In [23]:
adj_y_pred = adj_model.predict(x_test)

In [24]:
accuracy_score(y_test,adj_y_pred)

0.9284671532846716

In [25]:
pd.DataFrame(classification_report(y_test,adj_y_pred,output_dict=True)).drop(columns='accuracy')

Unnamed: 0,0,1,macro avg,weighted avg
precision,0.982928,0.479769,0.731348,0.949139
recall,0.939638,0.773292,0.856465,0.928467
f1-score,0.960796,0.592152,0.776474,0.93604
support,4473.0,322.0,4795.0,4795.0


#### recall has improved and we will find the best parameters by hpertuning next

In [26]:
param_grid = {'penalty':['l1','l2','elasticnet','none'],'C':[10**-4,10**-2,0.1,10,100]}
grid = GridSearchCV(LogisticRegression(),param_grid,cv=StratifiedKFold(n_splits=4),n_jobs=-1,scoring='recall')

In [27]:
grid.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.0001, 0.01, 0.1, 10, 100],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none']},
             scoring='recall')

In [28]:
grid.best_params_

{'C': 0.0001, 'penalty': 'none'}

In [29]:
grid_y_pred = grid.predict(x_test)

In [30]:
accuracy_score(y_test,grid_y_pred)

0.9501564129301355

In [31]:
pd.DataFrame(classification_report(y_test,grid_y_pred,output_dict=True)).drop(columns='accuracy')

Unnamed: 0,0,1,macro avg,weighted avg
precision,0.978526,0.61186,0.795193,0.953903
recall,0.967807,0.704969,0.836388,0.950156
f1-score,0.973137,0.655123,0.81413,0.951781
support,4473.0,322.0,4795.0,4795.0


#### The final recall score and f1 score for hateful comments is 70% and 65% with the best parameters being C=0.0001(high regularization) and no penalty