In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt   #for data visualization and graphical plotting
from matplotlib import style      #for styling the plot
style.use("ggplot")

import nltk
from nltk.util import pr
from nltk.tokenize import word_tokenize      #to divide strings into lists of substrings
from nltk.stem import WordNetLemmatizer      #to link words with similar meanings to one word.
from nltk.corpus import stopwords            #to filterout useless data
stopword = set(stopwords.words('english'))

# from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

import re
import string
import plotly.express as px


In [6]:
data = pd.read_csv("../data/labeled_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [7]:
# to get more info about dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [8]:
data.describe()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,6372.5,3.0,0.0,2.0,0.0,1.0
50%,12703.0,3.0,0.0,3.0,0.0,1.0
75%,18995.5,3.0,0.0,3.0,0.0,1.0
max,25296.0,9.0,7.0,9.0,9.0,2.0


In [9]:
data.shape

(24783, 7)

In [10]:
data["labels"] = data["class"].map({0: "Hate Speech", 
                                    1: "Offensive Language", 
                                    2: "No Hate and Offensive"})
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Language
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Language
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Language


In [11]:
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Language
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Language
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Language


In [12]:
# analysing some data
print(data["tweet"].iloc[0],"\n")
print(data["tweet"].iloc[1],"\n")
print(data["tweet"].iloc[2],"\n")
print(data["tweet"].iloc[3],"\n")
print(data["tweet"].iloc[4],"\n")

!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out... 

!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!! 

!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit 

!!!!!!!!! RT @C_G_Anderson: @viva_based she look like a tranny 

!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361; 



In [13]:
data = data[["tweet", "labels"]]
data.head()

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Language
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Language
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Language


In [14]:
data = data[["tweet", "labels"]]
data.head()

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Language
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Language
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Language


In [15]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub(r"\@w+|\#",'',text)
    text = re.sub(r"[^\w\s]",'',text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    tweet_tokens = word_tokenize(text)
    filtered_tweets=[w for w in tweet_tokens if not w in stopword] #removing stopwords
    return " ".join(filtered_tweets)

In [16]:
#applying pre-processing to text data
import nltk
data.tweet=data['tweet'].apply(clean)

In [17]:
# removing duplicate data
tweetData = data.drop_duplicates("tweet")

In [18]:
# removing duplicate data
tweetData = data.drop_duplicates("tweet")

In [19]:
# to see updated number of columns as duplicates entries are removed
tweetData.shape

(24506, 2)

In [20]:
tweetData.isnull().sum()

tweet     0
labels    0
dtype: int64

In [21]:
lemmatizer=WordNetLemmatizer()
def lemmatizing(data):
    tweet=[lemmatizer.lemmatize(word) for word in data]
    return data

In [22]:
#lemmatizing the processed data
import nltk
tweetData.loc[:, 'tweet']=tweetData['tweet'].apply(lambda x: lemmatizing(x))

In [23]:
# to see the data after pre-processing
print(tweetData["tweet"].iloc[0],"\n")
print(tweetData["tweet"].iloc[1],"\n")
print(tweetData["tweet"].iloc[2],"\n")
print(tweetData["tweet"].iloc[3],"\n")
print(tweetData["tweet"].iloc[4],"\n")

rt mayasolovely woman shouldnt complain cleaning house amp man always take trash 

rt boy dats coldtyga dwn bad cuffin dat hoe place 

rt urkindofbrand dawg rt ever fuck bitch start cry confused shit 

rt cganderson vivabased look like tranny 

rt shenikaroberts shit hear might true might faker bitch told ya 



In [24]:
# to see distribution of labels
tweetData['labels'].value_counts()

labels
Offensive Language       18984
No Hate and Offensive     4113
Hate Speech               1409
Name: count, dtype: int64

In [25]:
fig = px.bar(tweetData['labels'].value_counts(), x=tweetData['labels'].value_counts().index, y=tweetData['labels'].value_counts())
fig.update_layout(xaxis={'categoryorder':'total descending'}, xaxis_title="Labels", yaxis_title="Count")
fig.show()


In [26]:
# visualizing data using piechart
colors = ('red', 'green', 'blue')
wp = {'linewidth':2, "edgecolor":'black'}
tags = tweetData['labels'].value_counts()
px.pie(tweetData['labels'].value_counts(), 
       values=tweetData['labels'].value_counts(), 
       names=tweetData['labels'].value_counts().index,
       title="Distribution of sentiments")

In [27]:
tweetData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24506 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   24506 non-null  object
 1   labels  24506 non-null  object
dtypes: object(2)
memory usage: 574.4+ KB


In [28]:
# clear a classfication training model using pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# splitting the data into training and testing
X = tweetData['tweet']
y = tweetData['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# creating pipeline
# model1 = Pipeline([('tfidf', TfidfVectorizer()), ('logistic', LogisticRegression())])
model2 = Pipeline([('tfidf', TfidfVectorizer()), ('randomforest', RandomForestClassifier())])
model3 = Pipeline([('tfidf', TfidfVectorizer()), ('gradientboosting', GradientBoostingClassifier())])
model4 = Pipeline([('tfidf', TfidfVectorizer()), ('naivebayes', MultinomialNB())])

# parameters for grid search
# param1 = {'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)], 'logistic__C': [1, 10, 100]}
param2 = {'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)], 'randomforest__n_estimators': [100, 200, 300]}
param3 = {'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)], 'gradientboosting__n_estimators': [100, 200, 300]}
param4 = {'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)], 'naivebayes__alpha': [0.1, 1, 10]}

# grid search
# grid1 = GridSearchCV(model1, param1, cv=5)
grid2 = GridSearchCV(model2, param2, cv=5)
grid3 = GridSearchCV(model3, param3, cv=5)
grid4 = GridSearchCV(model4, param4, cv=5)

grid4

In [30]:
# saving the models
import joblib
import os

if not os.path.exists("models"):
    os.makedirs("models")

In [31]:
if not os.path.exists(os.path.join("models", "random_forest.pkl")):
    # fitting the model 2
    grid2.fit(X_train, y_train)

    # evaluating the model 2
    y_pred = grid2.predict(X_test)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    # plotting confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid2.classes_)
    disp.plot()
    joblib.dump(grid2, os.path.join("models", "random_forest.pkl"))

In [32]:
# import os
# if not os.path.exists(os.path.join("models", "gradient_boosting.pkl")):

#     # fitting the model 3
#     grid3.fit(X_train, y_train)

#     # evaluating the model 3
#     y_pred = grid3.predict(X_test)
#     print("Accuracy: ", accuracy_score(y_test, y_pred))
#     print(classification_report(y_test, y_pred))
#     print(confusion_matrix(y_test, y_pred))

#     # plotting confusion matrix
#     cm = confusion_matrix(y_test, y_pred)
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid3.classes_)
#     disp.plot()

#     # saving the models
#     joblib.dump(grid3, os.path.join("models", "gradient_boosting.pkl"))

In [33]:
if not os.path.exists(os.path.join("models", "naive_bayes.pkl")):
    # fitting the model 4
    grid4.fit(X_train, y_train)

# evaluating the model 4
    y_pred = grid4.predict(X_test)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

     # plotting confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid4.classes_)
    disp.plot()

    # saving the models
    joblib.dump(grid4, os.path.join("models", "naive_bayes.pkl"))


In [34]:
# # saving the models
import joblib
import os

if not os.path.exists("models"):
         os.makedirs("models")

# joblib.dump(grid1, os.path.join("models", "logistic_regression.pkl"))
# joblib.dump(grid2, os.path.join("models", "random_forest.pkl"))
# joblib.dump(grid3, os.path.join("models", "gradient_boosting.pkl"))
joblib.dump(model4, os.path.join("models", "naive_bayes.pkl"))

['models\\naive_bayes.pkl']

In [35]:
# required import
import joblib
import os


# prediction function
def predict_sentiment(text):
    # loading the models
    random_forest_model = joblib.load(os.path.join("models", "naive_bayes.pkl"))
    # predicting the sentiment
    random_forest_prediction = random_forest_model.predict([text])[0]
    result = {"prediction": random_forest_prediction}
    return result

# testing the prediction function
text = "As a woman you shouldn't complain about cleaning up your house"
ans = predict_sentiment(text)
print(ans)

text = "Check out our 12th man"
ans = predict_sentiment(text)
print(ans)

NotFittedError: The TF-IDF vectorizer is not fitted