# List of Libraries using in this notebook
#### pandas: For data manipulation and analysis
#### numpy: For numerical operations
#### matplotlib: For data visualization
#### seaborn: For data visualization
#### wordcloud: For data visualization
#### pickle: For saving and loading the model
#### re: For regular expression
#### string: For string operations
#### nltk: For natural language processing
#### nltk.util: which is used to generate bigrams and trigrams
#### nltk.corpus:  which is used to get the list of stopwords
#### sklearn: For machine learning algorithms


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import pickle
import re
import string
import nltk
from nltk.util import pr
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/binurathiranjaya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Configuration
#### nltk.download('stopwords'): To download the stopwords
#### nltk.SnawballStemmer('english'): To get the stem of the word
#### set(stopwords.words('english')): To get the list of stopwords

In [None]:
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))


## Load the data
# Read the data using pandas using read_csv() function
# Display the first five rows of the data using head() function

In [39]:
data = pd.read_csv("labeled_data.csv")
print(data.head())

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


## Data Cleaning
# Clean the data by removing the missing values and url links

In [36]:
data["labels"] = data["class"].map({0: "Hate Speech", 
                                    1: "Offensive Language", 
                                    2: "No Hate and Offensive"})
data = data[["tweet", "labels"]]
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["tweet"] = data["tweet"].apply(clean)

print(data.head())

                                               tweet                 labels
0   rt mayasolov woman shouldnt complain clean ho...  No Hate and Offensive
1   rt  boy dat coldtyga dwn bad cuffin dat hoe  ...     Offensive Language
2   rt urkindofbrand dawg rt  ever fuck bitch sta...     Offensive Language
3             rt cganderson vivabas look like tranni     Offensive Language
4   rt shenikarobert shit hear might true might f...     Offensive Language


In [None]:
# print No Hate and Offensive tweets
non_hate_offensive_tweets = data[data["labels"] == "No Hate and Offensive"]
print(non_hate_offensive_tweets.head())

                                                tweet                 labels
0    rt mayasolov woman shouldnt complain clean ho...  No Hate and Offensive
40                momma said pussi cat insid doghous   No Hate and Offensive
63      simplyaddictedtoguy  woof woof hot scalli lad  No Hate and Offensive
66                allaboutmanfeet  woof woof hot sole  No Hate and Offensive
67    allyhaaaaa lemmi eat oreo amp dish one oreo lol  No Hate and Offensive


In [None]:
# print Offensive Language tweets
offensive_tweets = data[data["labels"] == "Offensive Language"]
print(offensive_tweets.head())

                                               tweet              labels
1   rt  boy dat coldtyga dwn bad cuffin dat hoe  ...  Offensive Language
2   rt urkindofbrand dawg rt  ever fuck bitch sta...  Offensive Language
3             rt cganderson vivabas look like tranni  Offensive Language
4   rt shenikarobert shit hear might true might f...  Offensive Language
5  tmadisonx shit blow meclaim faith somebodi sti...  Offensive Language


In [None]:
# print Hate Speech tweets
hate_tweets = data[data["labels"] == "Hate Speech"]
print(hate_tweets.head())

                                                 tweet       labels
85                         whalelookyher  queer gaywad  Hate Speech
89    whitethunduh alsarabsss hes beaner smh tell h...  Hate Speech
110  devilgrimz vigxrart your fuck gay blacklist ho...  Hate Speech
184  markroundtreejr lmfaoooo hate black peopl  the...  Hate Speech
202                   nochillpaz least im nigger lmfao  Hate Speech


In [None]:
# visualizing hate and not hate tweets

sns.set_theme(style="darkgrid")
sns.countplot(data["labels"])
plt.show()


In [None]:
# Only Hate Speech tweets
hate_speech = data[data["labels"] == "Hate Speech"]
hate_speech = hate_speech["tweet"]
hate_speech = " ".join(hate_speech)
wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopword, 
                min_font_size = 10).generate(hate_speech)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()


In [None]:
# Only Offensive Language tweets
offensive_language = data[data["labels"] == "Offensive Language"]
offensive_language = offensive_language["tweet"]
offensive_language = " ".join(offensive_language)
wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopword, 
                min_font_size = 10).generate(offensive_language)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()



In [None]:
# Only No Hate and Offensive tweets
no_hate_offensive = data[data["labels"] == "No Hate and Offensive"]
no_hate_offensive = no_hate_offensive["tweet"]
no_hate_offensive = " ".join(no_hate_offensive)
wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopword, 
                min_font_size = 10).generate(no_hate_offensive)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

x = np.array(data["tweet"])
y = np.array(data["labels"])

cv = CountVectorizer()
X = cv.fit_transform(x) # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)

In [9]:
sample = "I am  a good boy"
data = cv.transform([sample]).toarray()
print(clf.predict(data))

['No Hate and Offensive']


In [28]:
# Testing the model
predictions = clf.predict(X_test)
print(predictions)

# Evaluating the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))
print("Classification Report: \n", classification_report(y_test, predictions))
print("Accuracy: \n", accuracy_score(y_test, predictions))



['Offensive Language' 'Offensive Language' 'Offensive Language' ...
 'Offensive Language' 'Offensive Language' 'Offensive Language']
Confusion Matrix: 
 [[ 156   41  268]
 [  38 1124  217]
 [ 227  218 5890]]
Classification Report: 
                        precision    recall  f1-score   support

          Hate Speech       0.37      0.34      0.35       465
No Hate and Offensive       0.81      0.82      0.81      1379
   Offensive Language       0.92      0.93      0.93      6335

             accuracy                           0.88      8179
            macro avg       0.70      0.69      0.70      8179
         weighted avg       0.87      0.88      0.88      8179

Accuracy: 
 0.8766352854872234


In [31]:
# Saving the model
pickle.dump(clf, open("model.pkl", "wb"))
pickle.dump(cv, open("cv.pkl", "wb"))

# Loading the model
model = pickle.load(open("model.pkl", "rb"))
cv = pickle.load(open("cv.pkl", "rb"))



# Testing the model
# predictions = model.predict(X_test)
# print(predictions)

# Evaluating the model


['No Hate and Offensive']


In [34]:
sample = "I will kill you " 
data = cv.transform([sample]).toarray()
print(model.predict(data))

['Hate Speech']
