<a href="https://colab.research.google.com/github/ApollineFo/NLP---Hate-Speech/blob/main/hate_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#calling important libraries
import pandas as pd #for data manipulation
import numpy as np #for data manipulation
import matplotlib as plt #for plotting
import os #for ease of python system interaction
import sys
import re
import nltk
nltk.download("stopwords")
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
#retrieving the data
url = "https://raw.githubusercontent.com/ApollineFo/NPL/main/data/labeled_data.csv"

data = pd.read_csv(url)
#checking the dataset features
data.keys()
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [15]:
#extracting only tweets as features
features = data.iloc[:,6].values
#extracting labels
classes = data.iloc[:,5].values

In [16]:
#preprocessing tweets
processed_features = []

for sentence in range(0, len(features)):
    #Removing special characters
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))
    #removing single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    #Removing single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    #Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
    #Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    #Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [17]:
#specifying parameters
parameters = [
    {
    'vect__max_df': (0.9,), #ignore terms with frequency higher than aforementioned
        'vect__min_df': (2,), #ignore lower frequencies than aforementioned
        'vect__ngram_range': ((1, 1),), #only unigrams
        'clf__estimator__kernel': ['rbf'], #gausian kernel
    'clf__estimator__gamma': [1e0,], #a gamma of zero
        'clf__estimator__C': [1,],
        'clf__estimator__class_weight': [None, "balanced"] #balanced weight
    },
    {
        'vect__max_df': (0.9,),
        'vect__min_df': (2,),
        'vect__ngram_range': ((1, 1),),
        'clf__estimator__kernel': ['linear'], #linear kernel
    'clf__estimator__C': [1,]
    }
]

In [18]:
# building a pipeline
pipeline = Pipeline([('vect', TfidfVectorizer(parameters)),                     
    ('clf', OneVsRestClassifier(SVC(probability=True))),
])

In [19]:
#splitting the dataset to trian and test set
#80 percent of data for training, 20 percent for testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, classes, test_size=0.2, random_state=42)

In [20]:
#grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [None]:
#fitting the classifier
classifier = grid_search.fit(X_train, y_train)

#predicting
y_predict = classifier.predict(X_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [13]:
#confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_predict))
print(classification_report(y_test, y_predict))

[[  51  198   41]
 [  38 3678  116]
 [   4   86  745]]
              precision    recall  f1-score   support

           0       0.55      0.18      0.27       290
           1       0.93      0.96      0.94      3832
           2       0.83      0.89      0.86       835

    accuracy                           0.90      4957
   macro avg       0.77      0.68      0.69      4957
weighted avg       0.89      0.90      0.89      4957



In [None]:
#evaluating the model with ROC AUC score
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, classifier.predict_proba(X_test), multi_class="ovr")

#with a higher score than 0.5, signifies that our model is useful.

0.630180719651229

In [None]:
#evaluating the model using cross validation
from sklearn.model_selection import cross_val_score
all_accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5)

#returning accuracies of the folds
print(all_accuracies)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.2s finished


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.1s finished


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.2s finished


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.0s finished


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    4.1s finished


[0.56880734 0.56880734 0.58715596 0.55963303 0.56481481]


In [None]:
#printing standard deviation of accuracies
print(all_accuracies.std())

#earning a low variance meaning; a good indicator that the model will perform similar on all test sets and the prediction obtained is not by chance

0.00928881117995757
