# Classification of Hate Speech in Tweets using C-Support Vector Machine

## Preparation

### Importing Libraries

In [1]:
import os
import sys

#sys.path.append(os.path.dirname((os.path.abspath(''))))
sys.path.append("../data")

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from preprocessing import load_data, preprocess, train_tfidf, split_data, upsampling, get_features, setup


  demoji.download_codes()
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mayte\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Loading Data

In [2]:
# Pre-processing: call the project's setup() with its default arguments.
# The three returned objects are used below as the TF-IDF vectorizer
# (only .transform is ever called on it, so it is presumably already fitted)
# and the train/test DataFrames.
tfidf, df_train, df_test = setup()

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

There is 25569 training data, of which 7.02% is hate speech 
There is 6393 test data, of which 7.01% is hate speech 


In [3]:
# Sanity check: row count and percentage of hate-speech labels (sum / count)
# in the current train and test frames.
# NOTE(review): the recorded output of this cell (47550 rows, 50.0%) differs
# from what setup() printed in the previous cell (25569 rows, 7.02%) —
# presumably left over from an upsampled run; re-run top to bottom to confirm.
train_labels = df_train['label']
test_labels = df_test['label']
print('There is {} training data, of which {}% is hate speech '.format(
    train_labels.count(),
    round(train_labels.sum() / train_labels.count() * 100, 2),
))
print('There is {} test data, of which {}% is hate speech '.format(
    test_labels.count(),
    round(test_labels.sum() / test_labels.count() * 100, 2),
))

There is 47550 training data, of which 50.0% is hate speech 
There is 6393 test data, of which 7.01% is hate speech 


## Setup Training Function (with GridSearch)

The parameter `class_weight='balanced'` has proven very useful; however, it has a similar effect to upsampling and is therefore left out of this analysis.

In [4]:
# Full grid-search space for the SVC:
#   C            - 1, 51, 101, ... 2951 (step 50)
#   kernel       - linear and RBF
#   class_weight - None only
#   gamma        - 'scale' plus a fixed ladder of explicit values
param_grid = dict(
    C=list(range(1, 3000, 50)),
    kernel=["linear", "rbf"],
    class_weight=[None],
    gamma=['scale', 0.00003, 0.0003, 0.003, 0.03, 0.3, 1, 3, 8],
)

In [5]:
# Testing override: replaces param_grid with a single candidate so the
# notebook runs quickly end to end.
# REMOVE this cell before a real run, otherwise the full grid is never searched.
param_grid = dict(
    C=[1],
    kernel=["linear"],
    class_weight=[None],
    gamma=['scale'],
)

In [6]:
def train_svm(df_train: pd.DataFrame, tfidf: TfidfVectorizer, search_space=None):
    """Grid-search an SVC on TF-IDF features and return the best estimator.

    Args:
        df_train: Training frame. The 'preprocessed' column is passed to
            ``tfidf.transform`` and the 'label' column is used as target.
        tfidf: Vectorizer used to turn the text into features. Only
            ``transform`` is called, so it must already be fitted.
        search_space: Optional hyperparameter grid for ``GridSearchCV``.
            When omitted, the notebook-level ``param_grid`` is used as it is
            at call time (the original behaviour).

    Returns:
        The ``best_estimator_`` of the fitted search (selected by F1 score
        under 5-fold cross-validation).
    """
    # Resolve the grid explicitly so the dependency on the global is visible.
    if search_space is None:
        search_space = param_grid

    X_train = tfidf.transform(df_train['preprocessed'])
    y_train = df_train['label']

    # C-Support Vector Machine, exhaustive search over the grid
    svm_grid = GridSearchCV(
        svm.SVC(random_state=55),
        param_grid=search_space,
        verbose=10,
        n_jobs=-1,
        scoring='f1',
        cv=5,
    )
    svm_grid.fit(X_train, y_train)

    return svm_grid.best_estimator_

### Setup Testing Function

In [7]:
def test_model(model, df_test: pd.DataFrame, tfidf: TfidfVectorizer):
    """Evaluate a fitted model on the test frame.

    Args:
        model: Fitted estimator exposing ``predict`` and ``get_params``.
        df_test: Test frame. The 'preprocessed' column is vectorized and the
            'label' column is used as ground truth.
        tfidf: Vectorizer whose ``transform`` produces the model's features.

    Returns:
        A list ``[params, precision, recall, accuracy, f1]`` in that order.
    """
    features = tfidf.transform(df_test['preprocessed'])
    truth = df_test['label']
    predicted = model.predict(features)

    return [
        model.get_params(),
        precision_score(truth, predicted),
        recall_score(truth, predicted),
        accuracy_score(truth, predicted),
        f1_score(truth, predicted),
    ]

### Setup Result List

In [8]:
# Grid-search results: the experiment cells below append a label string
# followed by the list returned by test_model().
results = list()

### Only Tokenization

In [9]:
# Baseline: stopwords kept, emoji flag off, no stemming, no lemmatization, no upsampling
tfidf, df_train, df_test = setup(
    rem_stop=False,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=False,
)
svm_cv = train_svm(df_train, tfidf)
# Record the label first, then the metrics for this configuration.
results.append("Only Tokenization \n")
results.append(test_model(svm_cv, df_test, tfidf))

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

There is 25569 training data, of which 7.02% is hate speech 
There is 6393 test data, of which 7.01% is hate speech 
Fitting 5 folds for each of 1 candidates, totalling 5 fits


### Remove Stopwords

In [11]:
# Stopword removal on; emoji flag off, no stemming, no lemmatization, no upsampling
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=False,
)
svm_cv = train_svm(df_train, tfidf)
# Record the label first, then the metrics for this configuration.
results.append("\n\nRemove Stopwords \n")
results.append(test_model(svm_cv, df_test, tfidf))

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (C:\Users\Mayte\.cache\huggingface\datasets\tweets_hate_speech_detection\default\0.0.0\c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)


  0%|          | 0/1 [00:00<?, ?it/s]

There is 25569 training data, of which 7.02% is hate speech 
There is 6393 test data, of which 7.01% is hate speech 
Fitting 5 folds for each of 1 candidates, totalling 5 fits


### Emojis

In [None]:
# Stopword removal and emoji flag on; no stemming, no lemmatization, no upsampling
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=True,
)
svm_cv = train_svm(df_train, tfidf)
results.append("\n\nEmojis \n")
# Evaluate the estimator trained just above (was the misspelled, undefined `scm_cv`).
results.append(test_model(svm_cv, df_test, tfidf))

### Stemming

In [None]:
# Stopword removal, emoji flag and stemming on; no lemmatization, no upsampling
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=True,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=True,
)
svm_cv = train_svm(df_train, tfidf)
# Record the label first, then the metrics for this configuration.
results.append("\n\nStemming \n")
results.append(test_model(svm_cv, df_test, tfidf))

### Upsampling

In [None]:
# Stopword removal, emoji flag, stemming and upsampling on; no lemmatization
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=True,
    do_lem=False,
    split=True,
    upsample=True,
    do_emojis=True,
)
svm_cv = train_svm(df_train, tfidf)
results.append("\n\nUpsampling \n")
# Evaluate the estimator trained just above (was the undefined name `model`).
results.append(test_model(svm_cv, df_test, tfidf))

### All-but-Stemming

In [None]:
# Stopword removal, emoji flag and upsampling on; no stemming, no lemmatization
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=True,
    do_emojis=True,
)
svm_cv = train_svm(df_train, tfidf)
results.append("\n\nAll-but-Stemming \n")
# Evaluate the estimator trained just above (was the undefined name `model`).
results.append(test_model(svm_cv, df_test, tfidf))

## Setup Training Function (with RandomSearch)

In [None]:
# Candidate pool for the randomized search:
#   C            - every integer from 1 to 2999
#   kernel       - linear, RBF, polynomial and sigmoid
#   class_weight - None or 'balanced'
#   gamma        - 'scale' plus a fixed ladder of explicit values
param_grid = dict(
    C=list(range(1, 3000)),
    kernel=["linear", "rbf", "poly", "sigmoid"],
    class_weight=[None, "balanced"],
    gamma=['scale', 0.00003, 0.0003, 0.003, 0.03, 0.3, 1, 3, 8],
)

In [None]:
# Testing override: replaces param_grid with a single candidate so the
# notebook runs quickly end to end.
# REMOVE this cell before a real run, otherwise the full pool is never sampled.
param_grid = dict(
    C=[1],
    kernel=["linear"],
    class_weight=[None],
    gamma=['scale'],
)

In [None]:
def train_svm(df_train: pd.DataFrame, tfidf: TfidfVectorizer, search_space=None):
    """Randomized-search an SVC on TF-IDF features and return the best estimator.

    Note: this redefinition replaces the earlier grid-search ``train_svm``
    for every cell executed after it.

    Args:
        df_train: Training frame. The 'preprocessed' column is passed to
            ``tfidf.transform`` and the 'label' column is used as target.
        tfidf: Vectorizer used to turn the text into features. Only
            ``transform`` is called, so it must already be fitted.
        search_space: Optional parameter distributions for
            ``RandomizedSearchCV``. When omitted, the notebook-level
            ``param_grid`` is used as it is at call time (the original
            behaviour).

    Returns:
        The ``best_estimator_`` of the fitted search (selected by F1 score
        under 5-fold cross-validation).
    """
    # Resolve the candidate pool explicitly so the dependency on the global is visible.
    if search_space is None:
        search_space = param_grid

    X_train = tfidf.transform(df_train['preprocessed'])
    y_train = df_train['label']

    # C-Support Vector Machine (RandomizedSearchCV).
    # random_state pins which candidates are sampled so re-runs are reproducible;
    # the seed matches the one already used for the SVC itself.
    svm_grid = RandomizedSearchCV(
        svm.SVC(random_state=55),
        search_space,
        verbose=10,
        n_jobs=-1,
        scoring='f1',
        cv=5,
        random_state=55,
    )
    svm_grid.fit(X_train, y_train)

    return svm_grid.best_estimator_

### Setup Result List (RandomSearch)

In [None]:
# Randomized-search results: the experiment cells below append a label string
# followed by the list returned by test_model().
results_random = list()

### Only Tokenization

In [None]:
# Baseline: stopwords kept, emoji flag off, no stemming, no lemmatization, no upsampling
tfidf, df_train, df_test = setup(
    rem_stop=False,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=False,
)
svm_cv = train_svm(df_train, tfidf)
# Record the label first, then the metrics for this configuration.
results_random.append("Only Tokenization \n")
results_random.append(test_model(svm_cv, df_test, tfidf))

### Remove Stopwords

In [None]:
# Stopword removal on; emoji flag off, no stemming, no lemmatization, no upsampling
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=False,
)
svm_cv = train_svm(df_train, tfidf)
# Record the label first, then the metrics for this configuration.
results_random.append("\n\nRemove Stopwords \n")
results_random.append(test_model(svm_cv, df_test, tfidf))

### Emojis

In [None]:
# Stopword removal and emoji flag on; no stemming, no lemmatization, no upsampling
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=True,
)
svm_cv = train_svm(df_train, tfidf)
results_random.append("\n\nEmojis \n")
# Evaluate the estimator trained just above (was the misspelled, undefined `scm_cv`).
results_random.append(test_model(svm_cv, df_test, tfidf))

### Stemming

In [None]:
# Stopword removal, emoji flag and stemming on; no lemmatization, no upsampling
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=True,
    do_lem=False,
    split=True,
    upsample=False,
    do_emojis=True,
)
svm_cv = train_svm(df_train, tfidf)
# Record the label first, then the metrics for this configuration.
results_random.append("\n\nStemming \n")
results_random.append(test_model(svm_cv, df_test, tfidf))

### Upsampling

In [None]:
# Stopword removal, emoji flag, stemming and upsampling on; no lemmatization
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=True,
    do_lem=False,
    split=True,
    upsample=True,
    do_emojis=True,
)
svm_cv = train_svm(df_train, tfidf)
results_random.append("\n\nUpsampling \n")
# Evaluate the estimator trained just above (was the undefined name `model`).
results_random.append(test_model(svm_cv, df_test, tfidf))

### All-but-Stemming

In [None]:
# Stopword removal, emoji flag and upsampling on; no stemming, no lemmatization
tfidf, df_train, df_test = setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    split=True,
    upsample=True,
    do_emojis=True,
)
svm_cv = train_svm(df_train, tfidf)
results_random.append("\n\nAll-but-Stemming \n")
# Evaluate the estimator trained just above (was the undefined name `model`).
results_random.append(test_model(svm_cv, df_test, tfidf))

## Best Results

In [None]:
def write_results(entries, path):
    """Write a results list to a text file, one value per line.

    Args:
        entries: Iterable mixing label strings and non-string iterables
            (such as the lists returned by test_model). Strings are written
            as-is; every item of a non-string entry is written on its own line.
        path: Destination file. It is created or overwritten.
    """
    # The context manager closes the file even if a write fails.
    with open(path, "w", encoding="utf-8") as textfile:
        for element in entries:
            if isinstance(element, str):
                textfile.write(element + "\n")
            else:
                for subelement in element:
                    textfile.write(str(subelement) + "\n")


# Export is intentionally left disabled, as in the original cell.
# NOTE(review): the original snippet iterated `results` but wrote to
# "results_random_svm.txt" — confirm which list belongs in this file.
# write_results(results, "results_random_svm.txt")

In [None]:
# TODO: summarize the best results here (ideally as a markdown cell)

## Evaluation of Model Performance

In [None]:
# Final model: fit an SVC with fixed hyperparameters
# (presumably the best ones found by the searches above — TODO confirm).
# X_train / y_train / X_test were never defined at notebook level, so they are
# derived here from the frames produced by the most recent setup() call.
# NOTE(review): confirm that this preprocessing configuration is the one the
# hyperparameters were tuned on.
X_train = tfidf.transform(df_train['preprocessed'])
y_train = df_train['label']
X_test = tfidf.transform(df_test['preprocessed'])
y_test = df_test['label']

classifier = svm.SVC(kernel="rbf", C=2496, class_weight="balanced", gamma=0.3)
classifier.fit(X_train, y_train)

prediction = classifier.predict(X_test)

In [None]:
# Score the final classifier's predictions against the test labels.
# All four metrics use `prediction` from the fit cell; the original mixed it
# with an undefined `y_pred` and had an unbalanced parenthesis.
y_test = df_test['label']
print("accuracy: ", accuracy_score(y_test, prediction))
print("precision: ", precision_score(y_test, prediction))
print("recall: ", recall_score(y_test, prediction))
print("f1: ", f1_score(y_test, prediction))

In [None]:
# Confusion matrix of the final classifier on the test labels.
# Uses `prediction` from the fit cell (the original referenced an undefined `y_pred`).
cm = confusion_matrix(df_test['label'], prediction)

# Plot with scikit-learn's own display class; the original called
# plot_confusion_matrix, which is never imported in this notebook.
fig, ax = plt.subplots()
ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)
ax.set_title("Confusion Matrix")
plt.show()