# Classifying hate speech tweets
## Using Multi-variate Bernoulli Naive Bayes and Multinomial Naive Bayes in combination with count vectorizer and TF-IDF vectorizer

### To Do's
- Modify Preprocessing (currently default count and TF-IDF vectorizer preprocessing used)
    - Implement Emoji transformation
    - Implement Morphological Normalization (e.g. Stemming)

In [None]:
import numpy as np
import pandas as pd
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as text
import sklearn.naive_bayes as nb
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
from datasets import load_dataset

# Load the "tweets_hate_speech_detection" dataset; the cells below only read
# its 'train' split.
dataset = load_dataset("tweets_hate_speech_detection")

In [None]:
# Turn the 'train' split into a pandas DataFrame; later cells read its
# 'tweet' (text) and 'label' columns.
df = pd.DataFrame.from_dict(dataset['train'])
# Preview the first rows.
df.head()

In [None]:
# Currently not in use
# Translation table that deletes every character of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuation)


def remove_punctioation(text: str) -> str:
    """Return ``text`` with all ASCII punctuation characters removed.

    Only the characters listed in ``string.punctuation`` are dropped; every
    other character (letters, digits, whitespace, non-ASCII symbols) is kept
    in its original order.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation; ``""`` for empty input.
    """
    # One C-level pass instead of a Python-level membership test per character.
    return text.translate(_PUNCTUATION_TABLE)

def tokenization(text: str) -> list:
    """Split ``text`` into tokens by delegating to ``nltk.word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` produces for ``text``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

# Lazily filled cache of the NLTK English stop words (see remove_stopwords).
_ENGLISH_STOPWORDS = None


def remove_stopwords(tokens) -> list:
    """Return the tokens that are not NLTK English stop words.

    The comparison is an exact string match, so tokens should already be
    lower-cased by the caller (as ``preProcess`` does). Order and duplicates
    of the remaining tokens are preserved.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list containing only the non-stop-word tokens.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Load the corpus once and keep it as a set: the function is called
        # once per tweet, and list membership would be a linear scan per token.
        # Loading lazily keeps any corpus lookup error at first use, as before.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return [token for token in tokens if token not in _ENGLISH_STOPWORDS]

# Single stemmer instance shared by every call to stemming().
porter_stemmer = PorterStemmer()


def stemming(text: list) -> list:
    """Return the Porter stem of every word in ``text``, in the same order.

    Args:
        text: Sequence of word tokens (despite the parameter name).

    Returns:
        A new list with one stemmed string per input word.
    """
    stem_word = porter_stemmer.stem
    return list(map(stem_word, text))

def preProcess(list):
    """Apply the full cleaning pipeline, including stemming, to each tweet.

    Per element: lower-case, strip punctuation, tokenize, drop English stop
    words, then Porter-stem the remaining tokens.

    Args:
        list: Object offering ``.apply`` whose elements are strings; the
            notebook passes a pandas Series (``df["tweet"]``). The name
            shadows the builtin but is kept for interface compatibility.

    Returns:
        The result of ``.apply``: one list of stemmed tokens per element.
    """
    def _clean_and_stem(tweet):
        lowered = tweet.lower()
        stripped = remove_punctioation(lowered)
        tokens = tokenization(stripped)
        content_tokens = remove_stopwords(tokens)
        return stemming(content_tokens)

    return list.apply(_clean_and_stem)

def preProcess2(list):
    """Apply the cleaning pipeline without stemming to each tweet.

    Per element: lower-case, strip punctuation, tokenize, then drop English
    stop words.

    Args:
        list: Object offering ``.apply`` whose elements are strings
            (presumably a pandas Series, like ``preProcess`` receives). The
            name shadows the builtin but is kept for interface compatibility.

    Returns:
        The result of ``.apply``: one list of tokens per element.
    """
    def _clean(tweet):
        lowered = tweet.lower()
        stripped = remove_punctioation(lowered)
        tokens = tokenization(stripped)
        return remove_stopwords(tokens)

    return list.apply(_clean)

def dummy(text):
    """Identity function: hand back ``text`` untouched.

    Referenced as ``tokenizer``/``preprocessor`` in the disabled TF-IDF cell
    so that already tokenized input passes through the vectorizer unchanged.
    """
    unchanged = text
    return unchanged

def validate(y_test, y_pred):
    """Print precision, recall, accuracy and F1 score of a prediction.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The four scores are printed, one per line, with three decimals.
    """
    report = (
        ('Precision: %.3f', precision_score),
        ('Recall: %.3f', recall_score),
        ('Accuracy: %.3f', accuracy_score),
        ('F1 Score: %.3f', f1_score),
    )
    for template, metric in report:
        print(template % metric(y_test, y_pred))

In [None]:
# Currently not in use: the resulting 'preprocessed' column is only read by
# the disabled TF-IDF cell further down.
# Each tweet is lower-cased, stripped of punctuation, tokenized, filtered for
# English stop words and Porter-stemmed (see preProcess).
df["preprocessed"] = preProcess(df["tweet"])

In [None]:
# Remove the "@user" mention tokens from the tweets.
# NOTE(review): assumes mentions appear literally as "@user" in this dataset —
# confirm against the data.
# The previous plain "user" pattern also cut the substring out of ordinary
# words (e.g. "users" -> "s") and left the "@" behind. regex is passed
# explicitly because the pandas default for it differs between versions.
df['no_user'] = df['tweet'].str.replace(r"@user\b", "", regex=True)

In [None]:
# Preview the frame again, now including the added 'preprocessed' and
# 'no_user' columns.
df.head()

# Model Building

In [None]:
# Definition of the label vector (label 1 is counted as hate speech in the
# share printouts below).
y = df['label']

## Using TF-IDF Vectorizer

In [None]:
# Disabled variant, kept as a bare string literal so none of it executes:
# TF-IDF on the already tokenized 'preprocessed' column, with the identity
# function `dummy` as tokenizer and preprocessor so the vectorizer does no
# text processing of its own.
'''# Using allready preprocessed tweets
tf = text.TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None)


X_vec = tf.fit(df['preprocessed'])
X = X_vec.transform(df['preprocessed'])

print(X.shape)'''

In [None]:
# Using unpreprocessed tweets: the raw 'tweet' text goes through the
# vectorizer's own default preprocessing, with its built-in English stop-word
# list removed.
tf = text.TfidfVectorizer(stop_words='english')

# NOTE(review): the vectorizer is fitted on ALL tweets before the train/test
# split below, so the vocabulary and IDF weights also see the test tweets.
# Consider fitting on the training portion only.
X_vec = tf.fit(df['tweet'])
X = X_vec.transform(df['tweet'])

# Rows are tweets, columns are vocabulary terms.
print(X.shape)

In [None]:
# Show the fitted vocabulary (term -> feature column index).
tf.vocabulary_

In [None]:
# Disabled variant, kept as a bare string literal so none of it executes:
# the same TF-IDF feature matrix, but built from the 'no_user' column.
'''# No User string
# Definition der Feature-Matrix
tf = text.TfidfVectorizer(stop_words='english')

X_vec = tf.fit(df['no_user'])
X = X_vec.transform(df['no_user'])

print(X.shape)'''

### Splitting Data into train and test data

In [None]:
# Hold out 20% of the rows as test data. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
    stratify=y,
)

In [None]:
# Testing stratification: print the size of each part and the percentage of
# rows labelled 1 (hate speech). Both percentages should be nearly equal.
print('There is {} training data, of which {}% is hate speech '.format(y_train.count(), round(y_train.sum()/y_train.count()*100,2)))
print('There is {} test data, of which {}% is hate speech '.format(y_test.count(), round(y_test.sum()/y_test.count()*100,2)))

### Train Models
The method GridSearchCV is used for hyperparameter optimization. In the following cases the smoothing parameter alpha is optimized.

In [None]:
# Multi-variate Bernoulli Naive Bayes on the TF-IDF features: grid search
# (default cross-validation and scoring) over 50 log-spaced values of the
# smoothing parameter alpha between 1e-2 and 1e2.
bnb = ms.GridSearchCV(
    estimator=nb.BernoulliNB(),
    param_grid={'alpha': np.logspace(-2., 2., num=50)},
)
bnb.fit(X_train, y_train);

# Multinomial Naive Bayes, tuned over the same alpha grid.
mnb = ms.GridSearchCV(
    estimator=nb.MultinomialNB(),
    param_grid={'alpha': np.logspace(-2., 2., num=50)},
)
mnb.fit(X_train, y_train);

### Evaluate Model Performance based on TF-IDF Vectorizer

In [None]:
# Test-set scores of the tuned Bernoulli model on the TF-IDF features.
print('Scores with the multi-variate Bernoulli Naive Bayes:')
validate(y_test, bnb.predict(X_test))

In [None]:
# Test-set scores of the tuned Multinomial model on the TF-IDF features.
# The heading previously read "Multinominal Bernoulli", which names neither
# model correctly; the scores below belong to Multinomial Naive Bayes (mnb).
print('Scores with the Multinomial Naive Bayes:')
validate(y_test, mnb.predict(X_test))

In [None]:
# Confusion matrix of the tuned Multinomial model (TF-IDF features) on the
# test set: true labels against predicted labels.
cm = confusion_matrix(y_test, mnb.predict(X_test))

# Draw the matrix with mlxtend's helper and show the figure.
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title("Confusion Matrix")
plt.show()

# Using Count Vectorizer

In [None]:
# Disabled variant, kept as a bare string literal so none of it executes:
# count features built from the 'no_user' column instead of 'tweet'.
'''# No user string
co = text.CountVectorizer(stop_words='english')

Xc_vec = co.fit(df['no_user'])
Xc = Xc_vec.transform(df['no_user'])

print(Xc.shape)'''

In [None]:
# Raw term counts of the unpreprocessed tweets, with the vectorizer's
# built-in English stop-word list removed.
co = text.CountVectorizer(stop_words='english')

# NOTE(review): the vocabulary is learned from ALL tweets before the
# train/test split below, so it also contains terms that occur only in test
# tweets. Consider fitting on the training portion only.
Xc_vec = co.fit(df['tweet'])
Xc = Xc_vec.transform(df['tweet'])

# Rows are tweets, columns are vocabulary terms.
print(Xc.shape)

### Split Data

In [None]:
# Splitting of data into training and test data (80% / 20%, fixed seed).
# stratify=y was missing here: without it the hate-speech share could differ
# between the two parts and the split would not match the stratified TF-IDF
# split, making the two feature sets harder to compare.
(Xc_train, Xc_test, yc_train, yc_test) = ms.train_test_split(Xc, y, test_size=.2, random_state=17, stratify=y)

### Train Models

In [None]:
# Multi-variate Bernoulli Naive Bayes on the count features: grid search
# (default cross-validation and scoring) over 50 log-spaced values of the
# smoothing parameter alpha between 1e-2 and 1e2.
bnbc = ms.GridSearchCV(
    estimator=nb.BernoulliNB(),
    param_grid={'alpha': np.logspace(-2., 2., num=50)},
)
bnbc.fit(Xc_train, yc_train);

# Multinomial Naive Bayes, tuned over the same alpha grid.
mnbc = ms.GridSearchCV(
    estimator=nb.MultinomialNB(),
    param_grid={'alpha': np.logspace(-2., 2., num=50)},
)
mnbc.fit(Xc_train, yc_train);

### Evaluate Model Performance based on Count Vectorizer

In [None]:
# Test-set scores of the tuned Bernoulli model on the count features.
print('Scores with the multi-variate Bernoulli Naive Bayes:')
validate(yc_test, bnbc.predict(Xc_test))

In [None]:
# Test-set scores of the tuned Multinomial model on the count features.
# The heading previously announced the Bernoulli model although the scores
# come from mnbc, the Multinomial Naive Bayes.
print('Scores with the Multinomial Naive Bayes:')
validate(yc_test, mnbc.predict(Xc_test))

In [None]:
# Confusion matrix of the tuned Multinomial model (count features) on the
# test set: true labels against predicted labels.
cm = confusion_matrix(yc_test, mnbc.predict(Xc_test))

# Draw the matrix with mlxtend's helper and show the figure.
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title("Confusion Matrix")
plt.show()

# Analyze frequent words

In [None]:
# Disabled, kept as a bare string literal so none of it executes: it would
# convert the sparse count matrix Xc to a dense DataFrame (toarray), sum the
# counts per word and list the ten most frequent words.
'''# Get most frequent words including stop words
word_freq_df = pd.DataFrame(Xc.toarray(), columns=co.get_feature_names_out())
top_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)
top_words_df.head(10)'''

## Get words with biggest impact on each category

In [None]:
# Rebinds mnbc: the tuned GridSearchCV object from above is replaced by a
# plain MultinomialNB with default hyperparameters, fitted on the count
# features, so that feature_log_prob_ can be read directly from the estimator.
mnbc = nb.MultinomialNB()
mnbc.fit(Xc_train, yc_train);

In [None]:
# Get the ten words with the highest per-class log-probability, i.e. the
# terms the model considers most likely within each class.
# The former bare `mnbc.feature_log_prob_` / `mnbc.coef_` expressions were
# removed: they had no effect in the middle of a cell, and
# MultinomialNB.coef_ no longer exists in scikit-learn >= 1.1, where it made
# this cell fail with AttributeError.

feature_names = co.get_feature_names_out()
# NOTE(review): the label order assumes mnbc.classes_ is [0, 1] — confirm.
for i, class_label in enumerate(['no_hate', 'hate']):
    # argsort is ascending, so the last ten indices are the ten largest values
    # (printed from least to most probable).
    top10 = np.argsort(mnbc.feature_log_prob_[i])[-10:]
    print("%s: %s" % (class_label,
          " ".join(feature_names[j] for j in top10)))

# Upsample

In [None]:
# Train vectorizer BEFORE upsampling, so duplicated minority tweets do not
# distort the vocabulary statistics.
# NOTE(review): both vectorizers are fitted on ALL tweets, i.e. also before
# the train/test split below, so they see the test tweets too. Consider
# fitting on the training portion only.
tf = TfidfVectorizer(stop_words='english')
X_tf_vec = tf.fit(df['tweet'])
#X_tf_vec = tf.fit(df['no_user'])

co = CountVectorizer(stop_words='english')
X_co_vec = co.fit(df['tweet'])
#X_co_vec = co.fit(df['no_user'])

In [None]:
# Prepare split data. Note that X is rebound here to the raw tweet text
# (a Series), no longer the TF-IDF matrix from the earlier section.
y = df['label']
X = df['tweet']
#X = df['no_user']

In [None]:
# Split data BEFORE upsampling so that only training rows get duplicated and
# the test set keeps its original class distribution.
(X_train, X_test, y_train, y_test) = ms.train_test_split(X, y, test_size=0.2, random_state = 17, stratify=y)

# Re-join labels and text column-wise into one frame per part
# (columns: 'label', 'tweet').
df_train = pd.concat([y_train,X_train], axis=1)
df_test = pd.concat([y_test,X_test], axis = 1)
df_train.head()

In [None]:
# Separate the training rows by class: label 1 is treated as the minority
# class, label 0 as the majority class.
data_minority = df_train[df_train.label == 1]
data_majority = df_train[df_train.label == 0]

# Show how unbalanced the two classes are.
print("length majority", len(data_majority))
print("length minority", len(data_minority))

In [None]:
from sklearn.utils import resample

# Upsample the minority class by sampling with replacement until it has as
# many rows as the majority class. The target size is taken from the data
# instead of the former hard-coded 23775, so the classes stay balanced if the
# dataset or the split parameters change.
data_minority = resample(data_minority, replace=True, n_samples=len(data_majority), random_state=55)

In [None]:
# Stack the majority rows and the upsampled minority rows into the new
# training frame, then show the resulting number of rows per class.
df_train_up = pd.concat([data_majority, data_minority])
df_train_up.label.value_counts()

In [None]:
# Disabled variant, kept as a bare string literal so none of it executes:
# the same feature construction as the next cell, but reading the 'no_user'
# column instead of 'tweet'.
'''# create embeddings

# tfifd
Xt_train = X_tf_vec.transform(df_train_up['no_user'])
Xt_test = X_tf_vec.transform(df_test['no_user'])

# count vectorizer
Xc_train = X_co_vec.transform(df_train_up['no_user'])
Xc_test = X_co_vec.transform(df_test['no_user'])

# labels
y_train = df_train_up['label']
y_test = df_test['label']

print(Xt_train.shape) '''

In [None]:
# Create the feature matrices by transforming the upsampled training tweets
# and the untouched test tweets with the already fitted vectorizers.

# TF-IDF
Xt_train = X_tf_vec.transform(df_train_up['tweet'])
Xt_test = X_tf_vec.transform(df_test['tweet'])

# Count vectorizer (rebinds Xc_train / Xc_test from the earlier section)
Xc_train = X_co_vec.transform(df_train_up['tweet'])
Xc_test = X_co_vec.transform(df_test['tweet'])

# Labels, taken from the same frames so they stay aligned with the rows above
y_train = df_train_up['label']
y_test = df_test['label']

print(Xt_train.shape) 

In [None]:
# Check the class balance after upsampling: print the size of each part and
# the percentage of rows labelled 1 (hate speech). Only the training part was
# upsampled; the test part was not resampled.
print('There is {} training data, of which {}% is hate speech '.format(df_train_up['label'].count(), round(df_train_up['label'].sum()/df_train_up['label'].count()*100,2)))
print('There is {} test data, of which {}% is hate speech '.format(df_test['label'].count(), round(df_test['label'].sum()/df_test['label'].count()*100,2)))

In [None]:
# TF-IDF features, upsampled training data.
# Multi-variate Bernoulli Naive Bayes: grid search (default cross-validation
# and scoring) over 50 log-spaced values of alpha between 1e-2 and 1e2.
bnb = ms.GridSearchCV(
    estimator=nb.BernoulliNB(),
    param_grid={'alpha': np.logspace(-2., 2., num=50)},
)
bnb.fit(Xt_train, y_train);

# Multinomial Naive Bayes, tuned over the same alpha grid.
mnb = ms.GridSearchCV(
    estimator=nb.MultinomialNB(),
    param_grid={'alpha': np.logspace(-2., 2., num=50)},
)
mnb.fit(Xt_train, y_train);

In [None]:
# Test-set scores of both tuned models trained on upsampled TF-IDF features.
print('TF-IDF Scores with the multi-variate Bernoulli Naive Bayes:')
validate(y_test, bnb.predict(Xt_test))
# Heading corrected from "Multinominal Bernoulli": these are the scores of
# the Multinomial Naive Bayes (mnb).
print('TF-IDF Scores with the Multinomial Naive Bayes:')
validate(y_test, mnb.predict(Xt_test))

In [None]:
# Count-vectorizer features, upsampled training data.
# Multi-variate Bernoulli Naive Bayes: grid search (default cross-validation
# and scoring) over 50 log-spaced values of alpha between 1e-2 and 1e2.
cbnb = ms.GridSearchCV(
    estimator=nb.BernoulliNB(),
    param_grid={'alpha': np.logspace(-2., 2., num=50)},
)
cbnb.fit(Xc_train, y_train);

# Multinomial Naive Bayes, tuned over the same alpha grid.
cmnb = ms.GridSearchCV(
    estimator=nb.MultinomialNB(),
    param_grid={'alpha': np.logspace(-2., 2., num=50)},
)
cmnb.fit(Xc_train, y_train);


In [None]:
# Test-set scores of both tuned models trained on upsampled count features.
print('Count Vectorizer Scores with the multi-variate Bernoulli Naive Bayes:')
validate(y_test, cbnb.predict(Xc_test))
# Heading corrected from "Multinominal Bernoulli": these are the scores of
# the Multinomial Naive Bayes (cmnb).
print('Count Vectorizer Scores with the Multinomial Naive Bayes:')
validate(y_test, cmnb.predict(Xc_test))

In [None]:
# Confusion matrix of the tuned Multinomial model trained on the upsampled
# count features, evaluated on the test set.
cm = confusion_matrix(y_test, cmnb.predict(Xc_test))

# Draw the matrix with mlxtend's helper and show the figure.
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title("Confusion Matrix")
plt.show()