In [2]:
import pandas as pd

In [6]:
df = pd.read_csv("../../data/mendeley/HateSpeechDatasetBalanced.csv")
df.columns = df.columns.str.lower()

df.head()

Unnamed: 0,content,label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1


Vectorize text data

In [7]:
## libraries to be used

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [8]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['label'], test_size=0.2, random_state=42)

using both count vectorizer and tfidf to analyze performance

In [14]:
## check vocab size for reference, to help determine max features value for vectorizers 

vocab_file = '..\\..\\custom_bert_tokenizer_mendeley\\vocab.txt'
with open(vocab_file, 'r', encoding='utf-8') as f:
    vocab_size = sum(1 for _ in f)

max_features = min(vocab_size + 1000, 60000)  #  allow some extra room for unseen data
print(max_features, vocab_size)

51000 50000


In [15]:
# using the CountVectorizer
count_vectorizer = CountVectorizer(max_features=max_features)  
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)


In [16]:
# using the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


Train the model

In [17]:
## train log regression model on counts vectorized dataset
model_count = LogisticRegression()
model_count.fit(X_train_counts, y_train)

y_pred_counts = model_count.predict(X_test_counts)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# train the log regression model on tfidf vectorized dataset
model_tfidf = LogisticRegression()
model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)


Evaluate the models

In [19]:
## evaluate model on count vectorizer 

# Calculate metrics
accuracy_counts = accuracy_score(y_test, y_pred_counts)
precision_counts = precision_score(y_test, y_pred_counts, average='binary')
recall_counts = recall_score(y_test, y_pred_counts, average='binary')
f1_counts = f1_score(y_test, y_pred_counts, average='binary')
roc_auc_counts = roc_auc_score(y_test, y_pred_counts)

# Print metrics
print(f'CountVectorizer - Accuracy: {accuracy_counts}')
print(f'CountVectorizer - Precision: {precision_counts}')
print(f'CountVectorizer - Recall: {recall_counts}')
print(f'CountVectorizer - F1 Score: {f1_counts}')
print(f'CountVectorizer - ROC AUC: {roc_auc_counts}')


CountVectorizer - Accuracy: 0.833395306560899
CountVectorizer - Precision: 0.8316946994935132
CountVectorizer - Recall: 0.8392068979653189
CountVectorizer - F1 Score: 0.8354339117008903
CountVectorizer - ROC AUC: 0.8333494062581754


In [20]:
## evaluaate on tfidf vectorizer 
# Calculate metrics
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
precision_tfidf = precision_score(y_test, y_pred_tfidf, average='binary')
recall_tfidf = recall_score(y_test, y_pred_tfidf, average='binary')
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='binary')
roc_auc_tfidf = roc_auc_score(y_test, y_pred_tfidf)

# Print metrics
print(f'TfidfVectorizer - Accuracy: {accuracy_tfidf}')
print(f'TfidfVectorizer - Precision: {precision_tfidf}')
print(f'TfidfVectorizer - Recall: {recall_tfidf}')
print(f'TfidfVectorizer - F1 Score: {f1_tfidf}')
print(f'TfidfVectorizer - ROC AUC: {roc_auc_tfidf}')


TfidfVectorizer - Accuracy: 0.8379124662590205
TfidfVectorizer - Precision: 0.838142335567544
TfidfVectorizer - Recall: 0.8406963556114292
TfidfVectorizer - F1 Score: 0.8394174028720538
TfidfVectorizer - ROC AUC: 0.8378904789314311


#### Analysis 

TfidfVectorizer is the better choice between the two this task. It achieves higher recall, F1 score, and ROC AUC, which are critical for effectively identifying hate speech tweets while minimizing false positives.

## Dataset effects 

For reference, this was the best result we got from using the davidson dataset (with tfidf Vectorizer, on a customized token vocabulary)
```
TfidfVectorizer - Accuracy: 0.791958041958042
TfidfVectorizer - Precision: 0.7568493150684932
TfidfVectorizer - Recall: 0.8215613382899628
TfidfVectorizer - F1 Score: 0.7878787878787878
TfidfVectorizer - ROC AUC: 0.7936189529733642
```

Our current model perfoms significantly better on this larger dataset.