This notebook aims to train and validate the softmax classifier as to compare its performance to that of an LSTM neural network for hate speech classification

# Import Libraries

In [50]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import tensorflow
from tensorflow import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer

from utilities.data_preprocessors import read_preprocess, series_to_1D_array, construct_embedding_dict, construct_embedding_matrix, sentences_to_avgs
from utilities.data_visualizers import train_cross_results_v2, view_final_metrics
from models.softmax_regression import SoftmaxRegression

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [51]:
# 1 for religious and 0 for non religious
df = pd.read_csv('./data/hate-speech-data-cleaned.csv', index_col=0)
df = read_preprocess(df)

all_words = pd.Series(series_to_1D_array(df['comment']))
all_unique_words_counts = all_words.value_counts()
all_unique_words = all_words.unique()


In [52]:
df['label'].value_counts()

2    22395
1    21644
0    19743
3     1998
Name: label, dtype: int64

# Preprocess data

In [53]:
# rejoin the comment columns values of lists of words to sentences
df['comment'] = df['comment'].apply(lambda comment: " ".join(comment))
df

Unnamed: 0,comment,label
0,woman complain cleaning house man always take ...,1
1,boy dat coldtyga dwn bad cuffin dat hoe st place,0
2,dawg ever fuck bitch start cry confused shit,0
3,look like tranny,0
4,shit hear might true might faker bitch told ya,0
...,...,...
65775,from the midnight sun where the hot spring blow,1
65776,do not say am not your type,1
65777,and therefor never send to know for whom the b...,1
65778,and cannot stand anoth day,1


In [54]:
# at the same time one hot encode the y labels/classes
len_unique_labels = len(df['label'].unique())
Y_oh = one_hot(df['label'], len_unique_labels, dtype=tf.float64).numpy()
Y_oh

NameError: name 'tf' is not defined

# Retrieving and assigning important variables for training classifier

In [None]:
sents = df['comment']

# get number of all unique words
num_words_3 = len(all_unique_words)

# instantiate Tokenizer on the total number of all unique words
tokenizer = Tokenizer(num_words=num_words_3, split=' ')

# call .fit_on_texts to create the word_index and index_word dicts
tokenizer.fit_on_texts(sents)

# save the tokenizer dictionaries for use later when loading GloVe embeddings
word_to_index = tokenizer.word_index
index_to_word = tokenizer.index_word
print(len(word_to_index))
# print(word_to_index)

47916


In [None]:
# important variables

# includes oov words
emb_dict, emb_vec_len = construct_embedding_dict('./embeddings/glove.42B.300d.txt', word_to_index)
emb_matrix = construct_embedding_matrix(word_to_index, emb_dict, emb_vec_len)

100%|██████████| 47916/47916 [00:00<00:00, 273974.50it/s]


# Transform all sentences to word vectors

In [None]:
vect_sents = sentences_to_avgs(sents, emb_dict)

In [None]:
vect_sents

array([[ 0.00396263,  0.36776125, -0.04045763, ..., -0.06078575,
         0.09397175, -0.05599563],
       [ 0.29468613,  0.39194278,  0.03957122, ..., -0.03101056,
         0.32691778,  0.10542444],
       [-0.30736394,  0.26374987, -0.06718987, ...,  0.0969691 ,
         0.08005263,  0.15465625],
       ...,
       [-0.152688  , -0.13310289, -0.10236191, ..., -0.16498182,
        -0.04798355,  0.04919357],
       [-0.0755652 ,  0.0974086 ,  0.1262868 , ..., -0.073394  ,
        -0.1070248 ,  0.246028  ],
       [ 0.009189  ,  0.167159  ,  0.03203911, ..., -0.15171311,
         0.11298801, -0.13568356]])

In [None]:
vect_sents.shape

(65780, 300)

In [None]:
Y_oh.shape

(65780, 4)

In [None]:
# split data into training, validation, adn testing
train_seqs, _seqs, train_labels, _labels = train_test_split(vect_sents, Y_oh, test_size=0.3, random_state=0)
val_seqs, test_seqs, val_labels, test_labels = train_test_split(_seqs, _labels, test_size=0.3, random_state=0)

In [None]:
Y_oh.dtype

dtype('float32')

In [None]:
vect_sents.dtype

dtype('float64')

In [None]:
model = SoftmaxRegression(train_seqs.T, train_labels.T)
model.train()

ValueError: initial_value: Tensor conversion requested dtype float64 for Tensor with dtype float32: <tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[0.],
       [0.],
       [0.],
       [0.]], dtype=float32)>

In [None]:

# # predict probabilities for test set
# yhat_probs = model.predict(testX, verbose=0)
# # predict crisp classes for test set
# yhat_classes = model.predict_classes(testX, verbose=0)

# # reduce to 1d array
# yhat_probs = yhat_probs[:, 0]
# yhat_classes = yhat_classes[:, 0]

# # accuracy: (tp + tn) / (p + n)
# accuracy = accuracy_score(testy, yhat_classes)
# print('Accuracy: %f' % accuracy)
# # precision tp / (tp + fp)
# precision = precision_score(testy, yhat_classes)
# print('Precision: %f' % precision)
# # recall: tp / (tp + fn)
# recall = recall_score(testy, yhat_classes)
# print('Recall: %f' % recall)
# # f1: 2 tp / (2 tp + fp + fn)
# f1 = f1_score(testy, yhat_classes)
# print('F1 score: %f' % f1)

# matrix = confusion_matrix(testy, yhat_classes)
# print(matrix)