In [1]:
##aim of this code is to develop a simple LSTM neural network that can identify hate speech

##relevant imports
import tensorflow as tf
from tensorflow import keras
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.layers import LSTM, Dense

Using TensorFlow backend.


In [2]:
from keras.preprocessing.sequence import pad_sequences

In [3]:
from keras.callbacks import LearningRateScheduler

In [4]:
from keras.optimizers import Adam

In [5]:
from IPython.display import clear_output

In [None]:
import spacy
nlp = spacy.load("en_core_web_md")

In [None]:
import pandas as pd

In [None]:
df=pd.read_csv("labeled_data.csv")

In [None]:
#create a list of sentences where each word is represented by a spacy word vector
x =[]
c=0
m= len(df["tweet"])
for t in df["tweet"]:
    x.append(np.asarray([t.vector for t in nlp(t)]))
    c+=1
    print("%.4f" % (c/m * 100), "%")
    clear_output(wait=True)

In [5]:
wordcount = 40

In [None]:
#pads and trunactes the sentences the sentence vectors
# 40 is somewhat aribtrary
for i, s in enumerate(x):
    x[i] = pad_sequences([x[i]],maxlen=wordcount, padding = "pre",dtype='float32', truncating = "post")[0]

In [None]:
x = np.asarray(x)

In [None]:
#The output data is based on the "hate_speech" column of the dataset.
y= list(df["hate_speech"])

In [None]:
#In the original dataset, however, this "hate_speech" column is not binary:
set(df["hate_speech"])
#As explained in the source of the dataset, the value of this column is based on how many CrowdFlower users deemed it to be hate speech

In [None]:
#it will be assumed for the purposes of this project that texts that at least one CF user identified as hate speech are indeed hate speech
#as this is a binary cross entropy task, two categories are  created
for i, v in enumerate(y):
    #not hate speech
    if v == 0:
        y[i] = np.asarray([1,0])
    #hate speech
    elif v>0:
        y[i] = np.asarray([0,1])

In [None]:
#converts y from a list into a numpy array
y = np.asarray(y)

In [None]:
#splits the data into traning and testing data
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.33, random_state = 42)

In [None]:
#saves the vectorised sentences
np.save("HateSpeechTestingData.npy", X_test)

In [6]:
#for loading the training data for a new session
X_train = np.load("HateSpeechTrainingData.npy")

In [7]:
y_train = np.load("HatespeechYtrain.npy")

In [16]:
#decreases the learning rate exponentially after 10 epochs
def scheduler(epoch):
    if epoch < 10:
        print(0.00016529890126548707)
        return 0.00016529890126548707
    else:
        lrnrt=  float(0.001 * tf.math.exp(0.1 * (10 - (epoch+ 29))))
        print(lrnrt)
        return lrnrt

In [17]:
learnr = LearningRateScheduler(scheduler)

In [18]:
#the model
model = Sequential()
model.add(LSTM(256, input_shape = (wordcount, 300)))
model.add(Dense(300, activation = 'relu'))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(2, activation = 'softmax'))

In [19]:
optimizer = Adam(lr=0.00016529890126548707)
model.compile(loss = 'binary_crossentropy', optimizer=optimizer)

In [21]:
model.fit(X_train, y_train, batch_size = 50, epochs = 50, callbacks =[learnr])

Epoch 1/50
0.00016529890126548707
Epoch 2/50
0.00016529890126548707
Epoch 3/50
0.00016529890126548707
Epoch 4/50
0.00016529890126548707
Epoch 5/50
0.00016529890126548707
Epoch 6/50
0.00016529890126548707
Epoch 7/50
0.00016529890126548707
Epoch 8/50
0.00016529890126548707
Epoch 9/50
0.00016529890126548707
Epoch 10/50
0.00016529890126548707
Epoch 11/50
5.50232180103194e-05
Epoch 12/50
4.978706783731468e-05
Epoch 13/50
4.504920798353851e-05
Epoch 14/50
4.076220284332521e-05
Epoch 15/50
3.6883167922496796e-05
Epoch 16/50
3.3373267797287554e-05
Epoch 17/50
3.019738505827263e-05
Epoch 18/50
2.7323725589667447e-05
Epoch 19/50
2.4723527531023137e-05
Epoch 20/50
2.2370773876900785e-05
Epoch 21/50
2.0241910533513874e-05
Epoch 22/50
1.831564077292569e-05
Epoch 23/50
1.6572677850490436e-05
Epoch 24/50
1.4995580386312213e-05
Epoch 25/50
1.3568557733378839e-05
Epoch 26/50
1.2277339010324795e-05
Epoch 27/50
1.110899665945908e-05
Epoch 28/50
1.00518363979063e-05
Epoch 29/50
9.09527898329543e-06
Epoch 

<keras.callbacks.callbacks.History at 0x1c9e588ffd0>

In [None]:
#ended on epoch 29
#lr = 0.00016529890126548707

In [24]:
X_test = np.load("HateSpeechTestingData.npy")

In [26]:
#generates predictions for the model
preds = model.predict(X_test)

In [27]:
from sklearn.metrics import classification_report

In [28]:
#converts the prediction of the model into a string
def choice(arr):
    if arr[0] > arr[1]:
        return "not hate"
    else:
        return "hate"

In [29]:
predictions = [choice(x) for x in preds]
actualresults = [choice(x) for x in y_test]

In [25]:
y_test= np.load("HatespeechYtest.npy")

In [30]:
#prints out a classification report for the model
print(classification_report(predictions, actualresults))

              precision    recall  f1-score   support

        hate       0.41      0.49      0.44      1375
    not hate       0.89      0.86      0.87      6804

    accuracy                           0.79      8179
   macro avg       0.65      0.67      0.66      8179
weighted avg       0.81      0.79      0.80      8179



In [None]:
#saves the model weights
model.save_weights("HateSpeechLSTMweights06032.ckpt")

In [20]:
#loads the model weights
model.load_weights("HateSpeechLSTMweights0603.ckpt")