In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout


In [2]:
# Load the dataset
df = pd.read_pickle('../preprocessed/tweets_bert.pkl')
df.head()  

Unnamed: 0,tweet,label,tokens
0,as a woman you shouldnt complain about cleanin...,0,"[101, 2004, 1037, 2450, 2017, 5807, 2102, 1761..."
1,boy dats coldtyga dwn bad for cuffin dat hoe i...,0,"[101, 2879, 23755, 2015, 3147, 3723, 3654, 104..."
2,dawg you ever fuck a bitch and she sta to cry ...,0,"[101, 4830, 27767, 2017, 2412, 6616, 1037, 774..."
3,she look like a tranny,0,"[101, 2016, 2298, 2066, 1037, 25283, 4890, 102]"
4,the shit you hear about me might be true or it...,0,"[101, 1996, 4485, 2017, 2963, 2055, 2033, 2453..."


In [3]:
# Preprocess data
tweets = df['tweet'].values
labels = df['label'].values

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [4]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()




In [5]:
# Train the model
history = model.fit(X_train, y_train, epochs=4, batch_size=128, validation_split=0.2)


Epoch 1/4
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 285ms/step - accuracy: 0.9111 - loss: 0.2936 - val_accuracy: 0.9483 - val_loss: 0.1979
Epoch 2/4
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 311ms/step - accuracy: 0.9445 - loss: 0.1900 - val_accuracy: 0.9496 - val_loss: 0.1674
Epoch 3/4
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 349ms/step - accuracy: 0.9545 - loss: 0.1264 - val_accuracy: 0.9435 - val_loss: 0.1881
Epoch 4/4
[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 407ms/step - accuracy: 0.9655 - loss: 0.0979 - val_accuracy: 0.9443 - val_loss: 0.2070


In [6]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

y_pred = (model.predict(X_test) > 0.5).astype("int32")


print(classification_report(y_test, y_pred))


[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9324 - loss: 0.2311
Test Loss: 0.21762792766094208
Test Accuracy: 0.9360500574111938
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      4667
           1       0.42      0.24      0.30       290

    accuracy                           0.94      4957
   macro avg       0.69      0.61      0.63      4957
weighted avg       0.92      0.94      0.93      4957



In [7]:
# Save the model
model.save('bilstm_hate_speech_model.h5')




In [8]:
model.summary()