In [1]:
!pip install gensim



In [2]:
import gensim.downloader
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset


model_wiki = gensim.downloader.load('glove-wiki-gigaword-50')
dataset = load_dataset("mediabiasgroup/mbib-base", "hate-speech")
df = dataset['train'].to_pandas()
df = df.sample(frac=0.2).reset_index(drop=True)

Found cached dataset mbib-base (/Users/antoniocastaldo/.cache/huggingface/datasets/mediabiasgroup___mbib-base/hate-speech/1.0.0/cf6f80c612f1363f2162f92f58e1113915a6b01aa07680513a18b7d94570e875)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import string
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [4]:
def basic_cleaning(sentence):
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    removed_stopwords = ' '.join([word for word in sentence.split() if word not in stopwords])
    
    for punctuation in string.punctuation:
        sentence = removed_stopwords.replace(punctuation, '') 
    return sentence

In [5]:
df['text'] = df['text'].apply(basic_cleaning)

In [6]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])

    return np.array(embedded_sentence)


# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []

    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)

    return embed

from sklearn.model_selection import train_test_split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Embed the training and test sentences
X_train_embed_2 = embedding(model_wiki, X_train)
X_test_embed_2 = embedding(model_wiki, X_test)

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers
from tensorflow.keras.callbacks import EarlyStopping

In [8]:

def initialize_model():
    model = Sequential()
    model.add(layers.LSTM(20))
    model.add(layers.Dense(10, activation='tahn'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [9]:
print(f"👉 Starting padding X_train_pad_2 ")
X_train_pad_2 = pad_sequences(X_train_embed_2,
                              dtype='float32',
                              padding='post',
                              maxlen=200)
print(f"✅ Completed padding X_train_pad_2 ")
print(f"👉 Starting padding X_test_pad_2 ")
X_test_pad_2 = pad_sequences(X_test_embed_2,
                             dtype='float32',
                             padding='post',
                             maxlen=200)
print(f"✅ Completed padding X_test_pad_2 ")
print("")
print("👉 Starting training...")


👉 Starting padding X_train_pad_2 
✅ Completed padding X_train_pad_2 
👉 Starting padding X_test_pad_2 
✅ Completed padding X_test_pad_2 

👉 Starting training...
Epoch 1/50


2023-06-06 12:57:11.302834: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
✅ Completed training


In [None]:
model = initialize_model()

model.fit(X_train_pad_2,
          y_train,
          epochs=50,
          batch_size=32,
          verbose=1,
          validation_split=0.2,
          callbacks=[EarlyStopping(patience=4, restore_best_weights=True)])
print("✅ Completed training")

In [11]:
res = model.evaluate(X_test_pad_2, y_test, verbose=0)
from sklearn.dummy import DummyClassifier
baseline_model = DummyClassifier(strategy='most_frequent')
baseline_model.fit(X_train_pad_2, y_train)
baseline_model.score(X_test_pad_2, y_test)
print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')
print(f'The accuracy of the baseline model is of {baseline_model.score(X_test_pad_2, y_test)*100:.3f}%')

The accuracy evaluated on the test set is of 81.748%
The accuracy of the baseline model is of 49.978%


In [17]:
print('The NN accuracy is {:.2f}% higher than the baseline model'.format((res[1] - baseline_model.score(X_test_pad_2, y_test))*100))

The NN accuracy is 31.77% higher than the baseline model


In [18]:
print('Classification report:')
y_pred = model.predict(X_test_pad_2)
print(classification_report(y_test, y_pred.round()))

Classification report:
              precision    recall  f1-score   support

           0       0.79      0.87      0.83     33886
           1       0.85      0.77      0.81     33916

    accuracy                           0.82     67802
   macro avg       0.82      0.82      0.82     67802
weighted avg       0.82      0.82      0.82     67802



In [14]:
# class balance
model.save('model.h5')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 20)                5680      
                                                                 
 dense (Dense)               (None, 10)                210       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,901
Trainable params: 5,901
Non-trainable params: 0
_________________________________________________________________
