In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset CSV; the text is read from the 'Content' column and the
# target from the 'Label' column.
df = pd.read_csv("../data/mendeley_data/HateSpeechDatasetBalanced.csv")
tweets = df['Content'].values
labels = df['Label'].values

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2, random_state=42)

# Load the previously saved BiLSTM model
model_bilstm = load_model('bilstm_model.h5')

# Freeze every layer except the last one. The last layer's own `trainable` flag is
# not modified here, so it keeps whatever value the loaded model carries.
for layer in model_bilstm.layers[:-1]:
    layer.trainable = False




In [3]:
# Convert the raw text into fixed-length integer sequences for the BiLSTM.
# NOTE(review): the tokenizer is re-fit here on all rows (train and test together);
# presumably its word index has to match the one used when bilstm_model.h5 was
# originally trained — confirm before relying on the fine-tuned results.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(tweets)

# Encode the train split first, then the test split, with identical padding settings:
# every sequence is padded or truncated at its end to exactly 100 tokens.
X_train_seq, X_test_seq = (
    pad_sequences(
        tokenizer.texts_to_sequences(split_texts),
        maxlen=100,
        padding='post',
        truncating='post',
    )
    for split_texts in (X_train, X_test)
)


In [5]:
# Compile with a small learning rate (1e-5) for fine-tuning
model_bilstm.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Fine-tune the BiLSTM model; 20% of the training rows are held out for validation.
# NOTE(review): in the recorded run val_loss rises from 0.3209 to 0.3314 over the first
# five epochs while val_accuracy stays near 0.8637 — consider fewer epochs or early stopping.
history_bilstm_fine_tune = model_bilstm.fit(X_train_seq, y_train, epochs=10, batch_size=128, validation_split=0.2)

# Save the fine-tuned BiLSTM model. The path lives in one variable so the confirmation
# message always names the file that was actually written (the previous message
# reported 'bilstm_model_fine_tuned.h5', which is not the file this cell saves).
bilstm_fine_tuned_path = 'bilstm_model_fine_tuned_last1_layer.h5'
model_bilstm.save(bilstm_fine_tuned_path)
print(f'Fine-tuned BiLSTM model saved as {bilstm_fine_tuned_path}')

Epoch 1/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m637s[0m 174ms/step - accuracy: 0.9100 - loss: 0.2148 - val_accuracy: 0.8638 - val_loss: 0.3209
Epoch 2/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m726s[0m 200ms/step - accuracy: 0.9107 - loss: 0.2128 - val_accuracy: 0.8637 - val_loss: 0.3250
Epoch 3/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m913s[0m 251ms/step - accuracy: 0.9116 - loss: 0.2108 - val_accuracy: 0.8636 - val_loss: 0.3279
Epoch 4/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m848s[0m 231ms/step - accuracy: 0.9115 - loss: 0.2110 - val_accuracy: 0.8637 - val_loss: 0.3300
Epoch 5/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m797s[0m 220ms/step - accuracy: 0.9118 - loss: 0.2092 - val_accuracy: 0.8637 - val_loss: 0.3314
Epoch 6/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m882s[0m 243ms/step - accuracy: 0.9114 - loss: 0.2101 - val_accuracy: 0.8637 - val_loss:



Fine-tuned BiLSTM model saved as bilstm_model_fine_tuned.h5


In [4]:
# Re-read the text and label arrays from the DataFrame for the TF-IDF model.
tweets = df['Content'].values
labels = df['Label'].values

# TF-IDF vectorization and the train/test split are done in the cells that follow.


In [6]:
# Fit a TF-IDF vectorizer capped at 2000 features on all rows and transform them in one call.
# NOTE(review): this fits on train and test text together, before the split, and the
# resulting features presumably need to match those tfidf_model.h5 was trained on — confirm.
vectorizer = TfidfVectorizer(max_features=2000) 
X_tfidf = vectorizer.fit_transform(tweets)


In [7]:
# Split the TF-IDF matrix into train and test parts (20% test, fixed seed).
# Note that this rebinds y_train and y_test.
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf,
    labels,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Load the previously saved TF-IDF model and print its layer summary
# so the architecture can be inspected before any layers are frozen.
model_tfidf = load_model('tfidf_model.h5')
model_tfidf.summary()



In [11]:
# Freeze every layer except the last one. The last layer's own `trainable` flag is
# not modified here, so it keeps whatever value the loaded model carries.
for layer in model_tfidf.layers[:-1]:
    layer.trainable = False

In [12]:
# Compile with a small learning rate (1e-5) for fine-tuning
model_tfidf.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [13]:
# Fine-tune the TF-IDF model; 20% of the training rows are held out for validation.
# NOTE(review): the recorded run starts at loss ~0.91 with accuracy ~0.69 — worth
# checking that the TF-IDF features fed here match those the saved model was trained on.
history_tfidf_fine_tune = model_tfidf.fit(
    X_train_tfidf,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)


Epoch 1/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 13ms/step - accuracy: 0.6935 - loss: 0.9133 - val_accuracy: 0.6947 - val_loss: 0.6968
Epoch 2/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 14ms/step - accuracy: 0.6902 - loss: 0.6886 - val_accuracy: 0.6849 - val_loss: 0.5947
Epoch 3/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 12ms/step - accuracy: 0.6811 - loss: 0.6111 - val_accuracy: 0.6816 - val_loss: 0.5832
Epoch 4/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 15ms/step - accuracy: 0.6788 - loss: 0.6013 - val_accuracy: 0.6841 - val_loss: 0.5790
Epoch 5/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 13ms/step - accuracy: 0.6824 - loss: 0.5943 - val_accuracy: 0.6868 - val_loss: 0.5762
Epoch 6/10
[1m3631/3631[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 13ms/step - accuracy: 0.6838 - loss: 0.5905 - val_accuracy: 0.6899 - val_loss: 0.5740
Epoc

In [14]:
# Save the fine-tuned TF-IDF model. The path lives in one variable so the confirmation
# message always names the file that was actually written (the previous message
# reported 'tfidf_model_fine_tuned.h5', which is not the file this cell saves).
tfidf_fine_tuned_path = 'tfidf_model_fine_tuned_last1_layer.h5'
model_tfidf.save(tfidf_fine_tuned_path)
print(f'Fine-tuned TF-IDF model saved as {tfidf_fine_tuned_path}')



Fine-tuned TF-IDF model saved as tfidf_model_fine_tuned.h5


In [15]:
# Print the layer summary again, now that all but the last layer have been frozen
# and the model has been fine-tuned.
model_tfidf.summary()