In [16]:
## fix dataset first 

In [17]:
import pandas as pd

In [18]:
df = pd.read_csv('..\\preprocessed\\tweets_general.csv')
df.dropna(inplace=True)
df.head(2)

Unnamed: 0,tweet,tokens,label
0,woman shouldnt complain cleaning house man alw...,"['woman', 'shouldnt', 'complain', 'cleaning', ...",0
1,boy dat coldtyga dwn bad cuffin dat hoe st place,"['boy', 'dat', 'coldtyga', 'dwn', 'bad', 'cuff...",0


### data preparation

In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

tweets = df['tweet'].tolist()
labels = df['label'].tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets) ## tokenizer fit to our vocabulary

tweet_sequences = tokenizer.texts_to_sequences(tweets)

max_sequence_length = 55  ## padding to ensure uniformity in length of the sequences
tweet_padded = pad_sequences(tweet_sequences, maxlen=max_sequence_length, padding='post')
print(tweet_padded[:2])
labels = np.array(labels)

[[  98  845  936 2989  202   35   83   68   15    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]
 [  95   75 8400 5937   34 2077   75    2  447  379    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]]


In [37]:
## the cnn model 

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

embedding_dim = 55 
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

In [38]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(tweet_padded, labels, test_size=0.2, random_state=42)

# Train the model
epochs = 3 # Adjust as needed
batch_size = 32  # Adjust based on your hardware capabilities

model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val))


Epoch 1/3
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.9466 - loss: 0.2176 - val_accuracy: 0.9435 - val_loss: 0.1537
Epoch 2/3
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9523 - loss: 0.1240 - val_accuracy: 0.9407 - val_loss: 0.1668
Epoch 3/3
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.9745 - loss: 0.0675 - val_accuracy: 0.9371 - val_loss: 0.2346


<keras.src.callbacks.history.History at 0x1e97304c1d0>

In [43]:
model.summary()

Trying the test on a balanced dataset

In [40]:
from sklearn.utils import resample

df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority),  # to match minority class
                                   random_state=42)  # reproducible results

df_balanced = pd.concat([df_minority, df_majority_downsampled])
df_balanced.dropna(inplace=True)
print(df_balanced['label'].value_counts())

tweets_balanced = df_balanced['tweet'].tolist()
labels_balanced = df_balanced['label'].tolist()

tweet_sequences_balanced = tokenizer.texts_to_sequences(tweets_balanced)

tweet_padded_balanced = pad_sequences(tweet_sequences_balanced, maxlen=max_sequence_length, padding='post')
print(tweet_padded_balanced[:2])
labels_balanced = np.array(labels_balanced)


label
1    1430
0    1430
Name: count, dtype: int64
[[ 302 8429    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]
 [8431  130  732  144   64  130  783    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]]


In [41]:
## new model for balanced df

model_balanced = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_balanced.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_balanced.summary()



In [44]:
## train cnn on balanced df and evaluate

X_train_balanced, X_val_balanced, y_train_balanced, y_val_balanced = train_test_split(tweet_padded_balanced, labels_balanced, test_size=0.2, random_state=42)

model_balanced.fit(X_train_balanced, y_train_balanced, epochs=7, batch_size=batch_size, validation_data=(X_val_balanced, y_val_balanced))


Epoch 1/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9774 - loss: 0.0814 - val_accuracy: 0.7657 - val_loss: 0.6894
Epoch 2/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9913 - loss: 0.0413 - val_accuracy: 0.7710 - val_loss: 0.7427
Epoch 3/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9948 - loss: 0.0173 - val_accuracy: 0.7657 - val_loss: 0.8659
Epoch 4/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9963 - loss: 0.0134 - val_accuracy: 0.7587 - val_loss: 0.9544
Epoch 5/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9983 - loss: 0.0066 - val_accuracy: 0.7552 - val_loss: 1.0099
Epoch 6/7
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9993 - loss: 0.0045 - val_accuracy: 0.7517 - val_loss: 1.1461
Epoch 7/7
[1m72/72[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1e907d3c590>

We can see from their outputs that:

-- The model performs well initially on validation on the **entire dataset** but starts to overfit as the  validation loss is increasing despite high training accuracy.

__ might need regularization, in progress.

-- The model's performance on the **balanced dataset** increases for one epoch thens starts decreasing. it's overfitting and needs regularization/ fine tuning.