In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
nltk.download('stopwords')

import nltk
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
train_df = pd.read_csv('/content/train_E6oV3lV.csv')

X_train, X_val, y_train, y_val = train_test_split(train_df['tweet'], train_df['label'], test_size=0.2, random_state=42)

In [4]:
# Tokenization
stop_words = set(stopwords.words('english'))
def tokenize_tweet(tweet):
    tokens = word_tokenize(tweet.lower())
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)


In [5]:
X_train_tokenized = X_train.apply(tokenize_tweet)
X_val_tokenized = X_val.apply(tokenize_tweet)

In [6]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train_tokenized)
X_val_tfidf = vectorizer.transform(X_val_tokenized)

In [7]:
max_length = 200
# Reshape the input data to 3D before padding:
X_train_tfidf_3d = X_train_tfidf.toarray().reshape(X_train_tfidf.shape[0], X_train_tfidf.shape[1], 1)
X_val_tfidf_3d = X_val_tfidf.toarray().reshape(X_val_tfidf.shape[0], X_val_tfidf.shape[1], 1)

In [9]:
# Pad the 3D data, specifying the correct axis:
X_train_padded = pad_sequences(X_train_tfidf_3d, maxlen=max_length, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_tfidf_3d, maxlen=max_length, padding='post', truncating='post')

In [10]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(Bidirectional(LSTM(64, dropout=0.2)))
model.add(Dense(8, activation='softmax'))

In [11]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [12]:
model.fit(X_train_padded, y_train, validation_data=(X_val_padded, y_val), epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x79ec4a2912a0>

In [13]:
test_df = pd.read_csv('/content/test_tweets_anuFYb8.csv')
X_test_tokenized = test_df['tweet'].apply(tokenize_tweet)
X_test_tfidf = vectorizer.transform(X_test_tokenized)
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_tfidf.toarray(), maxlen=max_length)

In [31]:
y_pred_test = model.predict(X_val_padded)



In [22]:
X_val_padded = tf.keras.preprocessing.sequence.pad_sequences(X_val_tfidf.toarray(), maxlen=max_length)

In [34]:
print("Validation metrics:")
print("Accuracy:", accuracy_score(y_val, y_pred_test.argmax(-1)))

Validation metrics:
Accuracy: 0.9286719849835758


In [33]:
print("Classification Report:")
print(classification_report(y_val, y_pred_test.argmax(-1)))

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      5937
           1       0.00      0.00      0.00       456

    accuracy                           0.93      6393
   macro avg       0.46      0.50      0.48      6393
weighted avg       0.86      0.93      0.89      6393



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_test.argmax(-1)))

Confusion Matrix:
[[5937    0]
 [ 456    0]]
