In [15]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [3]:
train_df = pd.read_csv('train_E6oV3lV.csv')                       

In [4]:
X_train, X_val, y_train, y_val = train_test_split(train_df['tweet'], train_df['label'], test_size=0.2, random_state=42)

stop_words = set(stopwords.words('english'))

def tokenize_tweet(tweet):
    tokens = word_tokenize(tweet.lower())
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [5]:
X_train_tokenized = X_train.apply(tokenize_tweet)
X_val_tokenized = X_val.apply(tokenize_tweet)

In [6]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train_tokenized)
X_val_tfidf = vectorizer.fit_transform(X_val_tokenized)

In [7]:
max_length = 200
X_train_sequences = X_train_tfidf.toarray()
X_val_sequences = X_val_tfidf.toarray()

In [8]:
model1 = Sequential()
model1.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model1.add(LSTM(64, dropout=0.2))
model1.add(Dense(8, activation='softmax'))



In [10]:
model1.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [11]:
model1.fit(X_train_tfidf, y_train, epochs=5, batch_size=32, validation_data=(X_val_tfidf, y_val))

Epoch 1/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2804s[0m 4s/step - accuracy: 0.9217 - loss: 0.3743 - val_accuracy: 0.9287 - val_loss: 0.2583
Epoch 2/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35453s[0m 44s/step - accuracy: 0.9309 - loss: 0.2528 - val_accuracy: 0.9287 - val_loss: 0.2637
Epoch 3/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11179s[0m 14s/step - accuracy: 0.9315 - loss: 0.2512 - val_accuracy: 0.9287 - val_loss: 0.2600
Epoch 4/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3538s[0m 4s/step - accuracy: 0.9302 - loss: 0.2543 - val_accuracy: 0.9287 - val_loss: 0.2592
Epoch 5/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9236s[0m 12s/step - accuracy: 0.9289 - loss: 0.2579 - val_accuracy: 0.9287 - val_loss: 0.2587


<keras.src.callbacks.history.History at 0x25156a040d0>

In [16]:
test_df = pd.read_csv('test_tweets_anuFYb8.csv')

In [17]:
X_test_tokenized = test_df['tweet'].apply(tokenize_tweet)
X_test_tfidf = vectorizer.transform(X_test_tokenized)

In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [27]:
y_pred_test = model1.predict(X_val_tfidf)

[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 414ms/step


In [28]:
print("Validation metrics:")
print("Accuracy:", accuracy_score(y_val, y_pred_val.argmax(-1)))

Validation metrics:
Accuracy: 0.9286719849835758


In [29]:
print("Classification Report:")
print(classification_report(y_val, y_pred_val.argmax(-1)))

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      5937
           1       0.00      0.00      0.00       456

    accuracy                           0.93      6393
   macro avg       0.46      0.50      0.48      6393
weighted avg       0.86      0.93      0.89      6393



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_val.argmax(-1)))

Confusion Matrix:
[[5937    0]
 [ 456    0]]


In [31]:
submission_df = pd.DataFrame({'label': y_pred_test.argmax(-1)})
submission_df.to_csv('submission_lstm.csv', index=False)