In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import os
import pickle


In [3]:
BASE_DIR = r"D:\AI_PROJECTS\Sentiment Analysis"
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models")
LSTM_DIR = os.path.join(MODELS_DIR, "lstm")
os.makedirs(LSTM_DIR, exist_ok=True)


In [4]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "training.csv"))
valid_df = pd.read_csv(os.path.join(DATA_DIR, "validation.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

train_df.head()


Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [5]:
with open(os.path.join(MODELS_DIR, "label_encoder.pkl"), "rb") as f:
    le = pickle.load(f)

num_classes = len(le.classes_)
print("Labels:", le.classes_)
print("Num Classes:", num_classes)


Labels: [0 1 2 3 4 5]
Num Classes: 6


In [10]:
train_df = pd.read_csv("clean_train.csv")
valid_df = pd.read_csv("clean_valid.csv")
test_df  = pd.read_csv("clean_test.csv")
X_train_text = train_df["clean_text"].astype(str).tolist()
X_valid_text = valid_df["clean_text"].astype(str).tolist()
X_test_text  = test_df["clean_text"].astype(str).tolist()

In [11]:

y_train_lstm = train_df["label_enc"].values
y_valid_lstm = valid_df["label_enc"].values
y_test_lstm  = test_df["label_enc"].values


In [12]:
MAX_WORDS = 50000
MAX_LEN = 120

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

# Convert to sequences
train_seq = tokenizer.texts_to_sequences(X_train_text)
valid_seq = tokenizer.texts_to_sequences(X_valid_text)
test_seq  = tokenizer.texts_to_sequences(X_test_text)

# Pad sequences
X_train_lstm = pad_sequences(train_seq, maxlen=MAX_LEN, padding='post')
X_valid_lstm = pad_sequences(valid_seq, maxlen=MAX_LEN, padding='post')
X_test_lstm  = pad_sequences(test_seq, maxlen=MAX_LEN, padding='post')

# Save tokenizer
with open(os.path.join(LSTM_DIR, "tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)

X_train_lstm.shape, X_valid_lstm.shape, X_test_lstm.shape


((16000, 120), (2000, 120), (2000, 120))

In [13]:
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN),

    Bidirectional(LSTM(128, return_sequences=False)),

    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(num_classes, activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 128)          6400000   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              263168    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 6)                 390       
                                                        

In [14]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    X_train_lstm, y_train_lstm,
    validation_data=(X_valid_lstm, y_valid_lstm),
    epochs=10,
    batch_size=256,
    callbacks=[early_stop]
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [16]:
from sklearn.metrics import accuracy_score, classification_report

pred_valid_lstm = np.argmax(model.predict(X_valid_lstm), axis=1)

print("LSTM Valid Accuracy:", accuracy_score(y_valid_lstm, pred_valid_lstm))

print(classification_report(
    y_valid_lstm, 
    pred_valid_lstm, 
    target_names=[str(c) for c in le.classes_]
))


LSTM Valid Accuracy: 0.8725
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       550
           1       0.90      0.92      0.91       704
           2       0.75      0.70      0.73       178
           3       0.84      0.88      0.86       275
           4       0.85      0.78      0.81       212
           5       0.75      0.60      0.67        81

    accuracy                           0.87      2000
   macro avg       0.83      0.80      0.82      2000
weighted avg       0.87      0.87      0.87      2000



In [17]:
model.save(os.path.join(LSTM_DIR, "lstm_model.h5"))
print("Model saved to:", os.path.join(LSTM_DIR, "lstm_model.h5"))
#!/

Model saved to: D:\AI_PROJECTS\Sentiment Analysis\models\lstm\lstm_model.h5


In [19]:
pred_test_lstm = np.argmax(model.predict(X_test_lstm), axis=1)

print("LSTM Test Accuracy:", accuracy_score(y_test_lstm, pred_test_lstm))

print(classification_report(
    y_test_lstm, 
    pred_test_lstm, 
    target_names=[str(c) for c in le.classes_]
))


LSTM Test Accuracy: 0.8655
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       581
           1       0.89      0.92      0.90       695
           2       0.68      0.68      0.68       159
           3       0.83      0.87      0.85       275
           4       0.88      0.79      0.83       224
           5       0.60      0.50      0.55        66

    accuracy                           0.87      2000
   macro avg       0.80      0.78      0.79      2000
weighted avg       0.86      0.87      0.86      2000

