In [7]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [8]:
df = pd.read_csv("cleaned_feedback.csv")

df.head()

Unnamed: 0,feedback_text,Sentiment,cleaned_text
0,I like eat delicious food. That's I'm cooking ...,Positive,like eat delicious food thats im cooking food ...
1,This help eating healthy exercise regular basis,Positive,help eating healthy exercise regular basis
2,Works great especially going grocery store,Positive,work great especially going grocery store
3,Best idea us,Positive,best idea u
4,Best way,Positive,best way


In [9]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['Sentiment'])

# Check mapping
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))


{'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}


In [11]:
X = df['cleaned_text']
y = df['label']

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.astype(str)
X_test = X_test.astype(str)
# Tokenize
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to same length
max_len = 100
X_train_pad = pad_sequences(train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(test_seq, maxlen=max_len, padding='post', truncating='post')

print("✅ Data tokenized and padded successfully!")


✅ Data tokenized and padded successfully!


In [13]:
# Load GloVe embeddings
embeddings_index = {}
with open("glove.6B.100d.txt", encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print("✅ Loaded %s word vectors." % len(embeddings_index))


✅ Loaded 400000 word vectors.


In [15]:
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [16]:
model = Sequential([
    Embedding(input_dim=len(word_index)+1,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()




In [17]:
history = model.fit(
    X_train_pad, 
    y_train, 
    validation_split=0.2,
    epochs=5,
    batch_size=64,
    verbose=1
)


Epoch 1/5
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 112ms/step - accuracy: 0.7342 - loss: 0.6249 - val_accuracy: 0.7964 - val_loss: 0.5011
Epoch 2/5
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 98ms/step - accuracy: 0.7995 - loss: 0.4885 - val_accuracy: 0.8127 - val_loss: 0.4478
Epoch 3/5
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 113ms/step - accuracy: 0.8217 - loss: 0.4434 - val_accuracy: 0.8272 - val_loss: 0.4157
Epoch 4/5
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 129ms/step - accuracy: 0.8430 - loss: 0.3991 - val_accuracy: 0.8404 - val_loss: 0.3958
Epoch 5/5
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 151ms/step - accuracy: 0.8558 - loss: 0.3685 - val_accuracy: 0.8401 - val_loss: 0.3892


In [18]:
y_pred = np.argmax(model.predict(X_test_pad), axis=1)
print(classification_report(y_test, y_pred, target_names=encoder.classes_))


[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step
              precision    recall  f1-score   support

    Negative       0.73      0.72      0.73      1222
     Neutral       0.81      0.73      0.77       849
    Positive       0.89      0.92      0.90      3528

    accuracy                           0.84      5599
   macro avg       0.81      0.79      0.80      5599
weighted avg       0.84      0.84      0.84      5599



In [26]:
import pickle


model.save("sentiment_model.h5")

with open("tokenizer.pkl", 'wb') as f:
    pickle.dump(tokenizer, f)

with open("label_encoder.pkl", 'wb') as f:
    pickle.dump(encoder, f)

print("✅ Model and tokenizer saved successfully!")




✅ Model and tokenizer saved successfully!
