In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping 
import warnings




In [2]:
# Step 1 : Data Preprocessing
data = pd.read_csv('Train.csv') #Load dataset
data = data.sample(frac=1)  # Shuffle dataset

In [3]:
data.rename(columns={'text':'reviews' , 'label':'sentiment'},inplace=True)

In [4]:
data['sentiment']

24059    0
34469    0
28131    1
7641     0
38728    1
        ..
17317    0
8351     0
14027    0
17390    1
2152     1
Name: sentiment, Length: 40000, dtype: int64

In [5]:
x =  data["reviews"].values
y =  data['sentiment'].values

In [6]:
# Step 2 : Data Splitting
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [7]:
# Step 3 : Model Building
max_words = 10000 # Number of unique words to use in tokenizer
maxlen    = 200   # Maximun length of sequence
embedding_dim = 128 # Dimension of word embeddings

In [8]:
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(x_train)

In [9]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test  = tokenizer.texts_to_sequences(x_test)

In [10]:
np.unique(y_test)

array([0, 1], dtype=int64)

In [11]:
x_train = pad_sequences(x_train,padding='post',maxlen=maxlen)
x_test  = pad_sequences(x_test,padding='post',maxlen= maxlen)

In [12]:
model = Sequential()
model.add(Embedding(input_dim=max_words,output_dim=embedding_dim,input_length=maxlen,))
model.add(LSTM(units=64,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))




In [13]:
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])





In [14]:
# Define early stopping to prevent overfitting 
early_stopping = EarlyStopping(monitor='val_score',patience=3,restore_best_weights=True)

In [15]:
# Train the model
model.fit(x_train,y_train,epochs=6,batch_size=128,validation_split=0.1,callbacks=[early_stopping])

Epoch 1/6


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.src.callbacks.History at 0x1c5e8172d50>

In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming maxlen is the maximum length of sequences
x_test_padded = pad_sequences(x_test , maxlen=maxlen)

In [17]:
# Model Evaluation
y_pred = model.predict(x_test_padded) 
# Convert probabilities to binary labels
y_pred_binary = (y_pred > 0.5).astype(int)

# Print accuracy, classification report, and confusion matrix
print("Accuracy:", accuracy_score(y_test, y_pred_binary))
print("Classification Report:\n", classification_report(y_test, y_pred_binary))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_binary))

Accuracy: 0.618625
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.87      0.69      3970
           1       0.75      0.37      0.49      4030

    accuracy                           0.62      8000
   macro avg       0.66      0.62      0.59      8000
weighted avg       0.66      0.62      0.59      8000

Confusion Matrix:
 [[3463  507]
 [2544 1486]]
