### **Setup**

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GlobalAveragePooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

### Loading Data

In [5]:
df = pd.read_csv("../data/processed/imdb_clean.csv")
df = df[['clean_review', 'sentiment']]
df.head()


Unnamed: 0,clean_review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### **Feature Extraction - Tokenization & Sequences**

In [7]:
MAX_VOCAB = 20000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_review'])

In [8]:
X = tokenizer.texts_to_sequences(df['clean_review'])
X = pad_sequences(X, maxlen=MAX_LEN, padding="post", truncating="post")

y = df['sentiment'].map({'negative':0, 'positive':1}).values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=123)

### **Model 1 - Simple Embedding + Average Pooling**

In [18]:
model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=128),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, validation_split=.2, epochs=5, batch_size=128)

Epoch 1/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 60ms/step - accuracy: 0.6674 - loss: 0.6067 - val_accuracy: 0.8452 - val_loss: 0.3517
Epoch 2/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 59ms/step - accuracy: 0.8762 - loss: 0.2997 - val_accuracy: 0.8630 - val_loss: 0.3161
Epoch 3/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 58ms/step - accuracy: 0.9061 - loss: 0.2365 - val_accuracy: 0.8810 - val_loss: 0.2931
Epoch 4/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 58ms/step - accuracy: 0.9246 - loss: 0.1988 - val_accuracy: 0.8675 - val_loss: 0.3228
Epoch 5/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 57ms/step - accuracy: 0.9404 - loss: 0.1585 - val_accuracy: 0.8514 - val_loss: 0.3730


### **Evaluation**

In [19]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("AvgPool Model Accuracy:", accuracy_score(y_test, y_pred))


[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
AvgPool Model Accuracy: 0.8521581282775312


In [20]:
print("AvgPool Model Classification Report")
print(classification_report(y_test, y_pred))


AvgPool Model Classification Report
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      4939
           1       0.88      0.82      0.85      4977

    accuracy                           0.85      9916
   macro avg       0.85      0.85      0.85      9916
weighted avg       0.85      0.85      0.85      9916



### **Model 2 - LSTM with Embeddings**

In [11]:
EMB_DIM = 120

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMB_DIM),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
model.summary()

### **Training**

In [12]:
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=128,
    verbose=1
)

Epoch 1/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 573ms/step - accuracy: 0.5233 - loss: 0.6904 - val_accuracy: 0.6143 - val_loss: 0.6652
Epoch 2/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 546ms/step - accuracy: 0.6103 - loss: 0.6554 - val_accuracy: 0.5361 - val_loss: 0.6841
Epoch 3/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 539ms/step - accuracy: 0.6003 - loss: 0.6463 - val_accuracy: 0.5922 - val_loss: 0.6515
Epoch 4/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 568ms/step - accuracy: 0.6387 - loss: 0.6030 - val_accuracy: 0.6411 - val_loss: 0.6254
Epoch 5/5
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 605ms/step - accuracy: 0.7542 - loss: 0.5050 - val_accuracy: 0.8466 - val_loss: 0.3750


### **Evaluation**

In [13]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("LSTM Accuracy:", accuracy_score(y_test, y_pred))


[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 80ms/step
LSTM Accuracy: 0.8429810407422348


In [14]:
print("LSTM Classification Report")
print(classification_report(y_test, y_pred))


LSTM Classification Report
              precision    recall  f1-score   support

           0       0.87      0.81      0.84      4939
           1       0.82      0.88      0.85      4977

    accuracy                           0.84      9916
   macro avg       0.84      0.84      0.84      9916
weighted avg       0.84      0.84      0.84      9916



## **Notebook Summary**

This notebook explores the performance of two neural network models for IMDB sentiment classification:

- **Model 1: Embedding + Global Average Pooling**
  - Test Accuracy: **85.2%**
  - Very lightweight and fast to train

- **Model 2: Embedding + LSTM**
  - Test Accuracy: **84.3%**
  - Performed slightly worse despite higher complexity
  - Shows that sequential modeling did not significantly outperform the simpler baseline

**Key Insight:**  
For this dataset, a simple average pooling model achieved *better performance* than the LSTM,  
suggesting that word presence/absence carries enough signal for sentiment without needing sequential dependencies.
