In [None]:
# # IMDb Sentiment Analysis with LSTM
# This notebook implements an LSTM model for sentiment analysis on IMDb reviews

# # 1. Setup and Imports
# First, let's import all required libraries and modules.

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

# Set random seed for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# # 2. Data Loading and Preprocessing

In [4]:
# Load data
df = pd.read_csv('../../data/raw/IMDB Dataset.csv')
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Clean text function
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [6]:
# Apply cleaning
df['cleaned_review'] = df['review'].apply(clean_text)
df['sentiment_label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [7]:
# Split data
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")


Train size: 35000
Validation size: 7500
Test size: 7500


In [8]:
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['cleaned_review'])

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_df['cleaned_review'])
val_sequences = tokenizer.texts_to_sequences(val_df['cleaned_review'])
test_sequences = tokenizer.texts_to_sequences(test_df['cleaned_review'])

In [9]:
# Padding sequences
max_length = 200
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')


In [10]:
# Save tokenizer for later use
import pickle
with open('../../models/lstm/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


In [None]:
# # 3. Model Building

In [None]:
# Define the LSTM architecture
# 1. Limit vocabulary size in Tokenizer (add this when creating tokenizer)
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')  # Top 20,000 words only

# 2. Update model building (remove input_length)
def build_lstm_model(vocab_size, embedding_dim=128, lstm_units=64):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),  # Removed input_length
        Bidirectional(LSTM(lstm_units, return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(lstm_units//2)),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# 3. Explicitly build the model by passing a sample input
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {min(vocab_size, 20000)}")  # Will use max 20,000 due to Tokenizer

model = build_lstm_model(min(vocab_size, 20000))  # Ensure we don't exceed num_words
model.build((None, 200))  # Explicitly build for 200-length sequences
model.summary()


In [15]:
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

model = build_lstm_model(vocab_size)
model.summary()

Vocabulary size: 160788
