<a href="https://colab.research.google.com/github/GTworx/IMDb_Text_Classification_with_LSTM/blob/main/IMDb_Text_Classification_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
IMDb Text Classification with LSTM.

This script demonstrates how to build, train, and evaluate a Long Short-Term
Memory (LSTM) neural network for sentiment analysis on the IMDb movie review
dataset.
"""

# -----------------------------------------------------------------------------
# 1. Import Necessary Libraries
# -----------------------------------------------------------------------------

import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import set_random_seed, to_categorical

# Seed every random number generator the pipeline draws from.
# Seeding NumPy alone leaves TensorFlow's weight initialisation unseeded, so
# two runs would still differ. `set_random_seed` seeds Python's `random`
# module, NumPy and TensorFlow in one call.
# NOTE(review): bit-identical results on a GPU may additionally require
# deterministic ops to be enabled - confirm if exact reproducibility matters.
set_random_seed(42)

# -----------------------------------------------------------------------------
# 2. Set Hyperparameters and Load Data
# -----------------------------------------------------------------------------

# --- Hyperparameters ---
# Vocabulary size: words are ranked by frequency and only the `TOP_WORDS`
# most frequent ones are kept.
TOP_WORDS = 5_000
# Fixed length, in words, of every review fed to the network. Shorter reviews
# are padded and longer ones are truncated to this length.
MAX_REVIEW_LENGTH = 500
# Size of the vector used to represent each word in the embedding layer.
EMBEDDING_VECTOR_LENGTH = 32
# Number of memory units in the LSTM layer.
LSTM_UNITS = 100

# --- Load the IMDb Dataset ---
# The dataset comes pre-processed: every review is already a sequence of
# integer word indexes. `num_words` limits the vocabulary to the `TOP_WORDS`
# most frequent words.
train_split, test_split = imdb.load_data(num_words=TOP_WORDS)
X_train, y_train = train_split
X_test, y_test = test_split

print("--- Data Loading ---")
print(f"Number of training samples: {len(X_train)}")
print(f"Number of testing samples: {len(X_test)}")
print("\nExample of a raw training review (sequence of word indexes):")
print(X_train[0])

# -----------------------------------------------------------------------------
# 3. Preprocess the Data
# -----------------------------------------------------------------------------

# --- Pad Sequences ---
# The network needs inputs of one consistent shape, but reviews vary in
# length. `pad_sequences` pads or truncates every review to
# `MAX_REVIEW_LENGTH` and returns a 2D NumPy array of shape
# (num_samples, num_timesteps). Both splits get the same treatment.
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=MAX_REVIEW_LENGTH)
    for reviews in (X_train, X_test)
)

print("\n--- Data Preprocessing ---")
print(f"Shape of training data after padding: {X_train.shape}")
print(f"Shape of testing data after padding: {X_test.shape}")
print("\nExample of a padded training review:")
print(X_train[0])

# -----------------------------------------------------------------------------
# 4. Build the LSTM Model
# -----------------------------------------------------------------------------

# --- Define the Model Architecture ---
# We will use a Keras Sequential model, which is a linear stack of layers.
model = Sequential()

# --- Input Specification ---
# Declares the shape of one sample: a padded review of `MAX_REVIEW_LENGTH`
# word indexes. Declaring it up front builds the model as layers are added,
# so `model.summary()` can report output shapes and parameter counts before
# training. This replaces the Embedding layer's `input_length` argument,
# which is deprecated in Keras 3.
model.add(Input(shape=(MAX_REVIEW_LENGTH,)))

# --- Layer 1: Embedding Layer ---
# Looks up a trainable vector of size `EMBEDDING_VECTOR_LENGTH` for each of
# the `TOP_WORDS` possible word indexes. The result is a 3D tensor of shape
# (batch_size, sequence_length, embedding_dim).
model.add(Embedding(TOP_WORDS, EMBEDDING_VECTOR_LENGTH))

# --- Layer 2: LSTM Layer ---
# The core of the model. The LSTM processes the sequence of word embeddings
# and returns a single output vector for the final timestep.
model.add(LSTM(LSTM_UNITS))

# --- Layer 3: Output Layer ---
# A fully connected (Dense) layer with a single sigmoid unit. The sigmoid
# outputs a value between 0 and 1, which suits binary classification
# (positive vs. negative sentiment).
model.add(Dense(1, activation='sigmoid'))


# -----------------------------------------------------------------------------
# 5. Compile the Model
# -----------------------------------------------------------------------------

# --- Set Optimizer, Loss Function, and Metrics ---
# - optimizer: 'adam', an efficient and commonly used optimization algorithm.
# - loss: 'binary_crossentropy', the standard loss for binary classification.
# - metrics: 'accuracy', reported during training and evaluation.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# --- Display Model Summary ---
# Prints an overview of the architecture: the layers, their output shapes
# and the number of parameters.
print("\n--- Model Architecture ---")
model.summary()


# -----------------------------------------------------------------------------
# 6. Train the Model
# -----------------------------------------------------------------------------

print("\n--- Training the Model ---")
# `fit` trains the model for a fixed number of epochs.
# - epochs: number of full passes over the training dataset.
# - batch_size: number of samples per gradient update.
# NOTE(review): the test split doubles as the validation data here, so the
# per-epoch `val_*` metrics are computed on the same samples as the final
# test score. Switch to a held-out part of the training data if validation
# metrics are ever used to tune the model.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=3,
    validation_data=(X_test, y_test),
)


# -----------------------------------------------------------------------------
# 7. Evaluate the Model
# -----------------------------------------------------------------------------

# --- Calculate Final Accuracy on Test Data ---
# `evaluate` runs the model in test mode and returns the loss followed by the
# metric values; with 'accuracy' as the only compiled metric, index 1 holds
# the accuracy.
scores = model.evaluate(x=X_test, y=y_test, verbose=0)
test_accuracy_percent = scores[1] * 100

print("\n--- Model Evaluation ---")
print(f"Accuracy on test data: {test_accuracy_percent:.2f}%")

# -----------------------------------------------------------------------------
# 8. Make Predictions on New Data
# -----------------------------------------------------------------------------
# To show how to use the model in a real-world scenario, let's classify a
# few example reviews.

# --- Example Reviews (as sequences of word indexes) ---
# In a real application, you would need a tokenizer to convert raw text
# into these sequences based on the original IMDb word index.
# For this example, we'll just use a few samples from the test set.
# Slices (rather than plain indexing) keep the leading batch dimension.
POSITIVE_SAMPLE_INDEX = 1  # test-set row used as the "positive" example
NEGATIVE_SAMPLE_INDEX = 3  # test-set row used as the "negative" example
sample_positive_review = X_test[POSITIVE_SAMPLE_INDEX:POSITIVE_SAMPLE_INDEX + 1]
sample_negative_review = X_test[NEGATIVE_SAMPLE_INDEX:NEGATIVE_SAMPLE_INDEX + 1]


def sentiment_label(score, threshold=0.5):
    """Return 'Positive' when `score` exceeds `threshold`, else 'Negative'.

    Works for both the model's sigmoid output and the 0/1 ground-truth labels.
    """
    return 'Positive' if score > threshold else 'Negative'


# --- Make Predictions ---
prediction_positive = model.predict(sample_positive_review)
prediction_negative = model.predict(sample_negative_review)

# The ground-truth label of each sample is read from `y_test` and printed next
# to the prediction, so a misclassification is visible in the output instead
# of being hidden behind an assumed label.
print("\n--- Making Predictions ---")
print(f"Prediction for a positive review (raw output): {prediction_positive[0][0]:.4f}")
print(f"Sentiment: {sentiment_label(prediction_positive[0][0])}")
print(f"True label: {sentiment_label(y_test[POSITIVE_SAMPLE_INDEX])}")

print(f"\nPrediction for a negative review (raw output): {prediction_negative[0][0]:.4f}")
print(f"Sentiment: {sentiment_label(prediction_negative[0][0])}")
print(f"True label: {sentiment_label(y_test[NEGATIVE_SAMPLE_INDEX])}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
--- Data Loading ---
Number of training samples: 25000
Number of testing samples: 25000

Example of a raw training review (sequence of word indexes):
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029,




--- Training the Model ---
Epoch 1/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m414s[0m 1s/step - accuracy: 0.7085 - loss: 0.5407 - val_accuracy: 0.8569 - val_loss: 0.3577
Epoch 2/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 950ms/step - accuracy: 0.8549 - loss: 0.3509 - val_accuracy: 0.8648 - val_loss: 0.3201
Epoch 3/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 919ms/step - accuracy: 0.8856 - loss: 0.2839 - val_accuracy: 0.8700 - val_loss: 0.3384

--- Model Evaluation ---
Accuracy on test data: 87.00%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step

--- Making Predictions ---
Prediction for a positive review (raw output): 0.9814
Sentiment: Positive

Prediction for a negative review (raw output): 0.9445
Sentiment: Positive


In [None]:
# -----------------------------------------------------------------------------
# 2.5. Decode a Review to See Text Content
# -----------------------------------------------------------------------------
# To see the actual words, fetch the word-to-index mapping that Keras
# provides for the IMDb dataset.
word_to_index = imdb.get_word_index()

# Invert the mapping. Every index is shifted by 3 because 0, 1 and 2 are
# reserved for the special tokens '<PAD>', '<START>' and '<UNK>' (unknown).
index_to_word = {rank + 3: word for word, rank in word_to_index.items()}
index_to_word.update({0: "<PAD>", 1: "<START>", 2: "<UNK>"})

def decode_review(text_sequence):
    """Translate a sequence of word indexes into a space-separated string.

    Each index is looked up in the module-level `index_to_word` mapping;
    indexes that are missing from it are rendered as "?".
    """
    words = (index_to_word.get(index, "?") for index in text_sequence)
    return " ".join(words)

# Decode the first training review to show its text. When this runs after
# the padding step, the review starts with a run of "<PAD>" tokens.
print("\nExample of a decoded training review:")
first_review_text = decode_review(X_train[0])
print(first_review_text)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step

Example of a decoded training review:
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <