In [1]:
# Import required libraries
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense     #The Embedding Layer translates words into meaningful number profiles so the LSTM can "understand" language

# Dataset parameters
vocab_size = 10000  # Use top 10K frequent words, rare words are replaced with <UNK>,    Balances model complexity and computational efficiency.
maxlen = 200        # padding: maxlen=200 ensures all reviews are 200 words long.

# Load IMDB dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

# Create word-to-index mapping
word_index = imdb.get_word_index()

# Decoding setup (for human-readable output)
reverse_word_index = {value + 3: key for key, value in word_index.items()}
reverse_word_index[0] = "<PAD>"     # Padding token
reverse_word_index[1] = "<START>"   # Sequence start token
reverse_word_index[2] = "<UNK>"     # Unknown word token
reverse_word_index[3] = "<UNUSED>"  # Reserved token

# Decode integer sequence to text
def decode_review(encoded_review):
    return ' '.join([reverse_word_index.get(i, '?') for i in encoded_review])

# Display sample review
sample_index = 0
print("Decoded Review Example:")
print(decode_review(x_train[sample_index]))
print("Sentiment:", "Positive" if y_train[sample_index] == 1 else "Negative")

# Pad sequences to uniform length
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')

# Model architecture
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen),  # Word embeddings
    LSTM(100),                                                           # LSTM layer     A single LSTM layer with 100 memory units (neurons).
    Dense(1, activation='sigmoid')                                       # Output layer
])

# Compile model
model.compile(optimizer='adam',
              loss='binary_crossentropy',   # Binary classification loss, Measures the difference between predicted probabilities and true labels (0/1)
              metrics=['accuracy'])

# Train model
history = model.fit(x_train, y_train,
                   epochs=3,
                   batch_size=64,
                   validation_split=0.2)    #Reserves 20% of training data for validation (monitors overfitting).

# Evaluate
loss, accuracy = model.evaluate(x_test, y_test)
print("\nTest Accuracy:", accuracy)









#To perform binary sentiment classification (positive/negative) on IMDB movie reviews using an LSTM-based deep learning model, demonstrating NLP (Natural Language Processing) for text analysis.

#Sigmoid activation:     Sigmoid outputs a value between 0 (negative) and 1 (positive).
                        #Ideal for binary classification (vs. softmax for multi-class).

# How would you improve this model?
        #Add Dropout (e.g., Dropout(0.2)) to prevent overfitting.

#limitations of this approach:   Fixed vocabulary: Rare words are ignored (<UNK>)

# purpose of pad_sequences:   Ensures all reviews have the same length (maxlen).
                             #Padding: Adds zeros to shorter sequences.
                              #Truncation: Cuts off excess words in longer sequences.

#word_index dictionary do:  Maps each word to a unique integer (e.g., {"movie": 17, "good": 42}).
                            #Used to encode reviews as integer sequences.

#Epochs	3	Training iterations
#The encoded review ([1, 14, 22, ...]) is the numerical version of the text, where:
#Each number = a word (e.g., 14 = "this", 22 = "film").
#1 = Start, 2 = Unknown word, 0 = Padding (to make all reviews the same length).

# How was the "Decoded Review" generated?
         #Mapped integer IDs back to words using reverse_word_index
        #Replaced special tokens:
         #<START> = Beginning of review
         #<UNK> = Unknown word (not in top 10,000 vocabulary)

# UserWarning about input_length:     Modern Keras versions automatically handle sequence length

#Why did validation accuracy drop in Epoch 3?
            #Overfitting: Model memorized training data (solution: add Dropout)
            #Small dataset: Only 3 epochs may not show stable trends
            #Learning rate: Too high

#final Test Accuracy (62.42%) mean:
                            #The model correctly classified 62.42% of unseen reviews
                            #Baseline for binary classification: 50% (random guessing)
                            #Interpretation: The LSTM learned some sentiment patterns but has room for improvement

#mprove 62.42% accuracy?   :  Increase vocabulary size (from 10,000 to 20,000 words)    , Add Dropout layers (e.g., Dropout(0.2) after LSTM)

#timing information (e.g., 122s/epoch:  Training time per epoch


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Encoded Review:
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 7



Epoch 1/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 377ms/step - accuracy: 0.5236 - loss: 0.6866 - val_accuracy: 0.5942 - val_loss: 0.6516
Epoch 2/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 378ms/step - accuracy: 0.6393 - loss: 0.6322 - val_accuracy: 0.6930 - val_loss: 0.5961
Epoch 3/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 383ms/step - accuracy: 0.6639 - loss: 0.5941 - val_accuracy: 0.6284 - val_loss: 0.6189
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 51ms/step - accuracy: 0.6286 - loss: 0.6143

Test Accuracy: 0.6241999864578247
