In [1]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import set_random_seed

# Parameters
max_features = 10000  # Vocabulary size: top 10,000 words
max_len = 200         # Maximum review length (in words)
embedding_dim = 128   # Size of each learned word vector
seed = 42             # Fixed seed so repeated runs start from the same random state

# Training is stochastic (weight initialisation, dropout, batch shuffling).
# Seed the Python, NumPy and backend generators in one call so that a
# Restart & Run All gives comparable numbers.
# NOTE(review): some GPU kernels may still be non-deterministic, so metrics
# can differ slightly between runs even with a fixed seed.
set_random_seed(seed)

# Load IMDB data with the top max_features words only
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Pad sequences so that each review is max_len words long
x_train_pad = pad_sequences(x_train, maxlen=max_len)
x_test_pad = pad_sequences(x_test, maxlen=max_len)

# Build the model: token ids -> word vectors -> LSTM -> probability of a positive review.
# The deprecated `input_length` argument is intentionally not passed to Embedding
# (Keras 3 ignores it); the sequence length is supplied to `model.build` below instead.
model = Sequential([
    Embedding(max_features, embedding_dim),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')  # Binary classification output
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Create the weights up front so the summary can report concrete output
# shapes and parameter counts rather than an unbuilt model.
model.build(input_shape=(None, max_len))

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line after the table).
model.summary()

# Fit the network for 5 epochs in mini-batches of 64, with 20% of the
# training samples set aside for validation after each epoch.
history = model.fit(
    x_train_pad,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

# Measure loss and accuracy on the padded test reviews and report the accuracy
test_loss, test_acc = model.evaluate(x_test_pad, y_test)
print(f'Test accuracy: {test_acc:.4f}')

# Build the reverse vocabulary (token id -> word) used to turn test reviews back into text.
word_index = imdb.get_word_index()

# Every id from the word index is shifted by 3 (Keras IMDB dataset convention),
# which leaves ids 0-3 free for the special tokens added afterwards.
index_word = {rank + 3: word for word, rank in word_index.items()}
index_word.update({
    0: '<PAD>',
    1: '<START>',
    2: '<UNK>',
    3: '<UNUSED>',
})

def decode_review(text_ids, vocab=None):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        text_ids: Iterable of integer token ids.
        vocab: Optional mapping from token id to token text. When omitted,
            the notebook-level ``index_word`` table is used (looked up at
            call time, so it must already be defined).

    Returns:
        The tokens joined by single spaces. Ids that are missing from the
        mapping are rendered as ``'?'``; an empty input gives ``''``.
    """
    # Resolve the table at call time so the function does not silently
    # capture a stale global and can be used with any id -> token mapping.
    lookup = index_word if vocab is None else vocab
    return ' '.join(lookup.get(i, '?') for i in text_ids)

# Score the first five padded test reviews and threshold the probabilities at 0.5
sample_reviews = x_test_pad[:5]
sample_preds = model.predict(sample_reviews)
sample_preds_bin = (sample_preds > 0.5).astype(int)

# Print each review next to its true label and the model's output.
# The text shown is decoded from the unpadded x_test entry, while the
# prediction was computed from the padded version in x_test_pad.
for i, (prob_row, label_row) in enumerate(zip(sample_preds, sample_preds_bin)):
    print(f'Review {i+1}:')
    print(decode_review(x_test[i]))
    print(f'True label: {y_test[i]}, Predicted probability: {prob_row[0]:.4f}, Predicted label: {label_row[0]}')
    print('-----')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step




None
Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 478ms/step - accuracy: 0.7661 - loss: 0.4777 - val_accuracy: 0.7724 - val_loss: 0.4769
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 410ms/step - accuracy: 0.8483 - loss: 0.3646 - val_accuracy: 0.8266 - val_loss: 0.4111
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 398ms/step - accuracy: 0.8950 - loss: 0.2700 - val_accuracy: 0.8436 - val_loss: 0.3651
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 401ms/step - accuracy: 0.9064 - loss: 0.2357 - val_accuracy: 0.8448 - val_loss: 0.3958
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 396ms/step - accuracy: 0.9225 - loss: 0.2048 - val_accuracy: 0.8568 - val_loss: 0.3852
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 46ms/step - accuracy: 0.8476 - loss: 0.4058
Test accuracy: 0.8476
Downloading data from https://storage