In [None]:
# Import libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Vocabulary cap handed to the loader as `num_words`: only the 10,000 most
# frequent words are kept.
vocab_size = 10_000

# load_data returns a (train, test) pair; each element is an (inputs, labels) pair.
train_split, test_split = keras.datasets.imdb.load_data(num_words=vocab_size)
x_train, y_train = train_split
x_test, y_test = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Quick sanity check of the loaded data: split sizes plus the first ten
# integer ids of the first training review.
for caption, shown in (
    ("Training samples:", len(x_train)),
    ("Test samples:", len(x_test)),
    ("Example review (as integers):", x_train[0][:10]),
):
    print(caption, shown)

Training samples: 25000
Test samples: 25000
Example review (as integers): [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]


In [None]:
maxlen = 200  # every review is forced to exactly this many tokens

# pad_sequences pads AND truncates at the front ("pre") by default. Reviews
# longer than `maxlen` therefore keep their LAST `maxlen` tokens (not the
# first ones), and shorter reviews are left-padded with 0. The defaults are
# spelled out so this is visible; use truncating="post" to keep the opening
# words of each review instead.
x_train = keras.preprocessing.sequence.pad_sequences(
    x_train, maxlen=maxlen, padding="pre", truncating="pre"
)
x_test = keras.preprocessing.sequence.pad_sequences(
    x_test, maxlen=maxlen, padding="pre", truncating="pre"
)

print("Shape of x_train:", x_train.shape)
print("Shape of x_test:", x_test.shape)

Shape of x_train: (25000, 200)
Shape of x_test: (25000, 200)


In [None]:
# Seed the Python, NumPy and backend random generators so that weight
# initialisation (and the shuffling done later by fit) is repeatable after
# "Restart Kernel -> Run All". Some GPU kernels may still be nondeterministic.
keras.utils.set_random_seed(42)

# Bag-of-embeddings sentiment classifier:
# token ids -> 16-d embeddings -> mean over the sequence -> small MLP -> probability.
model = keras.Sequential([
    # Declaring the input shape up front builds the model immediately, so
    # model.summary() can report output shapes and parameter counts.
    keras.Input(shape=(maxlen,)),
    # `input_length` is deprecated in Keras 3 (it only triggers a warning);
    # the sequence length now comes from the Input above.
    layers.Embedding(input_dim=vocab_size, output_dim=16),
    layers.GlobalAveragePooling1D(),   # average all embeddings
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # binary output (positive or negative)
])

model.summary()

In [None]:
# Training setup: Adam optimiser, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Hyper-parameters of the training run, gathered in one place.
fit_options = dict(
    epochs=5,
    batch_size=512,
    validation_split=0.2,  # fraction of the training data held out for validation
    verbose=1,
)

# `history` keeps the object returned by fit for later inspection.
history = model.fit(x_train, y_train, **fit_options)

Epoch 1/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.5832 - loss: 0.6914 - val_accuracy: 0.6914 - val_loss: 0.6819
Epoch 2/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.6859 - loss: 0.6777 - val_accuracy: 0.7104 - val_loss: 0.6583
Epoch 3/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.7435 - loss: 0.6480 - val_accuracy: 0.7452 - val_loss: 0.6133
Epoch 4/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.7679 - loss: 0.5994 - val_accuracy: 0.7900 - val_loss: 0.5522
Epoch 5/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8131 - loss: 0.5344 - val_accuracy: 0.8256 - val_loss: 0.4883


In [None]:
# Score the trained model on the held-out test split.
# `results` is indexed as [loss, accuracy]: the loss first, then the single
# metric requested in compile().
results = model.evaluate(x_test, y_test, verbose=2)
test_accuracy = results[1]
print("\nTest Accuracy:", test_accuracy)

782/782 - 2s - 2ms/step - accuracy: 0.8230 - loss: 0.4936

Test Accuracy: 0.8230000138282776


In [None]:
# The Embedding layer is the first layer of the model; the first entry of its
# weight list is the learned lookup table (one row per token id).
embedding_layer = model.layers[0]
layer_weights = embedding_layer.get_weights()
embeddings = layer_weights[0]
print("\nEmbedding matrix shape:", embeddings.shape)


Embedding matrix shape: (10000, 16)


In [None]:
# Look up the learned embedding vector for any single word

from tensorflow.keras.datasets import imdb

# Offset between a word's index in the IMDB word index and the integer id used
# in the encoded reviews: ids 0-3 are reserved, so every index is shifted by 3.
INDEX_OFFSET = 3

# Load the IMDB word index dictionary (word -> integer index)
word_index = imdb.get_word_index()

# Inverse mapping (encoded id -> word), including the reserved ids
reverse_word_index = {rank + INDEX_OFFSET: token for token, rank in word_index.items()}
reverse_word_index.update({0: "<PAD>", 1: "<START>", 2: "<UNK>", 3: "<UNUSED>"})

# ---- Enter any word you want to check ----
word = "director"   # try "great", "bad", "terrible", "love", etc.

# Get its index. Try the word exactly as typed first; if that misses, retry
# with surrounding whitespace removed and lower-cased, so input such as
# "Director " still resolves (the index keys appear to be lower-case - TODO confirm).
index = word_index.get(word)
if index is None:
    index = word_index.get(word.strip().lower())

# Row of the embedding matrix for this word. It stays None when the word is
# unknown or when its shifted id lies beyond the rows of `embeddings`.
token_id = None
if index is not None and index + INDEX_OFFSET < embeddings.shape[0]:
    token_id = index + INDEX_OFFSET

if token_id is not None:
    print(f"Word: {word}")
    print(f"Index in vocabulary: {token_id}")
    print("Embedding vector:\n", embeddings[token_id])
else:
    print(f"'{word}' not found in the vocabulary (maybe too rare).")

Word: director
Index in vocabulary: 167
Embedding vector:
 [ 0.1995685   0.26369825 -0.20912778 -0.21768637  0.2382142  -0.06943125
  0.20830259 -0.2475625   0.32483417  0.26994038  0.28109616  0.21879698
  0.31488776  0.25802612 -0.1878137  -0.24240795]
