In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import imdb

In [None]:
# Vocabulary cap handed to the IMDB loader through `num_words`; it is also reused
# later as the number of rows of the embedding table.
vocab_size = 10000
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(
    num_words=vocab_size
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Sanity check of the raw dataset: size of each split and the first ten token ids
# of the first training review.
dataset_facts = (
    ("Training samples:", len(x_train)),
    ("Test samples", len(x_test)),
    ("Example review (as integers):", x_train[0][:10]),
)
for label, value in dataset_facts:
    print(label, value)

Training samples: 25000
Test samples 25000
Example review (as integers): [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]


In [None]:
# Bring every review to exactly `maxlen` token ids so the splits become
# rectangular arrays that can be fed to the model.
maxlen = 200
pad_sequences = keras.preprocessing.sequence.pad_sequences
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

# Confirm both splits now have the expected (samples, maxlen) layout.
for label, split in (("shape of x_train:", x_train), ("Shape of x_test:", x_test)):
    print(label, split.shape)

shape of x_train: (25000, 200)
Shape of x_test: (25000, 200)


In [None]:
# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling are repeatable across "Restart & Run All".
keras.utils.set_random_seed(42)

# Bag-of-embeddings sentiment classifier:
#   token ids -> 16-d embeddings -> mean over the sequence -> Dense(16) -> sigmoid.
model = keras.Sequential([
    # Declaring the input shape up front builds the model immediately, so
    # `model.summary()` can report output shapes and parameter counts. This
    # replaces the `input_length` argument of `Embedding`, which is deprecated
    # in Keras 3. `Sequential.layers` does not list the input placeholder, so
    # `model.layers[0]` is still the Embedding layer.
    keras.Input(shape=(maxlen,), dtype="int32"),
    layers.Embedding(input_dim=vocab_size, output_dim=16),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation="relu"),
    # Single sigmoid unit: probability-like score for the binary label.
    layers.Dense(1, activation="sigmoid"),
])
model.summary()

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, and
# accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Training schedule gathered in one place: 5 epochs, batches of 512, 20% of the
# training data set aside for validation, per-epoch progress output.
fit_options = dict(epochs=5, batch_size=512, validation_split=0.2, verbose=1)
history = model.fit(x_train, y_train, **fit_options)

Epoch 1/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.5282 - loss: 0.6915 - val_accuracy: 0.5804 - val_loss: 0.6834
Epoch 2/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.6117 - loss: 0.6789 - val_accuracy: 0.7162 - val_loss: 0.6570
Epoch 3/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7356 - loss: 0.6471 - val_accuracy: 0.7792 - val_loss: 0.6107
Epoch 4/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7780 - loss: 0.5944 - val_accuracy: 0.8018 - val_loss: 0.5479
Epoch 5/5
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.8185 - loss: 0.5265 - val_accuracy: 0.8190 - val_loss: 0.4849


In [None]:
# Score the trained model on the held-out test split. `results` holds the loss
# followed by the compiled metric (accuracy), in that order.
results = model.evaluate(x_test, y_test, verbose=2)
test_loss, test_accuracy = results
print("\nTest Accuracy:", test_accuracy)

782/782 - 1s - 2ms/step - accuracy: 0.8149 - loss: 0.4903

Test Accuracy: 0.8149200081825256


In [None]:
# The Embedding layer is the first layer of the model; its first weight array is
# the lookup table with one row per token id.
embedding_layer = model.layers[0]
layer_weights = embedding_layer.get_weights()
embeddings = layer_weights[0]
print("\nEmbedding matrix shape: ", embeddings.shape)


Embedding matrix shape:  (10000, 16)


In [None]:
# Look up the learned embedding vector of a single word.
#
# `load_data` was called with its default settings, under which every raw word
# index is shifted by 3 to make room for the reserved ids 0-3. The same offset
# therefore maps entries of `word_index` to rows of the embedding matrix.
INDEX_OFFSET = 3
word_index = imdb.get_word_index()

# Inverse mapping (token id -> word), including the reserved ids.
reverse_word_index = {v + INDEX_OFFSET: k for k, v in word_index.items()}
reverse_word_index[0] = "<PAD>"
reverse_word_index[1] = "<START>"
reverse_word_index[2] = "<UNK>"
reverse_word_index[3] = "<UNUSED>"

word = "cute"
index = word_index.get(word)
# Shift once and reuse; stays None when the word is unknown to the dataset.
token_id = None if index is None else index + INDEX_OFFSET

# Only ids below the number of embedding rows were kept as real tokens; words
# ranked beyond that have no row of their own.
if token_id is not None and token_id < embeddings.shape[0]:
    print(f"Word: {word}")
    print(f"Index in vocabulary: {token_id}")
    print("Embedding vector:\n" ,embeddings[token_id])
else:
    print(f" '{word}' not found in vocabulary(maybe too rare).")

Word: cute
Index in vocabulary: 1036
Embedding vector:
 [ 0.11818766 -0.05218965 -0.11516    -0.06647555 -0.03518979  0.02760907
 -0.04192568 -0.08184912 -0.09488284  0.05790931 -0.07164112 -0.09909887
 -0.04431644  0.12069669  0.04283465 -0.08983482]
