In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Set TensorFlow's global random seed so later random ops start from a fixed state.
tf.random.set_seed(42)

In [2]:
import tensorflow_datasets as tfds
# Fetch the IMDB reviews dataset as (text, label) pairs, together with its metadata.
datasets, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
print(datasets.keys())

# Example counts per split, read from the dataset metadata.
train_size, test_size = (
    info.splits[split_name].num_examples for split_name in ("train", "test")
)
print(train_size, test_size)



dict_keys(['test', 'train', 'unsupervised'])
25000 25000


In [3]:
# Preview: print the first 200 characters and the label of the first two training reviews.
for X_batch, y_batch in datasets["train"].batch(2).take(1):
    raw_reviews = X_batch.numpy()
    raw_labels = y_batch.numpy()
    for idx, review in enumerate(raw_reviews):
        label = raw_labels[idx]
        sentiment = "= Positive" if label else "= Negative"
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, sentiment)
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [4]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into padded word tokens.

    Each review is cut to its first 900 bytes (substr's default unit), HTML
    line-break tags and every character other than ASCII letters and
    apostrophes are replaced by spaces, and the result is split on
    whitespace. The ragged token lists are then padded with b"<pad>" so the
    batch becomes a dense string tensor.

    Returns the padded token tensor and ``y_batch`` unchanged.
    """
    shortened = tf.strings.substr(X_batch, 0, 900)
    without_breaks = tf.strings.regex_replace(shortened, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    return tokens.to_tensor(default_value=b"<pad>"), y_batch
# Try the function on the two-review batch still bound to X_batch / y_batch by the preview loop.
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 117), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'piece', b'The', b'most', b'pathetic', b'scenes',
         b'were', b'those', b'when', b'the', b'Columbian', b'rebels',
         b'were', b'making', b'their', b'cases', b'for', b'revolutions',
         b'Maria', b'Conchita', b'Alonso', b'appeared', b'phony', b'and',
         b'her', b'pseudo', b'love', b'affair', b'with', b'Walken',
         b'was', b'nothing', b'but', b'a', b

In [5]:
from collections import Counter

In [6]:
# Counter mapping each token to the number of times it appears in the training reviews.
vocabulary = Counter()

In [7]:
# Start from an empty counter so that re-running this cell does not add the
# counts a second time on top of a previous run.
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(2).map(preprocess):
    # Convert the whole batch to NumPy once; each row holds one review's tokens
    # (including the b"<pad>" filler added by preprocess, so the pad count
    # depends on the batch size used here).
    for review in X_batch.numpy():
        vocabulary.update(review)

In [8]:
# Ask the Counter for just the five most frequent tokens instead of sorting
# the whole vocabulary and slicing the result.
vocabulary.most_common(5)

[(b'<pad>', 402966),
 (b'the', 169165),
 (b'a', 95577),
 (b'and', 92590),
 (b'of', 85655)]

In [9]:
# Number of distinct tokens counted (the b"<pad>" filler is one of them).
len(vocabulary)

83916

In [10]:
vocab_size = 10000
# Keep only the vocab_size most frequent tokens, most frequent first.
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]
# The Embedding layer later in the notebook uses mask_zero=True, which is only
# correct if the padding token gets ID 0, i.e. is the first (most frequent) entry.
assert truncated_vocabulary[:1] == [b"<pad>"], "padding token must map to ID 0"

In [11]:
# Number of extra IDs reserved for words that are not in the vocabulary.
num_oov_buckets = 1000

# Word i of truncated_vocabulary is assigned ID i.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init, num_oov_buckets=num_oov_buckets
)

# Quick check on a sample sentence; the made-up last word is expected to fall
# into one of the out-of-vocabulary buckets.
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))


<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   38,    13,    12, 10053]])>

In [12]:
def encode_words(X_batch, y_batch):
    """Replace every word token in ``X_batch`` by its integer ID from ``table``.

    Returns the ID tensor and ``y_batch`` unchanged.
    """
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch
# Training pipeline: endless stream of 32-review batches, tokenised, mapped
# to word IDs, with one batch prefetched.
train_set = (
    datasets["train"]
    .repeat()
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

# Test pipeline: a single pass in batches of 1000, same tokenising and encoding.
test_set = datasets["test"].batch(1000).map(preprocess).map(encode_words)

# Peek at the first encoded training batch and its labels.
for X_batch, y_batch in train_set.take(1):
    print(X_batch, y_batch, sep="\n")


tf.Tensor(
[[  38   12   31 ...    0    0    0]
 [   7   21   74 ...    0    0    0]
 [4691 6619    1 ...    0    0    0]
 ...
 [  38   13  127 ...    0    0    0]
 [1966 4420  501 ...    0    0    0]
 [3592 5567    7 ...    0    0    0]], shape=(32, 168), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [13]:
embed_size = 128

# Two stacked GRU layers on top of learned word embeddings, ending in a single
# sigmoid unit for the binary sentiment label.
model = keras.models.Sequential()
model.add(
    keras.layers.Embedding(
        input_dim=vocab_size + num_oov_buckets,  # vocabulary IDs plus OOV bucket IDs
        output_dim=embed_size,
        mask_zero=True,  # ID 0 is treated as padding and masked out
        input_shape=[None],  # sequences of any length
    )
)
model.add(keras.layers.GRU(4, return_sequences=True))
model.add(keras.layers.GRU(2))
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [14]:
import time
# Time the training run with a monotonic, high-resolution clock; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
# 32 matches the training batch size, so one epoch covers the training split once.
model.fit(train_set, steps_per_epoch=train_size // 32, epochs=2)
end = time.perf_counter()
print("Time of execution:", end-start)
# Displays [loss, accuracy] measured on the test split.
model.evaluate(test_set)

Epoch 1/2
Epoch 2/2
Time of execution: 351.48623538017273


[0.3867467939853668, 0.839680016040802]