In [1]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf

# Turn off TFDS progress bars so download/prepare steps don't flood the cell output.
tfds.disable_progress_bar()

In [2]:
import matplotlib.pyplot as plt 

def plot_graph(history, metric):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            per-epoch sequence of values (presumably the object returned by
            Keras ``Model.fit`` -- confirm against the caller).
        metric: Name of the training metric (e.g. ``'accuracy'``). The
            validation series is looked up under ``'val_' + metric``.

    Draws onto the current matplotlib figure; returns nothing.
    """
    val_metric = 'val_' + metric
    plt.plot(history.history[metric])
    plt.plot(history.history[val_metric])
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    # Legend entries follow the order of the two plot() calls above.
    # (Previously this line referenced an undefined name, `metrics`.)
    plt.legend([metric, val_metric])

In [3]:
# Load the IMDB reviews dataset as (text, label) tuples, together with its
# metadata object.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

train_dataset = dataset['train']
test_dataset = dataset['test']

# Displayed as the cell output: the structure of a single training element.
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [4]:
# Peek at one raw (text, label) pair before any batching is applied.
for example, label in train_dataset.take(1):
    raw_text = example.numpy()
    raw_label = label.numpy()
    print('text: ', raw_text)
    print('label: ', raw_label)

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [5]:
# Size of the shuffle buffer used when building the training pipeline.
BUFFER_SIZE = 10_000
# Number of examples per batch for both the train and test pipelines.
BATCH_SIZE = 64

In [6]:
# NOTE: both names are rebound to their batched versions, so re-running this
# cell without re-running the loading cell would batch an already-batched
# dataset.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [7]:
# Show the first three texts and labels of one training batch.
# `example` and `label` stay bound after the loop and are reused by later cells.
for example, label in train_dataset.take(1):
    first_texts = example.numpy()[:3]
    first_labels = label.numpy()[:3]
    print('texts: ', first_texts)
    print()
    print('labels: ', first_labels)

texts:  [b'Dreadful acting. A thinly veiled attempt to slam those on the left side of the aisle.<br /><br />Women are subjugated and revolve around men. Tom Selleck shows his acting range from A to B.'
 b"Yes this a B- grade horror. But at least the producers, directors, and cast does not pretend this flick is manna from heaven. The plot is corny, a psychotic serial killer on his way to execution is splashed with genetic acid turning him into a snow man. The snowman a.k.a. Jack Frost then goes on a murdering rampage to find the small town sheriff that finally arrested him. With a limited budget the crew had to make do with limited special effects, most of the money appears to spent on the snowman's costume. Particullary difficult shots are managed by cartoons or pan away shots (shots where the camera moves away to disguise the details). <br /><br /> This is no kid's movie and should not be confused with Disney movie of the same title. If you do not let your children watch pg-13 movies 

In [8]:
# Maximum vocabulary size kept by the text encoder.
VOCAB_SIZE = 1000

# `TextVectorization` is exposed directly under `tf.keras.layers` in current
# releases; the `experimental.preprocessing` path is deprecated and missing
# from newer Keras versions. Prefer the stable location and fall back to the
# experimental one only on older installs that lack it.
try:
    _TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
    _TextVectorization = (
        tf.keras.layers.experimental.preprocessing.TextVectorization
    )

encoder = _TextVectorization(max_tokens=VOCAB_SIZE)

In [9]:
# Fit the encoder on the text component only; the label of each pair is dropped.
text_only_dataset = train_dataset.map(lambda text, label: text)
encoder.adapt(text_only_dataset)

In [10]:
# Keep the vocabulary as a NumPy array so it can be indexed with arrays of
# token ids later on.
vocabulary_terms = encoder.get_vocabulary()
vocab = np.array(vocabulary_terms)

# Cell output: the first 20 vocabulary entries.
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [11]:
# Encode the batch of texts left over from the batch-peek cell and keep the
# first three rows as a NumPy array.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()

# Cell output.
encoded_example

array([[  1, 112,   4, ...,   0,   0,   0],
       [414,  11,   4, ...,   0,   0,   0],
       [ 39,   2,  86, ...,   0,   0,   0]])

In [12]:
# Compare each of the first three texts with the result of mapping its token
# ids back through the vocabulary.
for n in range(3):
    original_text = example[n].numpy()
    decoded_tokens = vocab[encoded_example[n]]
    print("Original: ", original_text)
    print("Round-trip: ", " ".join(decoded_tokens))
    print()

Original:  b'Dreadful acting. A thinly veiled attempt to slam those on the left side of the aisle.<br /><br />Women are subjugated and revolve around men. Tom Selleck shows his acting range from A to B.'
Round-trip:  [UNK] acting a [UNK] [UNK] attempt to [UNK] those on the left side of the [UNK] br women are [UNK] and [UNK] around men tom [UNK] shows his acting [UNK] from a to b                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           