In [1]:
""" Load the pre-tokenized IMBD Reviews with Subwords dataset """
import tensorflow_datasets as tfds

imdb, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)



In [2]:
""" Split the data into training and testing """
train_data, test_data = imdb['train'], imdb['test']

In [3]:
""" If you want to access the sub words tokenizer, use this """
tokenizer = info.features['text'].encoder
print(tokenizer.subwords)

, 'pee', 'nar', 'location_', 'ining_', 'gam', 'disappointing_', 'desire_', 'criminal_', 'considera', 'century_', 'celebrat', 'brow', 'area', 'Thin', 'Rec', "' (", 'ward_', 'vision_', 'treme', 'surprising_', 'super_', 'risk', 'receive', 'qual', 'pic', 'mee', 'levels', 'kins', 'jack', 'ire_', 'introduc', 'hits_', 'happening_', 'handsome', 'gradua', 'giv', 'garbage', 'forces_', 'finest_', 'easi', 'depressing', 'credits', 'asto', 'Sadly', 'Ple', 'Inc', 'Dick_', 'Alexand', 'wooden_', 'wood_', 'stro', 'steal_', 'soul_', 'reference', 'race', 'quis', 'pir', 'perv', 'obvious', 'majority_', 'lean', 'kes_', 'insti', 'identity', 'everybody_', 'double_', 'dies', 'credit', 'const', 'confe', 'compar', 'centur', 'bloody_', 'Under', 'Twi', 'Sean_', 'Lio', 'Halloween', 'Gal', 'Clu', 'Came', 'Barbara_', '?)', '11_', 'ws', 'ulous', 'subtle', 'substance', 'string', 'shocking_', 'scientist_', 'rian', 'nou', 'multi', 'lf', 'inal', 'harsh', 'handed', 'fir', 'expectations_', 'excited', 'exceptional', 'eva', 'c

In [4]:
""" See how it encodes and decodes strings """
sample_string = 'TensorFlow, from basics to mastery'

tokenized_string = tokenizer.encode(sample_string)
print('Tokenized string: ', tokenized_string)

original_string = tokenizer.decode(tokenized_string)
print('The original string: ', original_string)

Tokenized string:  [6307, 2327, 4043, 2120, 2, 48, 4249, 4429, 7, 2652, 8050]
The original string:  TensorFlow, from basics to mastery


In [5]:
""" Look at the actual tokens

Notice that this is case-sensitive and punctuation _is_ maintained, unlike the tokenizer in the previous lessons.
"""
tokens = []
words = []

for token in tokenized_string:
    tokens.append(token)
    words.append(tokenizer.decode([token]))

longest_token = max([str(t) for t in tokens], key=len)
spaces = ' ' * len(longest_token)

for i, token in enumerate(tokens):
    t_length = len(str(token))
    space = spaces[:len(spaces) - t_length]
    print(token, space, '-->', words[i])


6307  --> Ten
2327  --> sor
4043  --> Fl
2120  --> ow
2     --> , 
48    --> from 
4249  --> basi
4429  --> cs 
7     --> to 
2652  --> master
8050  --> y


In [6]:
""" Define the model

Note we need to use GlobalAveragePooling1D() instead of Flatten() because the input is not easily flattened.
"""
import tensorflow as tf
embedding_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
global_average_pooling1d (Gl (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 6)                 390       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 524,237
Trainable params: 524,237
Non-trainable params: 0
_________________________________________________________________


In [7]:
""" Compile and Train """

num_epochs = 10

model.compile(
    optimizer=tf.optimizers.Adam(),
    loss=tf.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

history = model.fit(
    train_data,
    epochs=num_epochs,
    validation_data=test_data
)

Epoch 1/10


ValueError: in user code:

    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:748 train_step
        loss = self.compiled_loss(
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:149 __call__
        losses = ag_call(y_true, y_pred)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:253 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1605 binary_crossentropy
        K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4823 binary_crossentropy
        return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/carlos/Library/Caches/pypoetry/virtualenvs/road-to-ml-engineer-pxbKeJd0-py3.8/lib/python3.8/site-packages/tensorflow/python/ops/nn_impl.py:173 sigmoid_cross_entropy_with_logits
        raise ValueError("logits and labels must have the same shape (%s vs %s)" %

    ValueError: logits and labels must have the same shape ((None, 1) vs ())


In [8]:
""" Plot the results """
import matplotlib.pyplot as Plot

def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.xlabel('Epochs')
    plt.ylabel(metric)
    plt.legend([string, 'val_' + metric])
    plt.show()

plot_graphs(history, 'acc')
plot_graphs(history, 'loss')

NameError: name 'history' is not defined

# The accuracy and loss are terrible... what happened?

This notebook is dealing with "subwords" like `fl`, `basi` and `cs` which don't really have any real meanings.

The `accuracy` and `val_accuracy` are stopping at ~50% which is just as good as random guesses, but this is because we're trying to solve this problem with subwords using the DNN we're used to. Now I get to learn about Recurrent Neural Networks (RNNs) to solve this!