In [1]:
import tensorflow as tf
print(tf.__version__)
import tensorflow_datasets as tfds

2.4.0


In [2]:
# Fetch and prepare the TFDS 'imdb_reviews' dataset, then expose its splits.
imdb_bldr = tfds.builder('imdb_reviews')
imdb_bldr.download_and_prepare()

# shuffle_files=False asks TFDS not to shuffle the underlying input files.
imdb_ds_orig = imdb_bldr.as_dataset(shuffle_files=False)
imdb_train_orig, imdb_test_orig = (
    imdb_ds_orig[split_name] for split_name in ('train', 'test')
)

[A
Dl Completed...:   0%|          | 0/1 [00:22<?, ? url/s]
Dl Size...:  98%|█████████▊| 78/80 [00:22<00:00,  7.27 MiB/s][A
Dl Completed...:   0%|          | 0/1 [00:23<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:23<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:23<00:00, 23.79s/ url]
Dl Size...: 100%|██████████| 80/80 [00:23<00:00,  3.35 MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:23<00:00, 23.87s/ url]
  0%|          | 0/3 [00:00<?, ? splits/s]
0 examples [00:00, ? examples/s][A


1 examples [00:01,  1.73s/ examples][A
255 examples [00:01, 193.87 examples/s][A
518 examples [00:01, 432.37 examples/s][A
778 examples [00:02, 698.52 examples/s][A
1045 examples [00:02, 991.30 examples/s][A
1314 examples [00:02, 1287.13 examples/s][A
1573 examples [00:02, 1544.97 examples/s][A
1842 examples [00:02, 1795.53 examples/s][A
2106 examples [00:02, 1998.07 examples/s][A
2365 examples [00:02, 2143.79 examples/s][A
2631 examples [00:02, 2275.56 examples/s][A
2896

In [3]:
# Split configuration. The shuffle buffer is presumably the full size of the
# 'train' split (TODO confirm), so the shuffle is uniform over all examples.
SHUFFLE_BUFFER_SIZE = 25000
TRAIN_SIZE = 20000
# Fixed seed so the train/validation partition is the same after every
# kernel restart; without it each run trains and validates on different rows.
SHUFFLE_SEED = 1


def _to_text_label(example):
  """Turn a feature dict into a (text, label) tuple."""
  return example['text'], example['label']


# Start from imdb_ds_orig['train'] (not from imdb_train_orig itself) so this
# cell can be re-run: mapping an already-mapped tuple dataset would fail.
imdb_train_orig = imdb_ds_orig['train'].map(_to_text_label)
test_ds = imdb_test_orig.map(_to_text_label)

# reshuffle_each_iteration=False freezes the shuffled order, which is what
# keeps take() and skip() below disjoint across iterations/epochs.
imdb_train_orig = imdb_train_orig.shuffle(
    SHUFFLE_BUFFER_SIZE, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)
train_ds = imdb_train_orig.take(TRAIN_SIZE)
val_ds = imdb_train_orig.skip(TRAIN_SIZE)

In [5]:
# Sanity check: show the label of the first (text, label) training example.
first_example = next(iter(train_ds))
print(first_example[1])

tf.Tensor(0, shape=(), dtype=int64)


In [6]:
from collections import Counter
tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()

# Count token occurrences over the training split only, so the vocabulary
# is not built from validation or test text.
for text, _label in train_ds:
  token_counts.update(tokenizer.tokenize(text.numpy()))

print('Vocab-size: ', len(token_counts))

Vocab-size:  85526


In [7]:
# Build the text-to-integer encoder from the tokens seen in the training split.
# NOTE(review): the model cell sizes its embedding as len(token_counts) + 2,
# presumably for a padding id and an out-of-vocabulary bucket — confirm
# against the TokenTextEncoder documentation.
encoder = tfds.deprecated.text.TokenTextEncoder(vocab_list=token_counts)

In [8]:
def encode(text_tensor, label):
  """Encode one review with the module-level `encoder`.

  Runs eagerly (it calls `.numpy()`), so it must be invoked through
  `tf.py_function` when used inside a `tf.data` pipeline.

  Args:
    text_tensor: eager tensor holding the review text.
    label: the example's label, passed through untouched.

  Returns:
    A `(token_ids, label)` tuple, where `token_ids` is whatever
    `encoder.encode` returns for the text.
  """
  token_ids = encoder.encode(text_tensor.numpy())
  return token_ids, label

def encode_map_fn(text, label):
  """Wrap `encode` in `tf.py_function` so it can be used with `Dataset.map`.

  Args:
    text: review text tensor.
    label: label tensor.

  Returns:
    The outputs of `tf.py_function`, declared as two int64 tensors
    (encoded text, label).
  """
  encoded = tf.py_function(
      func=encode,
      inp=[text, label],
      Tout=(tf.int64, tf.int64),
  )
  return encoded

# Integer-encode every split with the same encoder.
# NOTE: this rebinds train_ds/val_ds/test_ds, so the cell is meant to run
# once after the split cell; running it twice would encode encoded data.
train_ds, val_ds, test_ds = (
    split.map(encode_map_fn) for split in (train_ds, val_ds, test_ds)
)

# Show the encoded token ids of the first training example.
first_encoded = next(iter(train_ds))
print(first_encoded[0])

# Batches of 32; the -1 in padded_shapes lets each batch pad its token
# sequences to that batch's longest one, while the scalar label is unpadded.
train_ds, val_ds, test_ds = (
    split.padded_batch(32, padded_shapes=([-1], []))
    for split in (train_ds, val_ds, test_ds)
)

tf.Tensor(
[  1   2   3   4   5   6   1   7   8   9  10   5  11  12  13   3  14  15
  16  17  18  19  20  21  22  14  23  24  13  25  21  26  27  28  21  29
  30  31  31  32  30  33  34  35  36  37  38  39  40  41  42  43  44   1
  45  46  47  48  49  50  51   1  52  53  54  19  55  56  57  58  59  60
  61  62  48  63  64  65  66  19  44  18  51  67  68  23  19  55  69  70
  18  71  72  73  74  75   1  76  77  22  51  78  50   1  79  80  19  44
  81  82  33  83  84  60  31  31   1   2  85   4  12  13  86  87  88  15
  89  24  90  91  28  92  93  30  94  95  26  96   3  21  22  30  97  98
  99  95 100 101 102 103 104  21 105  28  30 106 107 108  30 109 110  69
  70 111  14  21 112  19 113  33 114 115 116 117  19 118 119  13 120  60
 121  13  25  21  26 122  18  51  20 123  31  31   1 124  30  86 125 126
  10   1 127  86 128 129 130  23  30 131 130  23 132  30 133  69  70 134
 135 100 101  19 136   1 137  18  93   1 138  48 139 140 141  38 142 143
   1 144 145  19  20  28  19 146 147 148

In [9]:
# Stacked bidirectional-LSTM classifier over embedded token ids.
model = tf.keras.Sequential()

# Vocabulary size is the number of distinct training tokens plus 2
# (presumably the padding id and an out-of-vocabulary id — TODO confirm).
model.add(tf.keras.layers.Embedding(input_dim=len(token_counts) + 2, output_dim=32))

# First recurrent layer keeps the full sequence so a second one can follow;
# the second returns only its final output.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))

# Classification head: a single sigmoid unit for the binary label.
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2736896   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 64)          16640     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,782,593
Trainable params: 2,782,593
Non-trainable params: 0
_________________________________________________________________


In [11]:
import numpy as np
# Steps per epoch = number of batches in the training split
# (20000 training examples taken earlier, batched by 32).
# True division must come before ceil so a partial final batch would still
# count as a step; the int cast is because this is a step count, not a float.
num_train_examples = 20000
batch_size = 32
steps_per_epoch = int(np.ceil(num_train_examples / batch_size))

# from_logits=False because the model's last layer already applies a sigmoid.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=4,
    steps_per_epoch=steps_per_epoch,
)
print(history.history)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
{'loss': [0.465929239988327, 0.3280166983604431, 0.1764788031578064, 0.14459805190563202], 'accuracy': [0.7783499956130981, 0.858299970626831, 0.9370499849319458, 0.9519000053405762], 'val_loss': [0.3522716462612152, 0.4174749553203583, 0.32418209314346313, 0.45008188486099243], 'val_accuracy': [0.8560000061988831, 0.819599986076355, 0.8737999796867371, 0.8557999730110168]}
