# Load text

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os

In [2]:
physical_devices = tf.config.list_physical_devices('GPU') 
try: 
    tf.config.experimental.set_memory_growth(physical_devices[0], True) 
except:
    print('Invalid device or cannot modify virtual devices once initialized.')

In [3]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL+name)
    
parent_dir = os.path.dirname(text_dir)

parent_dir

'/home/georg/.keras/datasets'

## Load text into datasets

In [4]:
# This will iterate over every example in the dataset, returning (example, label) pairs.
def labeler(example, index):
    return example, tf.cast(index, tf.int64)  

labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

In [5]:
# Combine these labeled datasets into a single dataset, and shuffle it.
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [6]:
for ex in all_labeled_data.take(7):
    print(ex[0].numpy().decode())

As that thy nave may seem to touch, the goal:
Following their sovereign with dejected hearts,
And contest hot between them, all alike,
It burst his helmet, and his batter'd skull
But when Achilles had indulg'd his grief,
steadfast nor courageous. If you will not fight, or would talk others
The embraces of a God, and bore to Mars


## Encode text lines as numbers

### Build vocabulary

In [7]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

17178

### Encode examples

In [8]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [9]:
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text.decode())

As that thy nave may seem to touch, the goal:


In [10]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[5822, 15115, 4030, 10269, 16067, 9706, 16836, 5522, 13630, 10817]


In [11]:
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label

In [12]:
def encode_map_fn(text, label):
    # py_func doesn't set the shape of the returned tensors.
    encoded_text, label = tf.py_function(encode, 
                                         inp=[text, label], 
                                         Tout=(tf.int64, tf.int64))
    
    # `tf.data.Datasets` work best if all components have a shape set
    #  so set the shapes manually: 
    encoded_text.set_shape([None])
    label.set_shape([])
    
    return encoded_text, label


all_encoded_data = all_labeled_data.map(encode_map_fn)

## Split the dataset into test and train batches

In [13]:
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

In [14]:
sample_text, sample_labels = next(iter(test_data))

print(sample_text[0].numpy(), sample_labels[0].numpy())

[ 5822 15115  4030 10269 16067  9706 16836  5522 13630 10817     0     0
     0     0] 1


In [15]:
vocab_size += 1

## Build the model

In [16]:
model = tf.keras.Sequential()

In [17]:
model.add(tf.keras.layers.Embedding(vocab_size, 64))

In [18]:
#model.add(tf.keras.layers.Bidirectional(tf.keras.layers.RNN(tf.keras.layers.LSTMCell(64))))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [19]:
# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(3))

In [20]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

## Train the model

In [21]:
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f64986f2950>

In [22]:
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))

     79/Unknown - 1s 18ms/step - loss: 0.4198 - accuracy: 0.8358
Eval loss: 0.420, Eval accuracy: 0.836
