In [None]:
import tensorflow as tf

import pandas as pd
import numpy as np

In [None]:
# Remote directory and the names of the text files fetched from it.
DIRECTORY_URL = (
    'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
)
FILE_NAMES = [
    'cowper.txt',
    'derby.txt',
    'butler.txt',
]

def labeler(example, index):
  """Attach a label to one example.

  Args:
    example: the dataset element to label (passed through unchanged).
    index: the label value; it is cast to `tf.int64`.

  Returns:
    An `(example, label)` pair where `label` is `index` as an int64 tensor.
  """
  label = tf.cast(index, tf.int64)
  return example, label

# One labeled line-dataset per file; the label is the file's position in
# FILE_NAMES.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  # get_file fetches the file from `origin` and returns a local path to it.
  local_path = tf.keras.utils.get_file(
      file_name, origin=DIRECTORY_URL + file_name)
  # Each element of the line dataset is one line of the file; pair every
  # line with this file's index.
  labeled_data_sets.append(
      tf.data.TextLineDataset(local_path).map(lambda ex: labeler(ex, i)))

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


# Preparing the dataset

In [None]:
# Merge the per-file datasets into a single dataset of (text, label) pairs.
dataset = labeled_data_sets[0]

for labeled_dataset in labeled_data_sets[1:]:
  dataset = dataset.concatenate(labeled_dataset)

# Shuffle once and keep that order on every pass over the data. With the
# default (reshuffle_each_iteration=True) each iteration produces a different
# order, so a later take()/skip() train/test split would select different
# examples every epoch and held-out examples would leak into training.
dataset = dataset.shuffle(buffer_size=50000, reshuffle_each_iteration=False)

# Peek at a few (text, label) pairs.
for ex in dataset.take(5):
  print(ex[0].numpy(), ex[1].numpy())

b"Then rose, and loudly to the Greeks proclaim'd," 1
b'Of irremediable pain severe.' 0
b'But let me quickly go, that with mine eyes' 1
b'But in our own good arms, our safety lies."' 1
b'As soon as he had spoken thus, Neptune and Minerva came up to him in' 2


In [None]:
import tensorflow_datasets as tfds

# Tokenizer used to split each line of text into tokens.
tokenizer = tfds.features.text.Tokenizer()

# Collect every distinct token that appears anywhere in the dataset.
vocabulary_set = {
    token
    for text_tensor, _ in dataset
    for token in tokenizer.tokenize(text_tensor.numpy())
}

# Number of distinct tokens; displayed as the cell's output.
vocab_size = len(vocabulary_set)
vocab_size

17178

In [None]:
# Encode a single example

# Grab the text of the first labeled example
first_example = next(iter(dataset))
original_text = first_example[0].numpy()

# Create a text encoder with a fixed vocabulary set
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

# Turn that text into a list of integer token ids
encoded_text = encoder.encode(original_text)



In [None]:
original_text

b'"Give me thy promise sure, thy gracious nod,'

In [None]:
encoded_text

[10750, 7591, 1675, 14361, 1024, 1675, 288, 2932]

In [None]:
# Encoding all the examples
def encode(text_tensor, label):
  """Encode one example's text with the module-level `encoder`.

  Args:
    text_tensor: eager tensor holding the text; `.numpy()` is called on it.
    label: the example's label, returned unchanged.

  Returns:
    `(token_ids, label)` where `token_ids` is the result of
    `encoder.encode(...)` on the text.
  """
  token_ids = encoder.encode(text_tensor.numpy())
  return token_ids, label

def encode_map_fn(text, label):
  """Wrap `encode` in `tf.py_function` so it can be used in `Dataset.map`.

  Args:
    text: the text component of one dataset element.
    label: the label component of the same element.

  Returns:
    `(encoded_text, label)` as int64 tensors with their static shapes set:
    a variable-length vector of token ids and a scalar label.
  """
  encoded_text, label = tf.py_function(encode, inp=[text, label],
                                       Tout=(tf.int64, tf.int64))

  # tf.py_function returns tensors without static shape information. Set the
  # shapes explicitly so downstream consumers (padded batching, Keras) can
  # infer them.
  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label

all_encoded_data = dataset.map(encode_map_fn)

In [None]:
# Prepare the dataset

BUFFER_SIZE = 50000  # shuffle buffer for the training split
BATCH_SIZE = 64      # examples per padded batch
TAKE_SIZE = 5000     # number of examples held out for testing

# Split and batch the *encoded* dataset. The raw `dataset` holds scalar
# strings, which cannot be padded to shape [-1] and raise a ValueError.
# NOTE(review): take()/skip() only give a stable split if the upstream
# dataset yields the same order on every iteration -- confirm.
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([-1], []))

# Batch the held-out split itself (not the full dataset) so the test set
# really contains only the first TAKE_SIZE examples.
test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([-1], []))

ValueError: ignored

In [None]:
# Training the model

model = tf.keras.Sequential([
  # The encoder's token ids start at 1 and padded batches use 0 as the
  # padding id, so the embedding table needs vocab_size + 1 rows; with only
  # vocab_size rows the largest token id would be out of range.
  tf.keras.layers.Embedding(vocab_size + 1, 64),
  # Reads the embedded sequence in both directions and emits one vector.
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
  # Two fully connected ReLU layers.
  tf.keras.Sequential([
    tf.keras.layers.Dense(units, activation='relu') for units in [64, 64]
  ]),
  # One output per class (labels are 0, 1, 2).
  tf.keras.layers.Dense(3, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_data, epochs=3)