# TensorFlow 2.0 alpha - Load Text with tf.data
### Create a Dataset from a Text File - Each example is a line from the original text

In [1]:
from __future__ import absolute_import, division, unicode_literals, print_function

import tensorflow as tf
import tensorflow_datasets as tfds
import os

  from ._conv import register_converters as _register_converters


## Data - 3 English translations of Homer's *Iliad*
### Texts by
* William Cowper, Edward, Earl of Derby, and Samuel Butler
* Text files have undergone preprocessing - header, footer, line numbers, and chapter titles removed

In [3]:
# Remote folder holding the three translations, and the file name of each one.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Fetch every file; get_file returns the local path of each download.
downloaded_paths = [
    tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)
    for name in FILE_NAMES
]

# The directory is derived from the last downloaded file's path.
text_dir = downloaded_paths[-1]
parent_dir = os.path.dirname(text_dir)
parent_dir

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


'/Users/MPHA/.keras/datasets'

## Load Text into Datasets
#### Iterate through the files - Load each into its own dataset
#### Each Example must be Labeled Individually - use tf.data.Dataset.map - for labeler function

In [4]:
def labeler(example, index):
    """Attach a label to one example.

    Args:
        example: the example to label (here, one line of text).
        index: the label value; it is cast to tf.int64.

    Returns:
        An (example, label) pair.
    """
    label = tf.cast(index, tf.int64)
    return example, label

# One labeled dataset per source file; the label is the file's position
# in FILE_NAMES.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    file_path = os.path.join(parent_dir, file_name)
    lines_dataset = tf.data.TextLineDataset(file_path)
    # Tag every line of this file with the file's index.
    labeled_dataset = lines_dataset.map(lambda line: labeler(line, i))
    labeled_data_sets += [labeled_dataset]

#### Combine the Labeled Datasets - to a single Dataset - shuffle it

In [5]:
# Buffer size passed to Dataset.shuffle.
BUFFER_SIZE = 50000
# Number of examples per padded batch.
BATCH_SIZE = 64
# Number of examples taken for the test split; the remainder is used for training.
TAKE_SIZE = 5000

In [6]:
# Chain the per-file datasets end to end, starting from the first one.
combined = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    combined = combined.concatenate(labeled_dataset)

# Shuffle once; reshuffle_each_iteration=False keeps the order fixed across
# iterations, which the later take/skip split depends on.
all_labeled_data = combined.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

#### Use tf.data.Dataset.take - and print - view what the (example, label) pairs look like

In [7]:
# Peek at the first five (example, label) pairs.
preview = all_labeled_data.take(5)
for ex in preview:
    print(ex)

(<tf.Tensor: id=49, shape=(), dtype=string, numpy=b'took his place among the foremost fighters, three times more fierce'>, <tf.Tensor: id=50, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: id=53, shape=(), dtype=string, numpy=b'Of this dread Chief, alone, lest premature'>, <tf.Tensor: id=54, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=57, shape=(), dtype=string, numpy=b"A yearling heifer, broad of brow, untam'd,">, <tf.Tensor: id=58, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=61, shape=(), dtype=string, numpy=b'and give them their feeds of corn; then make speed to bring sheep and'>, <tf.Tensor: id=62, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: id=65, shape=(), dtype=string, numpy=b'But of the Grecians with the Gods themselves.'>, <tf.Tensor: id=66, shape=(), dtype=int64, numpy=0>)


## Encode Text Lines as Numbers
#### Machine learning models work with numbers, not words - convert each string value to a list of numbers
### Build Vocabulary - Tokenize text
* Iterate over each example's numpy value
* Use tfds.features.text.Tokenizer - split it into tokens
* Collect tokens into a python set - remove duplicates
* Get vocabulary Size, for future use

In [8]:
tokenizer = tfds.features.text.Tokenizer()

# Tokenize every line and keep each distinct token once.
vocabulary_set = {
    token
    for text_tensor, _ in all_labeled_data
    for token in tokenizer.tokenize(text_tensor.numpy())
}

# Number of distinct tokens seen in the data.
vocab_size = len(vocabulary_set)
vocab_size

17178

## Encode Examples
#### Pass vocabulary_set - to tfds.features.text.TokenTextEncoder - its encode method turns a text string into a list of integers

In [9]:
# Build the encoder from the collected vocabulary; its encode() method is
# used in the following cells to turn text into a list of integers.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

#### Try a single line - check the output

In [10]:
# Pull the first (text, label) pair and keep only the text as raw bytes.
first_example = next(iter(all_labeled_data))
example_text = first_example[0].numpy()
print(example_text)

b'took his place among the foremost fighters, three times more fierce'


In [11]:
# Encode the sample line with the vocabulary-based encoder and show the result.
encoded_example = encoder.encode(example_text)
print(encoded_example)

[1199, 15933, 13676, 10850, 7682, 16341, 2576, 11578, 10749, 12307, 9816]


#### Run Encoder on dataset - wrapping it with tf.py_function - passing it to dataset's map

In [12]:
def encode(text_tensor, label):
    """Encode one text tensor with the notebook-level `encoder`.

    Calls `.numpy()` on the tensor, so it must run eagerly (it is invoked
    through tf.py_function).

    Returns:
        (encoded_text, label), with the label passed through untouched.
    """
    raw_text = text_tensor.numpy()
    return encoder.encode(raw_text), label

def encode_map_fn(text, label):
    """Dataset.map wrapper that runs `encode` through tf.py_function.

    Args:
        text: scalar string tensor holding one line of text.
        label: scalar int64 tensor holding that line's label.

    Returns:
        (encoded_text, label): int64 tensors with static shapes set.
    """
    encoded_text, label = tf.py_function(
        encode, inp=[text, label], Tout=(tf.int64, tf.int64))

    # py_function does not set the static shape of the tensors it returns.
    # Declare them here (variable-length id vector, scalar label) so that
    # padded_batch and Keras see a known rank.
    encoded_text.set_shape([None])
    label.set_shape([])

    return encoded_text, label

all_encoded_data = all_labeled_data.map(encode_map_fn)

## Split Dataset - into Test and Training Batches
#### Use tf.data.Dataset.take, and tf.data.Dataset.skip - create small test, larger training set
### Use Padded Batch 
* examples Not the same size
* typically Batch examples are the Same Size - hence the need for Padded
* Pads each example to be the same size

In [13]:
# Test split: the first TAKE_SIZE examples, padded per batch to a common length.
test_data = (
    all_encoded_data
    .take(TAKE_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1], []))
)

# Training split: everything after the first TAKE_SIZE examples, shuffled and
# then padded per batch.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=([-1], []))
)

#### test_data, train_data - collections of batches now - no longer collections of (example,label) pairs
* Each batch is a pair - (many examples, many labels) - represented as arrays

In [14]:
# Grab the first test batch and show its first (padded example, label) pair.
first_batch = next(iter(test_data))
sample_text, sample_labels = first_batch

(sample_text[0], sample_labels[0])

(<tf.Tensor: id=149212, shape=(16,), dtype=int64, numpy=
 array([ 1199, 15933, 13676, 10850,  7682, 16341,  2576, 11578, 10749,
        12307,  9816,     0,     0,     0,     0,     0])>,
 <tf.Tensor: id=149216, shape=(), dtype=int64, numpy=2>)

#### After introducing a new token encoding (the zero used for padding) - Vocabulary Size has Increased by 1

In [15]:
# Derive the size from the vocabulary itself rather than incrementing in
# place, so that re-running this cell does not keep growing vocab_size.
# The extra 1 accounts for the 0 id used for padding.
vocab_size = len(vocabulary_set) + 1

## Build Model

In [16]:
# Start from an empty Sequential model; layers are appended one at a time
# in the following cells.
model = tf.keras.Sequential()

### First Layer - Embedding
* converts integer representations - to dense vector embeddings

In [17]:
# Embedding over vocab_size ids (padding id included), 64 dimensions per token.
embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64)
model.add(embedding_layer)

### Next Layer - Long Short-Term Memory
* allows model to understand words - in their context with other words
* Bidirectional wrapper on the LSTM - learns datapoints, in relation to datapoints before/after it

In [18]:
# A 64-unit LSTM wrapped so the sequence is read in both directions.
lstm_layer = tf.keras.layers.LSTM(64)
model.add(tf.keras.layers.Bidirectional(lstm_layer))

### Next - series of Densely Connected Layers (Last being Output)
* Output Layer - produces probability for all labels - highest being model's prediction

In [19]:
# One or more Dense layers
# edit the list in the 'for' line - to experiment with layer Sizes

for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer: one unit per label. Labels are the file indices assigned when
# the per-file datasets were built, so the width follows FILE_NAMES instead
# of a hard-coded 3.
model.add(tf.keras.layers.Dense(len(FILE_NAMES), activation='softmax'))

### Compile Model
* for softmax categorization - use sparse_categorical_crossentropy - as loss function
* Adam - is a very common Optimizer

In [20]:
# Training configuration: optimizer, loss and the metric to report.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

## Train Model

In [21]:
# Keep the returned History object so the recorded training results remain
# available after training, instead of only echoing its repr as cell output.
history = model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0xb2ecd2cc0>

In [22]:
# Score the trained model on the held-out batches and report both numbers.
evaluation = model.evaluate(test_data)
eval_loss, eval_acc = evaluation

report_template = '\nEval loss: {}, Eval accuracy: {}'
print(report_template.format(eval_loss, eval_acc))

     79/Unknown - 3s 41ms/step - loss: 0.4178 - accuracy: 0.8390
Eval loss: 0.4178068392638919, Eval accuracy: 0.8389999866485596


### At this point, the model produces results on this data of about 84% accuracy