# Generating Shakespearean Text

## Creating the Training Dataset

In [61]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [62]:
url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", url,
                               cache_dir = '.')

In [63]:
with open(filepath) as f:
    text = f.read()

We have now downloaded the dataset. Now we need to encode every character as an integer. To do this we can use Keras Tokenizer class which maps every character in the text to a unique character id from 1 to the number of distinct characters.

In [64]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts([text])

Here set char_level = True to get character level encoding instead of the default word-level encoding.

In [65]:
tokenizer.texts_to_sequences(['last'])

[[12, 5, 8, 3]]

In [66]:
tokenizer.sequences_to_texts([[12, 5, 8, 3]])

['l a s t']

In [67]:
distinct_ids = len(tokenizer.word_index) # number of distinct characters
distinct_ids

39

In [68]:
[encoded] = np.array(tokenizer.texts_to_sequences([text])) - 1

In [69]:
len(encoded) # total number of characters

1115394

## How to Split a Sequential Dataset

It is very important to avoid any overlap between the training set, the validation set, and the test set. For example we use the first 90% for training the next 5% for validation and the final 5% for the test set.

In [70]:
train_size = len(encoded) * 90 // 100

In [71]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

## Chopping the Sequential Dataset into Multiple Windows

In [72]:
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(window_length, shift=1, drop_remainder=True) # creates a datset of windows and each window is a dataset

In [73]:
dataset = dataset.flat_map(lambda window: window.batch(window_length)) # creates a dataset of tensors of length window_length.
                                                                       # This essentially flattens the nested dataset above.

In [74]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:])) # separate inputs from target

In [75]:
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=distinct_ids), Y_batch))

In [76]:
dataset = dataset.prefetch(1)

# Building and Training the RNN Model

Since we want to predict the next character based on the previous 100 characters, we can use an RNN with GRU layers of 128 units each. The output is a time distributed dense layer with 39 units for each character and a softmax activation function

In [77]:
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, distinct_ids]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(distinct_ids, activation='softmax'))
    
    
])

In [78]:
model.compile(loss="sparse_categorical_crossentropy", optimizer='adam')

In [79]:
history = model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [105]:
def preprocess(texts):
 X = np.array(tokenizer.texts_to_sequences(texts)) - 1
 return tf.one_hot(X, distinct_ids)

In [114]:
X_new = preprocess(["how are yo"])
Y_pred = np.argmax(model.predict(X_new), axis=1)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]



'n'

In [104]:
Y_pred[0]

array([6, 5, 7, 8, 0, 0, 7, 2, 5, 4, 6, 4, 3, 9, 5, 7, 1, 2, 4, 3, 7, 3,
       4, 2, 7, 2, 2, 6, 2, 6, 2, 0, 7, 3, 4, 1, 9, 9, 9], dtype=int64)