In [1]:
import tensorflow as tf
import numpy as np

### Generating Shakespearean Text Using a Character RNN

In [2]:
url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt",url)
with open(filepath) as fp:
    shakes_text = fp.read()

In [3]:
shakes_text[:100] # first 100 chars

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [31]:
# encode every character as integer 
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakes_text)

In [32]:
# testing tokenizer
x = tokenizer.texts_to_sequences(['First'])
print(x)
x = tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])
print(x)

[[20, 6, 9, 8, 3]]
['f i r s t']


In [36]:
# number of distict chars
max_id = len(tokenizer.word_index)
print('max_id:',max_id)
# tokenizer.word_index
dataset_size = tokenizer.document_count

max_id: 39


In [37]:
# tokenizer.document_count
tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': True,
 'oov_token': None,
 'document_count': 1115394,
 'word_counts': '{"f": 17567, "i": 57369, "r": 53758, "s": 54219, "t": 74024, " ": 169892, "c": 19443, "z": 554, "e": 100652, "n": 53608, ":": 10316, "\\n": 40000, "b": 14082, "o": 71279, "w": 21115, "p": 12449, "d": 33447, "a": 63326, "y": 22166, "u": 29897, "h": 54378, ",": 19846, "m": 25083, "k": 8672, ".": 7885, "l": 37215, "v": 8591, "?": 2462, "\'": 6187, "g": 15755, ";": 3628, "!": 2172, "j": 948, "-": 1897, "q": 840, "x": 641, "&": 3, "3": 27, "$": 1}',
 'word_docs': '{"f": 17567, "i": 57369, "r": 53758, "s": 54219, "t": 74024, " ": 169892, "c": 19443, "z": 554, "e": 100652, "n": 53608, ":": 10316, "\\n": 40000, "b": 14082, "o": 71279, "w": 21115, "p": 12449, "d": 33447, "a": 63326, "y": 22166, "u": 29897, "h": 54378, ",": 19846, "m": 25083, "k": 8672, ".": 7885, "l": 37215, "v": 8591, "?": 2462, "\'": 6187, 

In [46]:
# actual ids are from 1 to 39 , by substracting -1 we will get in between 0 to 1
[encoded] = np.array(tokenizer.texts_to_sequences([shakes_text])) - 1

- spliting dataset into train and test set  as it is sequential data we can not shuffle it and make sure to not have overlapping content between different sets.

In [47]:
train_size = total_chars*90 // 100
print("train size: ",train_size)
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

train size:  1003854


##### Chopping the Sequential dataset into multiple windows
- if we train our current 1 million inputs (chars ) to go through RNN ,it will iterate 1M time which is equivalent to 1M parameters.
- so we use datasets Window() method to convert this long sentence of characters into smaller windows of text.
- Every instance in the dataset will be fairly short substring of the whole text, and RNN will be unrolled only over the length of these substrings, this is called truncated backpropagation through time.

In [48]:
n_steps = 100
window_length =  n_steps + 1
dataset = dataset.window(window_length,shift=1,drop_remainder=True)

In [49]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset

<FlatMapDataset shapes: (None,), types: tf.int64>

In [50]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
print(dataset)
dataset = dataset.map(lambda windows: (windows[:,:-1],windows[:,1:]))
print(dataset)
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
print(dataset)
dataset = dataset.prefetch(1)

<BatchDataset shapes: (None, None), types: tf.int64>
<MapDataset shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>
<MapDataset shapes: ((None, None, 39), (None, None)), types: (tf.float32, tf.int64)>


In [51]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


In [54]:
# model = tf.keras.models.Sequential([
#     tf.keras.layers.GRU(128,return_sequences=True,input_shape=[None,max_id],dropout=0.2,recurrent_dropout=0.2),
#     tf.keras.layers.GRU(128,return_sequences=True,dropout=0.2,recurrent_dropout=0.2),
#     tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id,activation="softmax"))
# ])
# model.compile(loss="sparse_categorical_crossentropy",optimizer="adam")
# history = model.fit(dataset,epochs=20)

# takes too much time -> to train but code works

In [55]:
def preprocess_text(text):
    x = np.array(tokenizer.texts_to_sequences(text))
    print(x)
    return tf.one_hot(x,max_id)

In [58]:
preprocess_text(["how are you?"])[0][0]

[[ 7  4 17  1  5  9  2  1 16  4 14 30]]


<tf.Tensor: shape=(39,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.], dtype=float32)>

In [59]:
X_new = preprocess(["How are yo"])
#Y_pred = model.predict_classes(X_new)
# Y_pred = np.argmax(model(X_new)a, axis=-1)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char