In [None]:
# RNN PLAY GENERATOR

In [None]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [None]:
# Dataset

# Fetch the Shakespeare corpus; the returned value is the local path that is
# opened in the next cell.
path_to_file = tf.keras.utils.get_file(
    fname="shakespeare.txt",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
# Read contents of file
# Open in binary mode and decode explicitly as UTF-8; the context manager
# closes the file handle as soon as the bytes have been read.
with open(path_to_file, "rb") as corpus_file:
  text = corpus_file.read().decode(encoding="utf-8")

# length of text is the number of characters in it
print(f"Length of the text: {len(text)} characters")

Length of the text: 1115394 characters


In [None]:
# Encoding 

# Unique characters of the corpus, in sorted order
vocab = sorted(set(text))

# Lookup tables: character -> index (dict) and index -> character (array)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

def text_to_int(text):
  """Encode a string as a NumPy array of vocabulary indices.

  Each character is looked up in ``char2idx``; a character that is not a key
  of that mapping raises ``KeyError``.

  The dtype is pinned to NumPy's default integer type so that an empty string
  yields an empty *integer* array. Without it, ``np.array([])`` defaults to
  float64, which cannot be used to index ``idx2char``.
  """
  return np.array([char2idx[c] for c in text], dtype=np.int_)

text_as_int = text_to_int(text)

In [None]:
# Show a short sample of the corpus next to its integer encoding
print(
    f"Text: {text[:13]}",
    f"Encoded: {text_to_int(text[:13])}",
    sep="\n",
)

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [None]:
# Decoding

def int_to_text(ints):
  """Decode vocabulary indices back into a string.

  Args:
    ints: Indices into ``idx2char``. An object exposing a ``.numpy()`` method
      is converted through it first; anything else is used as-is.

  Returns:
    The characters selected from ``idx2char``, joined into one string.
  """
  try:
    ints = ints.numpy()
  except AttributeError:
    # No .numpy() method (e.g. a list or NumPy array): index with it directly.
    pass
  return "".join(idx2char[ints])

# Round-trip check: decode the first 13 encoded characters of the corpus
print(int_to_text(text_as_int[:13]))

<function int_to_text at 0x7fd668281b90>


In [None]:
# Creating Training Examples

# Length of the input sequence of one training example
seq_length = 100
# Number of whole chunks of seq_length + 1 characters that fit in the text
examples_per_epoch = len(text) // (seq_length + 1)

# Build a dataset from the encoded text; examples and targets are derived
# from it in the following cells
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [None]:
# Group the character stream into chunks of seq_length + 1 elements; a final
# chunk shorter than that is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

In [None]:
# Now we need to use these sequences of length 101 and split them into input and output

def split_input_target(chunk):
  """Split a chunk into an (input, target) pair shifted by one position.

  The input is the chunk without its last element and the target is the chunk
  without its first element, e.g. "hello" -> ("hell", "ello").
  """
  return chunk[:-1], chunk[1:]

# map applies split_input_target to every entry of `sequences`
dataset = sequences.map(map_func=split_input_target)

In [None]:
# Model / batching hyperparameters
BATCH_SIZE = 64
EMBEDDING_DIM = 256
RNN_UNITS = 1024
VOCAB_SIZE = len(vocab)  # one entry per unique character

# Shuffle the (input, target) pairs, then group them into full batches only
BUFFER_SIZE = 10000
data = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Building the Model

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> LSTM -> Dense sequential model.

  Args:
    vocab_size: Input dimension of the embedding layer and number of units of
      the final dense layer.
    embedding_dim: Output dimension of the embedding layer.
    rnn_units: Number of units of the LSTM layer.
    batch_size: Fixed batch dimension of the model input; the sequence
      dimension is left as None.

  Returns:
    The constructed tf.keras.Sequential model.
  """
  embedding_layer = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  # return_sequences=True keeps one output per timestep; stateful=True is why
  # the batch dimension above has to be fixed.
  recurrent_layer = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer="glorot_uniform")
  output_layer = tf.keras.layers.Dense(vocab_size)

  return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

# Instantiate the network with the notebook-level hyperparameters and show
# its layer/parameter overview
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Creating a Loss Function

# Take a single batch from the training data and run it through the model
for input_example_batch, target_example_batch in data.take(1):
  # forward pass on the first batch of training data
  example_batch_predictions = model(input_example_batch)
  print(
      example_batch_predictions.shape,
      "# (batch_size, sequence_length, vocab_size)",
  )

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [None]:
# we can see that the prediction is an array of 64 arrays, one for each entry in the batch

# Number of entries along the first (batch) axis, followed by the predictions
print(len(example_batch_predictions), example_batch_predictions, sep="\n")

64
tf.Tensor(
[[[ 1.68707629e-03 -4.39497089e-04  6.22705091e-04 ...  3.02484701e-03
   -1.07179629e-04 -4.10024868e-03]
  [ 1.40250265e-03  1.01897139e-02  8.32321239e-04 ...  2.30203639e-03
    9.60105751e-03 -4.46917303e-03]
  [ 3.49687366e-03  6.43038237e-03  7.34150875e-04 ...  3.63300019e-03
    5.53371944e-03 -6.93855714e-03]
  ...
  [ 1.21138757e-03  2.30991165e-03  4.17162990e-03 ...  2.19183811e-03
    5.91041474e-03 -1.20095140e-03]
  [-2.92863976e-03  2.26190034e-03 -1.81391346e-03 ...  3.15520866e-03
    9.62767377e-03  3.55913769e-03]
  [-4.35876101e-03 -1.45367091e-03 -2.32692971e-03 ...  1.82141759e-03
    7.47173605e-03  2.76342803e-03]]

 [[-4.22929227e-03  1.83009403e-03 -5.54334000e-03 ...  2.25832616e-03
    5.54600125e-03  3.87816736e-03]
  [-5.74872829e-03 -6.75533491e-04 -5.53077273e-03 ...  1.70337711e-03
    4.85257758e-03  2.43483810e-03]
  [-8.01402051e-03  2.02579284e-03 -4.57352377e-04 ...  1.60709338e-03
   -6.21876097e-04 -2.91481614e-03]
  ...
  [-1.002

In [None]:
# Let's examine one prediction

# Predictions for the first entry of the batch: its length, then its values
pred = example_batch_predictions[0]
print(len(pred), pred, sep="\n")

100
tf.Tensor(
[[ 0.00168708 -0.0004395   0.00062271 ...  0.00302485 -0.00010718
  -0.00410025]
 [ 0.0014025   0.01018971  0.00083232 ...  0.00230204  0.00960106
  -0.00446917]
 [ 0.00349687  0.00643038  0.00073415 ...  0.003633    0.00553372
  -0.00693856]
 ...
 [ 0.00121139  0.00230991  0.00417163 ...  0.00219184  0.00591041
  -0.00120095]
 [-0.00292864  0.0022619  -0.00181391 ...  0.00315521  0.00962767
   0.00355914]
 [-0.00435876 -0.00145367 -0.00232693 ...  0.00182142  0.00747174
   0.00276343]], shape=(100, 65), dtype=float32)


In [None]:
# and finally we'll look at the prediction at the first timestep

# Prediction at the first timestep: its length, then its values
time_pred = pred[0]
print(len(time_pred), time_pred, sep="\n")

65
tf.Tensor(
[ 1.6870763e-03 -4.3949709e-04  6.2270509e-04 -1.4639589e-03
  1.1209324e-03  3.0204835e-03 -3.4940161e-03 -3.2408275e-03
  1.8847206e-03  1.3603381e-03 -2.6363223e-03 -4.8786784e-03
 -9.3130354e-04  2.4073839e-03  5.1260635e-04  8.6727471e-04
  4.0089018e-03  1.1922903e-03  2.5375793e-03  1.1992742e-03
 -7.3924416e-04 -2.5295001e-04  5.4805254e-04  4.5940219e-03
 -2.3432996e-03 -4.5648811e-04 -2.3401841e-03 -4.8164260e-03
 -1.3170037e-03 -5.8339495e-04  4.5012485e-04  7.3392433e-04
 -2.3255521e-03 -2.2350126e-03  1.0035140e-03 -6.9497677e-04
  6.9384824e-04 -6.7728050e-03  3.7682983e-03 -8.2094339e-04
 -2.6920610e-03 -4.5376306e-05 -3.3760723e-03  1.6383847e-03
 -1.1885218e-04 -5.8186394e-03 -1.4856004e-03  4.5749708e-03
  1.6207020e-03  2.5158632e-04  2.8162128e-03 -5.7365559e-03
  3.7462462e-03 -4.2432616e-04  1.4762126e-03  6.1706835e-03
 -3.2348363e-03  5.8684597e-04  1.7441658e-03  6.5805390e-04
  1.2503233e-03  1.9387936e-03  3.0248470e-03 -1.0717963e-04
 -4.100248