# Purpose:
To create an RNN play generator. It takes a sequence of characters as a prompt and completes the sequence.

In [56]:
# Importing:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import numpy as np
from tensorflow.keras.callbacks import ModelCheckpoint
import os
from tensorflow.keras.optimizers import Adam

In [57]:
# Fetch the Shakespeare corpus; get_file returns the local path of the downloaded file.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [58]:
# Read the whole corpus into memory and decode it as UTF-8.
# The context manager closes the file handle promptly instead of leaking it.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [59]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
VOCABULARY = sorted(set(text))

# Lookup tables between characters and their integer ids.
char2idx = dict(zip(VOCABULARY, range(len(VOCABULARY))))  # character -> index
idx2char = np.array(VOCABULARY)  # index -> character


In [60]:
# Function to transform the text to an int array
def text_to_int(text):
    """Encode a string as a NumPy array of character ids.

    Every character is looked up in the module-level ``char2idx`` table, so a
    character that is not in the vocabulary raises ``KeyError``.
    """
    encoded = list(map(char2idx.__getitem__, text))
    return np.array(encoded)

# Encode the entire corpus once as an array of character ids.
text_as_int = text_to_int(text)

In [61]:
# Function to transform the int array to text
def int_to_text(array):
    """Decode a sequence of character ids back into a string.

    Args:
        array: Integer ids used to index the module-level ``idx2char`` array.
            Objects exposing a ``.numpy()`` method (e.g. eager TensorFlow
            tensors) are converted with it first; anything else is used as-is.

    Returns:
        The decoded characters joined into a single ``str``.
    """
    # Convert tensor-like inputs explicitly instead of wrapping the call in a
    # bare ``except: pass``, which would also swallow KeyboardInterrupt and
    # hide genuine conversion errors.
    to_numpy = getattr(array, 'numpy', None)
    if callable(to_numpy):
        array = to_numpy()
    return ''.join(idx2char[array])

In [62]:
# The corpus is far too long to feed to the network in one piece, so it is split into shorter fixed-length sequences.
sequence_len = 100 # Number of input characters in each training sequence
examples_per_epoch = len(text)//(sequence_len+1) # Chunks are sequence_len+1 long: the extra character is the last target

In [63]:
# Creating the examples/targets:
dataset = tf.data.Dataset.from_tensor_slices(text_as_int) # Dataset that yields the encoded corpus one character id at a time.

In [64]:
sequences = dataset.batch(sequence_len+1, drop_remainder=True)
# Each element of `sequences` is a chunk of sequence_len+1 consecutive character ids.
# drop_remainder=True: if the characters left at the end of the corpus do not fill a whole chunk, that short final chunk is discarded.

In [65]:
# Splitting each sequence of 101 characters into an input and a target:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. "Hello" -> ("Hell", "ello").
    """
    return chunk[:-1], chunk[1:]

# Map every chunk to an (input, target) pair; `dataset` is rebound to the mapped dataset.
dataset = sequences.map(split_input_target)

In [66]:
# Training hyperparameters
VOCAB_SIZE = len(VOCABULARY)  # Number of distinct characters
EMBDDING_DIM = 256  # Passed to build_model as the embedding dimension
RNN_UNITS = 1024  # Passed to build_model as the number of LSTM units
BATCH_SIZE = 64

BUFFER_SIZE = 10000  # Shuffle buffer size

# A batch is a group of sequences that are trained together in one training step.
data = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [67]:
# Building the model:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the network: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: Input vocabulary size of the embedding and number of
            units of the final Dense layer.
        embedding_dim: Output dimension of the embedding layer.
        rnn_units: Number of units of the LSTM layer.
        batch_size: Fixed batch size placed in the input shape
            ``[batch_size, None]``.

    Returns:
        The uncompiled ``tf.keras.Sequential`` model. The final Dense layer
        has no activation, so its outputs are raw scores (logits).
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

# Training model, built for batches of BATCH_SIZE sequences.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)

In [68]:
# Creating our own Loss Function:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    ``from_logits=True`` tells Keras that ``logits`` are unnormalised scores
    rather than probabilities; ``labels`` are the integer target ids.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [69]:
# Compile the model with an Adam optimizer and the custom loss.
# NOTE: despite its name, LEARNING_RATE holds the optimizer instance, not a number.
LEARNING_RATE = Adam(learning_rate=1e-3)
model.compile(loss=loss, optimizer=LEARNING_RATE)

In [70]:
# Checkpointing: configure the model to save checkpoints as it trains, so the
# weights can later be loaded from a checkpoint to continue training.

# Directory where the checkpoints will be saved
checkpoint_dir = './checkpoints'

# Path prefix (directory + base name) of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "seventhclass")

# Only the weights are saved, not the full model.
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [72]:
# Training
# Resume from the most recent checkpoint when one exists. On a first run
# tf.train.latest_checkpoint returns None, and passing None to load_weights
# fails, so in that case training starts from the freshly initialised weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
if latest_checkpoint is not None:
    model.load_weights(latest_checkpoint)
history = model.fit(data, epochs=50, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
  7/172 [>.............................] - ETA: 1:58 - loss: 1.4643

KeyboardInterrupt: 

In [None]:
# Rebuild the same architecture with a batch size of 1.
BATCH_SIZE = 1
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)

In [None]:
# Loading weights:
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of passing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; train the model first."
    )
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))  # Rebuild for input shape [1, None]

In [None]:
# Now we can use our model:)
def generate_text(model, start_string, num_generate=1, temperature=1.0):
    """Continue ``start_string`` by sampling characters from ``model``.

    Args:
        model: Model called as ``model(ids)`` on a ``[1, length]`` batch of
            character ids; its output is treated as logits over the vocabulary.
        start_string: Non-empty prompt. Every character must be a key of the
            module-level ``char2idx`` table, otherwise ``KeyError`` is raised.
        num_generate: Number of characters to sample. Defaults to 1, the
            value that used to be hard-coded.
        temperature: Positive divisor applied to the logits before sampling.
            The default 1.0 leaves them unchanged; lower values make the
            sampling more conservative, higher values more random.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the prompt and add a batch dimension: [length] -> [1, length].
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    text_generated = []

    # Clear state left in the stateful LSTM by earlier calls.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        # Remove the batch dimension: [1, length, vocab] -> [length, vocab].
        predictions = tf.squeeze(predictions, 0)
        if temperature != 1.0:
            predictions = predictions / temperature

        # Sample one id per position and keep the sample for the last position.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the new character back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)





In [None]:
# Testing: print the prompt 'sor' followed by the model's continuation.
print(generate_text(model, 'sor'))

sor,
