<a href="https://colab.research.google.com/github/Chubbyman2/Text_Generator/blob/master/Text_Generator_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# Using GRU model to generate text after training on a sample text
# Sample text used is Shakespeare's King Lear

In [33]:
import tensorflow as tf
import numpy as np
import random
import sys
import os 
import time

from keras.layers import Dense, GRU, Embedding
from keras.optimizers import Adam

In [34]:
# Mount Google Drive at /content/gdrive so files stored there can be read by the notebook
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [59]:
# Data preparation
# Location of the training corpus on the mounted Google Drive
text_file = "/content/gdrive/My Drive/king_lear.txt"

# Decode explicitly as UTF-8 so the result does not depend on the
# platform/locale default encoding
with open(text_file, "r", encoding="utf-8") as file:
  text = file.read()

# Sorted list of every distinct character in the corpus (the vocabulary)
chars = sorted(set(text))

In [77]:
# Vectorize the text
# Two lookup tables: a dict mapping char -> index and an array mapping index -> char
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = np.array(chars)

# Encode the whole corpus as an array of integer character ids
text_as_int = np.array([char_indices[ch] for ch in text])

In [61]:
# Max length of input sequence
max_len = 100
# Number of whole (max_len + 1)-character chunks in the corpus; floor division
# keeps this an integer count rather than a float
examples_per_epoch = len(text) // (max_len + 1)

# Create training examples/targets: a dataset that yields one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

Note: `drop_remainder` is an argument of `batch()` (a `tf.bool` scalar `tf.Tensor`) that controls whether the last batch is dropped when it has fewer than `batch_size` elements; the default is `False`.

In [62]:
# Group the character ids into chunks of max_len+1 elements;
# drop_remainder=True discards a final chunk that would be shorter than that
sequences = char_dataset.batch(max_len+1, drop_remainder=True)

In [63]:
# Build an (input, target) pair from a sequence: the target is the input shifted one step ahead
def split_input_target(chunk):
  """Return (chunk without its last element, chunk without its first element)."""
  return chunk[:-1], chunk[1:]

# map applies split_input_target to every element of `sequences`,
# turning each chunk into an (input_text, target_text) pair
dataset = sequences.map(split_input_target)

`BUFFER_SIZE` is the size of the buffer used to shuffle the dataset.

(TF data is designed to work with possibly infinite sequences, so it doesn't attempt to shuffle the entire sequence in memory. Instead, it maintains a buffer in which it shuffles elements).

In [64]:
BATCH_SIZE = 64

# Size of the shuffle buffer: elements are shuffled within a buffer of
# 10000 elements rather than across the whole dataset at once
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` instead of reassigning `dataset` from
# itself, so re-running this cell does not batch an already-batched dataset
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [65]:
# Vocabulary size: number of distinct characters found in the text
vocab_size = len(chars)

# Output dimension passed to the Embedding layer
embedding_dim = 256

In [66]:
# Build
# The layers are taken from tf.keras so they come from the same Keras
# implementation as the tf.keras.Sequential container that holds them
model = tf.keras.Sequential([
    # Maps each character id to an embedding_dim-sized vector; the batch size is
    # fixed up front because the GRU below is stateful
    tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[BATCH_SIZE, None]),
    tf.keras.layers.GRU(1024, return_sequences=True, stateful=True, recurrent_initializer="glorot_uniform"),
    # One output value (logit) per vocabulary character at every timestep
    tf.keras.layers.Dense(vocab_size)
])

In [67]:
# Sanity check: run a single batch from the dataset through the model
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)

  # Check output shape
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 87) # (batch_size, sequence_length, vocab_size)


In [68]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw (unnormalized) logits."""
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

# Evaluate the loss on the example batch: its targets against the predictions computed for it
example_batch_loss = loss(target_example_batch, example_batch_predictions)

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
# Average the loss values into a single number for display
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 87)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.4660974


In [69]:
# Compile
# The optimizer is selected by name ("Adam"); the loss is the `loss` function defined in this notebook
model.compile(optimizer="Adam", loss=loss)

In [70]:
# Configure checkpoints
# Directory where the checkpoints will be saved
checkpoint_dir = "./training_checkpoints"

# Name of checkpoint files; "{epoch}" is a placeholder in the file name
# (the saved files end up named like ckpt_10)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to fit(); save_weights_only=True saves just the weights, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

In [71]:
# Train
# The dataset is already batched, so batch_size is not passed to fit():
# Keras does not accept batch_size for tf.data.Dataset input
model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc824f3a8d0>

In [72]:
# Because of the way the RNN state is passed from timestep to timestep,
# the model only accepts a fixed batch size once built.
# To run with a different batch size, rebuild the model and restore the weights from the checkpoint.
# Display the path of the most recent checkpoint in checkpoint_dir
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [73]:
# Build again with batch size = 1
# NOTE: this rebinds the notebook-level BATCH_SIZE (previously the training batch size)
BATCH_SIZE = 1

# Same architecture as the training model so the checkpointed weights fit.
# The layers are taken from tf.keras so they come from the same Keras
# implementation as the tf.keras.Sequential container that holds them
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[BATCH_SIZE, None]),
    tf.keras.layers.GRU(1024, return_sequences=True, stateful=True, recurrent_initializer="glorot_uniform"),
    tf.keras.layers.Dense(vocab_size)
])

In [74]:
# Restore the weights from the most recent checkpoint in checkpoint_dir into model2
model2.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
# Build with an input shape of batch size 1 and unspecified sequence length
model2.build(tf.TensorShape([1, None]))

In [75]:
# Generating text using the learned model
def generate_text(model, start_string, num_generate=600, temperature=1.0):
  """Sample text character by character from a trained model.

  Args:
    model: stateful model that accepts a batch of one sequence of character
      ids and returns one row of logits per timestep.
    start_string: non-empty seed text; every character must be a key of
      `char_indices`.
    num_generate: number of characters to generate after the seed.
    temperature: divisor applied to the logits before sampling. Lower
      temperatures give more predictable text, higher temperatures give more
      surprising text.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
    KeyError: if `start_string` contains a character missing from `char_indices`.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Vectorizing string, then adding a leading batch dimension of 1
  input_eval = [char_indices[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  text_generated = []

  # Batch size = 1 here. Reset the recurrent state of the model that was
  # passed in, so state left by earlier calls does not leak into this one
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)

    # Remove batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Using a categorical distribution to predict the returned character;
    # [-1, 0] picks the sample drawn for the last timestep
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Pass predicted character along as next input to the model
    # along with previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)
    text_generated.append(indices_char[predicted_id])

  return start_string + "".join(text_generated)

In [79]:
# Generate a sample with the batch-size-1 model, seeded with "CORDELIA: ", and print it
print(generate_text(model2, start_string="CORDELIA: "))

CORDELIA: WOPLUMEB,
Glues theerut busty hester.
     Will, I him flo art bentorant ny fle may sendioht
     Ho theey lett in, all myof Henert?

             noter a chtallitn the istonnd abl.
     ho soard tleve likster hit livt,
     By light sir, of not earster]
  Cond. Her meast de freaglds.
  Kent. Nomy flerace; dearg.
  Lear. I ham uthor a hien,
     He rave't brene an mend men you
     Ippoor no'd ore hirithilf the the me oof
     Bind be thing;
     And the mede.- Gloonct, O_, Hall,
     Somericest, and grierunt with I
         Servee ly.
  Knmon. Whe neaver stells this th thas not doyes grert-
C
