<a href="https://colab.research.google.com/github/Ajay-user/ML-DL-RL-repo/blob/master/Natural%20Language%20Processing/Text_generation_with_an_RNN_101_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Given a sequence of characters from this data ("Shakespear"), train a model to predict the next character in the sequence


This notebook is created using tensorflow resources. For more information please use this link [TENSORFLOW](https://www.tensorflow.org/text/tutorials/text_generation#train_the_model)

In [1]:
import tensorflow as tf
import numpy as np
import os
import time

## Load the data

In [2]:
url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
data_dir = tf.keras.utils.get_file(origin=url)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


### Read few lines of data

In [3]:
text = open(data_dir,mode='rb').read().decode(encoding='utf-8')

# total number of unique characters in the text file
vocab = set(text)
print(f'lenght of text file:{len(text)}')
print('total number of unique characters in the text file :',len(vocab))


print('\nLets read first 250 lines from the file \n','-'*100)
print(text[:250])

lenght of text file:1115394
total number of unique characters in the text file : 65

Lets read first 250 lines from the file 
 ----------------------------------------------------------------------------------------------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



## Utilities for text processing

In [4]:
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab))
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True)
text_from_ids = lambda ids: tf.strings.reduce_join(chars_from_ids(ids), axis=-1)
split_input_target = lambda seq: (seq[:-1],seq[1:])

## Create training examples and targets

In [5]:
all_chars = tf.strings.unicode_split(text,'UTF-8')
all_ids = ids_from_chars(all_chars)
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [6]:
for id in ids_dataset.take(5):
  print(f'id:{id.numpy()}, char:{chars_from_ids(id.numpy())}')

id:21, char:b'F'
id:54, char:b'i'
id:44, char:b'r'
id:30, char:b's'
id:26, char:b't'


### Creating text sequences 

In [7]:
from tensorflow.python.ops.array_ops import sequence_mask
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [8]:
# lets check the sequences we create
for seq in sequences.take(1):
  print('IDs','-'*100,'\n',seq.numpy())
  print('Characters','-'*100,'\n',chars_from_ids(seq.numpy()).numpy())
  print('String','-'*100,'\n',tf.strings.join(chars_from_ids(seq.numpy())).numpy().decode('utf-8'))

IDs ---------------------------------------------------------------------------------------------------- 
 [21 54 44 30 26 43 12 54 26 54 17 52 23 33 58 60 52 63 31 44 52 43 51 52
 43 24 44 31 53 52 52 10 43 55 23 49 43 63 47 44 26 20 52 44  7 43 20 52
 55 44 43 37 52 43 30 24 52 55 57  4 58 58 46 45 45 33 58 14 24 52 55 57
  7 43 30 24 52 55 57  4 58 58 21 54 44 30 26 43 12 54 26 54 17 52 23 33
 58 13 31 47 43]
Characters ---------------------------------------------------------------------------------------------------- 
 [b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'

In [9]:
# We have a sequence of 100 characters 
# now we've to create inputs and labels 
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [10]:
dataset = sequences.map(split_input_target, num_parallel_calls=tf.data.AUTOTUNE)

In [11]:
for input_seq, target_seq in dataset.take(1):
  print('Input sequence\n',text_from_ids(input_seq))
  print('Output sequence\n',text_from_ids(target_seq))

Input sequence
 tf.Tensor(b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou', shape=(), dtype=string)
Output sequence
 tf.Tensor(b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou ', shape=(), dtype=string)


### Create training batches
shuffle the data and pack it into batches.

In [12]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. 
# Instead, it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

In [13]:
dataset = dataset.shuffle(buffer_size=BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

## Model Building

In [14]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [15]:
class TextGen(tf.keras.Model):
  def __init__(self, vocab_size, embedding_size, rnn_units):
    super().__init__(self)
    self.embeddings = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size)
    self.gru = tf.keras.layers.GRU(units=rnn_units, return_sequences=True, return_state=True)
    self.dense = tf.keras.layers.Dense(units=vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    # Embedding 
    x = self.embeddings(inputs, training=training)

    # Recurrent Network
    if states == None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)

    # classifier
    logits = self.dense(x, training=training)

    if return_state:
      return logits, states
    else:
      return logits


In [16]:
# Instantiate the model
# Be sure the vocabulary size matches the `StringLookup` layers

my_model = TextGen(vocab_size=ids_from_chars.vocabulary_size(), embedding_size=embedding_dim, rnn_units=rnn_units)

In [17]:
# lets see our model in action

# Input to the model is sequences of length 100
# model takes the input sequence and outputs the logits 
# output shape is [ batch, sequence length , vocabulary lenght ]
for seq_in, seq_out in dataset.take(1):
  model_out = my_model(seq_in)

In [18]:
print('Shape of input passed into the model',seq_in.shape)
print('Shape of output given by the model',model_out.shape)

Shape of input passed into the model (64, 100)
Shape of output given by the model (64, 100, 66)


In [19]:
my_model.summary()

Model: "text_gen"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  67650     
                                                                 
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


To get actual predictions from the model you need to sample from the output distribution, to get actual character indices. This distribution is defined by the logits over the character vocabulary.

In [20]:
print('Model input')
print(text_from_ids(seq_in).numpy()[0])
print('Model output')
print(text_from_ids(tf.argmax(tf.nn.softmax(model_out), axis=-1)).numpy()[0])

Model input
b":\nCare keeps his watch in every old man's eye,\nAnd where care lodges, sleep will never lie;\nBut wher"
Model output
b'OMNDDqBvqqLKKKLXKhh3,cK kWokq;CCyJEKHD EpKqCqq$XoL-h.LHLBE?hqBgJmgggoKKKqqKKh  EKkokq;HE omGg!3Kh.oH'


Note: It is important to sample from this distribution as taking the argmax of the distribution can easily get the model stuck in a loop.

In [21]:
sampled_indices = tf.random.categorical(logits=model_out[0], num_samples=1, dtype=tf.int32)
sampled_indices = tf.squeeze(sampled_indices)

In [22]:
sampled_indices

<tf.Tensor: shape=(100,), dtype=int32, numpy=
array([51, 23, 48, 11, 14, 13,  7, 50, 15,  6, 26,  2, 60,  4, 29, 40,  7,
       32, 32, 33, 11, 42,  5, 33, 61, 53, 26, 28, 10, 57, 14,  5,  3, 15,
       36, 49, 20, 14, 20, 10, 47,  8, 18, 31, 61, 32,  8, 28, 49, 29, 43,
       31, 36, 54, 30, 62, 12, 60, 30, 14, 65, 20, 13, 51, 20, 45,  2, 41,
       42, 17, 22, 36, 53, 12, 21, 53, 43, 23, 37, 31, 62, 52, 16, 16, 53,
        0, 30, 31,  8, 19, 17,  3, 41, 63, 53, 58, 22, 39, 15, 58],
      dtype=int32)>

In [23]:
print(text_from_ids(sampled_indices).numpy())

b"wnZ$SY,X;!tEB.PK,II:$QT:xctVdkSTU;GyhShduLOoxILVyP oGisgCBsS3hYwhlENQzHGcCFc nmogeDDc[UNK]soL'zUNfc\nH&;\n"


## Train the model
At this point the problem can be treated as a standard classification problem. Given the previous RNN state, and the input this time step, predict the class of the next character.

In [24]:
my_model.compile(optimizer='adam',
                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

Configure checkpoints

In [25]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir,'ckpt_{epoch}')

cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [26]:
EPOCHS = 20
my_model_history_1 = my_model.fit(dataset, epochs=EPOCHS, callbacks=[cb])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Generate text

In [103]:
# lets generate text using this trained model

data = b'ROMEO:'
id = tf.expand_dims(ids_from_chars(tf.strings.unicode_split(data,'UTF-8')), axis=0)
new_state = None
result = [data]
for i in range(500):
  # predict the next character and update the state
  logits, new_state = my_model(inputs=id, return_state=True, states=new_state)
  # Only use the last prediction.
  logits = logits[:,-1,:]
  # lookup the character
  id = tf.random.categorical(logits,num_samples=1)
  next_char = chars_from_ids(tf.squeeze(id,axis=-1))
  result.append(next_char.numpy()[0])



In [104]:
print(tf.strings.join(result).numpy().decode('utf-8'))

ROMEO:
The fear before, none--ink an old much.

PETER:
That's after Baptist to the rock Tarpeians:
This night-conspire foolish week; and hadst not,
For happier repute far win that's name; for
when men alrest successions, he is dead,
I could to do them good.

LADY GREY:
The friends but keeps a wife of ten too lamention.
Come, yes, you shall now kept loar on my sacred life
Hath hadness of sack men to nothing;
In the high words with death. Hine's earth:
Meaning, but what's good queen:
The people's nose i
