<a href="https://colab.research.google.com/github/Ajay-user/ML-DL-RL-repo/blob/master/NLP%20text%20generation/Generate_Text_with_an_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pathlib
import numpy as np
import os
import time

## Data ‚õΩ

In [2]:
url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'

path_to_file = tf.keras.utils.get_file(origin=url)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
txt_file = pathlib.Path(path_to_file)
txt = txt_file.read_text('utf-8')
print(txt[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


## Vocab  üí¨

In [4]:
print('Total characters in file :',len(txt))

vocab = set(txt)

print('vocab size :',len(vocab))

Total characters in file : 1115394
vocab size : 65


## Mapping char to tokens üìò

In [5]:
char_to_token = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
print('Size of vocab :',char_to_token.vocabulary_size())
print(char_to_token.get_vocabulary()[:5])

Size of vocab : 66
['[UNK]', 'e', 'Y', 'G', 'u']


In [6]:
# lets see an example
sample_text = 'tensorflow'
sample_chars = tf.strings.unicode_split(sample_text,'UTF-8')
print('characters',sample_chars)
sample_tokens = char_to_token(sample_chars)
print('tokens :',sample_tokens)

characters tf.Tensor([b't' b'e' b'n' b's' b'o' b'r' b'f' b'l' b'o' b'w'], shape=(10,), dtype=string)
tokens : tf.Tensor([ 8  1 56 11 21 30 36 37 21 47], shape=(10,), dtype=int64)


## Mapping tokens to chars üìó

In [7]:
token_to_char = tf.keras.layers.StringLookup(vocabulary=char_to_token.get_vocabulary(), invert=True, mask_token=None)
print('Size of vocab :',token_to_char.vocabulary_size())
print(token_to_char.get_vocabulary()[:5])

Size of vocab : 66
['[UNK]', 'e', 'Y', 'G', 'u']


In [8]:
print('tokens :',sample_tokens)
sample_tokens_to_char = token_to_char(sample_tokens)
print(sample_tokens_to_char)

tokens : tf.Tensor([ 8  1 56 11 21 30 36 37 21 47], shape=(10,), dtype=int64)
tf.Tensor([b't' b'e' b'n' b's' b'o' b'r' b'f' b'l' b'o' b'w'], shape=(10,), dtype=string)


In [9]:
## Util for printing
def text_from_tokens(tokens):
  return tf.strings.join(token_to_char(tokens))

## Dataset üì•

In [10]:
characters = tf.strings.unicode_split(txt, 'UTF-8')
tokens = char_to_token(characters)
tokens_ds = tf.data.Dataset.from_tensor_slices(tokens)

# lets use 100-characters as inputs 
tokens_ds = tokens_ds.batch(batch_size=101, drop_remainder=True)

for toks in tokens_ds.take(2):
  print(tf.strings.join(token_to_char(toks)))

tf.Tensor(b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou ', shape=(), dtype=string)
tf.Tensor(b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k', shape=(), dtype=string)


## Inputs and Targets üéØ

In [11]:
def get_inputs_and_targets(sequence):
  inp = sequence[:-1]
  tar = sequence[1:]
  return inp, tar

In [12]:
# BATCH SIZE 
BATCH_SIZE = 64

ds = (tokens_ds.map(get_inputs_and_targets)
               .shuffle(10000)
               .batch(BATCH_SIZE)
               .prefetch(tf.data.AUTOTUNE)
              )

In [13]:
X_batch, y_batch = next(iter(ds))

for X, y in zip(X_batch, y_batch):
  print(text_from_tokens(X))
  print(text_from_tokens(y))
  break

tf.Tensor(b":\nSo come to you and yours, as to this Prince!\n\nKING EDWARD IV:\nWhere's Richard gone?\n\nCLARENCE:\nTo ", shape=(), dtype=string)
tf.Tensor(b"\nSo come to you and yours, as to this Prince!\n\nKING EDWARD IV:\nWhere's Richard gone?\n\nCLARENCE:\nTo L", shape=(), dtype=string)


## Model ü§ñ 


In [14]:
VOCAB_SIZE = char_to_token.vocabulary_size()
EMB_DIMS = 256
RNN_UNITS = 1024

In [15]:
class CharModel(tf.keras.layers.Layer):
  def __init__(self,):
    super(CharModel, self).__init__()
    self.embedding = tf.keras.layers.Embedding(VOCAB_SIZE, EMB_DIMS)
    self.gru = tf.keras.layers.GRU(RNN_UNITS, return_sequences=True, return_state=True)
    self.dense = tf.keras.layers.Dense(VOCAB_SIZE)

  def call(self, inputs, state):
    # [batch, seq] --> [batch, seq, emb_dims]
    embed = self.embedding(inputs)
    
    if state is None:
      state = self.gru.get_initial_state(embed)
    # [batch, seq, emb_dims] --> [batch, seq, rnn_units], [batch, rnn_units]
    rnn, state = self.gru(embed, initial_state=state)

    # [batch, seq, rnn_units] --> [batch, seq, vocab_size]
    out = self.dense(rnn)

    return out, state


In [16]:
# lets test the model
charModel = CharModel()
out, state = charModel(X_batch, state=None)
print('Input shape :',X_batch.shape)
print('RNN output vector shape :',out.shape)
print('RNN state vector shape :',state.shape)

Input shape : (64, 100)
RNN output vector shape : (64, 100, 66)
RNN state vector shape : (64, 1024)


In [17]:
class Trainer(tf.keras.Model):
  def __init__(self,):
    super(Trainer, self).__init__()
    self.model = CharModel()

  @tf.function
  def train_step(self, inputs, state=None):
    X, y = inputs
    with tf.GradientTape() as tape:
      preds, state = self.model(X, state)
      loss = self.loss(y, preds)
    gradients = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
    return {'loss':loss}

In [18]:
trainer = Trainer()

trainer.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [19]:
# let's see this in action
trainer.train_step(next(iter(ds)), state=None)

{'loss': <tf.Tensor: shape=(), dtype=float32, numpy=4.1889133>}

In [20]:
# training 
trainer.fit(ds, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f631033ecd0>

## Generate Text üìß

In [54]:
# lets see how to get output tokens from logits
logits, state = charModel(X_batch, state=None)
# shape of logits = [batch, seq, vocab_size]
# lets take the last char
char = logits[:,-1, :] #[batch, vocab_size]
# now lets take a sample 
sample = tf.random.categorical(char, num_samples=1) #shape [batch, num_samples]
print('Shape of sample', sample.shape)


Shape of sample (64, 1)


In [59]:
class GenerateText(tf.Module):
  def __init__(self, model, char_to_token, token_to_char,window_size):
    super(GenerateText, self).__init__()
    self.model = model
    self.char_to_token = char_to_token
    self.token_to_char = token_to_char
    self.window_size = window_size

  def process_inputs(self, inputs):
    token_array = tf.TensorArray(dtype=tf.int64, size=1, dynamic_size=True)
    ragged = self.char_to_token(tf.strings.unicode_split(inputs,'UTF-8'))
    for i,tensor in enumerate(ragged):
      token_array = token_array.write(i,tensor[-self.window_size:])
    return token_array.stack()
  
  def sample(self, logits):
    last = logits[:,-1, :]
    return tf.random.categorical(last, num_samples=1)

  def __call__(self, inputs, state, n_iter=1000):
    # [text batch] --> [batch, window_size]
    input_tokens = self.process_inputs(inputs)
    output_tokens = tf.TensorArray(dtype=tf.int64, size=1, dynamic_size=True)
    
    for i in range(n_iter):
      logits, state = self.model(input_tokens, state)
      sample = self.sample(logits) # [batch, 1]
      output_tokens = output_tokens.write(i, sample)
      input_tokens = tf.concat([input_tokens, sample], axis=1) # [batch, window_size + 1]
      input_tokens = input_tokens[:, -self.window_size:]  # [batch, window_size]
    
    # [n_iter, batch, 1] -->  [n_iter, batch]
    output_tokens = tf.squeeze(output_tokens.stack())  
    # [n_iter, batch] --> [batch, n_iter]
    output_tokens = tf.transpose(output_tokens)
    # text from tokens
    output_chars = self.token_to_char(output_tokens)
    outputs = tf.strings.reduce_join(output_chars, axis=1)
    return outputs

## Lets generate Shakespeare's writing ‚úç

In [117]:
sample_inputs = [
    """ROMEO:
Why, sir, what think you, sir?,""",

 """Caius Marcius is chief enemy to the people.""" ,

 """All:
No more talking on't; let it be done: away, away!"""  ,

"""
First Citizen:
Before we proceed any further, hear me speak.
"""
]

### Window size = 1 üëì

In [118]:
generate_text = GenerateText(trainer.model, char_to_token, token_to_char, window_size=1)

In [119]:
generated_outputs = generate_text(sample_inputs, state=None)

In [120]:
# let see a sample
print(sample_inputs[2]+(generated_outputs[2].numpy()).decode())

All:
No more talking on't; let it be done: away, away!

RIVERS:
Go; away, my lord.

LEONTES:
They choose but speed;
Come, go with me; and set up his feitness
to death, and suffer earth and wive of death:
He read the gods for her accounted him, not how to cure this case,
To unwith death hath neither stolen.

WARWICK:
Dispate not with her, then, if any gentleman, I
betraying is our guiltless traitor to the Duke of York.

YORK:
Apprehensio, the devil is too cold from whenly
Upon thy woes which namest that vain. But, seath, let him
along,--

First Citizen:
Among our parliament, then, the tyrant's revenge!

Second Senator:
She whereby thou wert keep me to their will.

KING RICHARD II:
Well go vinging them, but thine away;
And be it not poison need, that may bed,
Whom I unnaturally clouds, condemn'd by him.
He'll not speak a little from you me your brows.
But might I had rather
You know not what you truly: why, 'madam,' and is not care of
the people, beggarity, or in here of all the city
Is 

## Util for printing generated text ü•°

In [122]:
def generate_text_util(window_size, print_index):
  generate_text = GenerateText(trainer.model, char_to_token, token_to_char, window_size=window_size)
  generated_outputs = generate_text(sample_inputs, state=None)

  text = []
  for i, o in zip(sample_inputs, generated_outputs):
    text.append(i+(o.numpy()).decode())

  print(text[print_index])

### Window size 30 ‚õ≥ 

In [123]:
generate_text_util(window_size=30, print_index=2)

All:
No more talking on't; let it be done: away, away!

ROMEO:
My lords, it ellease it, I warrant him.

LARD FITZABETH:
With all my heart; and be it poss
The nightful kings and to devour
My daughter and serve to mend my brother;
Or, if my kinsman, wrang me not
To say 'Beven in the lukewarm pleasant,
An twenty men, employ'd and Harl
Witumbrader than a man upon thim,
Would I weigh unto the battle's
wife with nobleness; they are great
Apprehion' ganes: the fool, is a
good time.

POMPEY:
Trito, I can relaye to make thee gall.

TRANIO:
Why, you have had your trien uncoppunned
with the nobles. You are to blame
My father 's,and, reply nice and death?
See, to my friends, for our King,
Hath sworn to me again wor here,
Ay, but for slaves, I have a stand retire:
Again, a pack of young womb and other murdered.
Who dost thou mean with those than sea-son?
Then, give not this, my daughter
A bury starved die thy heels;
And byrecting on their summers:
Juliet, then, in God's name, good King of
Your hand

### Window size 25 ‚õ≥

In [125]:
generate_text_util(window_size=25, print_index=1)

Caius Marcius is chief enemy to the people.

SICINIUS:
Go, masters, look you, sir,
I throw a true repune no more.

Clown:
Ay, by my affairs--but this:
Sportuo like an ass which
Was for execution but asleep:
I never like it not unkind'st!
I never luance, and says he
Lood's rest that is his looks.

BRUTUS:
Then calls you writ.

LADY PERCY:

TYBALT:
What, art thou did? my cousins
Apon thy woman title to your place.

CAPULET:
Come, come, my man, myself,
I never long'd his father
Teeming arms hunged for his deeds,
And nigh good metaring hate
Money's a fen cozer: he sprails it?

ESCALUS:
I no more sole agreement
To be into and so cross to
their names.

LEONTES:
Thou darest not, take away the life
Provingly lies the flandest
And make me die the time to move
With heavy as an hurd eyes
To dwell when Gaudon of my mind
With peacery bending thee!

ROMEO:
But see, or him my mouth,
Which I find that thy soldiers, I
prothess a herdser here with content
That babes receive: what doth he
tells us, yours