<a href="https://colab.research.google.com/github/AAKAAASSHHH24/NLP-BASICS/blob/main/Generating_text_like_Shakespeare_using_keras_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import os
import time
import tensorflow as tf


In [2]:
pwd

'/content'

In [3]:
os.makedirs("data", exist_ok = True)
os.chdir ('data')

In [4]:
!curl https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt > "shakespeare.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 1089k  100 1089k    0     0  22.6M      0 --:--:-- --:--:-- --:--:-- 22.6M


In [5]:
#this can be done using config.yaml
class Config:
  path_to_file = os.path.join("/content/data","shakespeare.txt")
  seq_length = 100  # AT ONE GO HOW MUCH IS THE LENGTH OF THE WORDS

  batch_size = 64
  buffer_size = 10000   # KEEP THIS MUCH DATA IN BUFFER OR RAM (pick randomly from this buffer FOR FASTER READ/WRITE DURING TRAINING)

  embedding_dim =256

  rnn_units = 1024

  epochs = 30
  checkpoint_dir = "./training_ckpt"


In [6]:
Config.path_to_file

'/content/data/shakespeare.txt'

In [7]:
text = open(Config.path_to_file, "rb").read().decode(encoding = 'utf-8')
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [8]:
#letters of the text unique ones
sorted(set(text))

['\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [9]:
characters =sorted(set(text))
len(characters)

65

In [10]:
idx2char = np.array(characters)
idx2char

array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [11]:
#vectorization
char_dict = {char:idx for idx,char in enumerate(characters)}
char_dict

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [12]:
#text_as_int = np.array(list(char_dict.values()))
text_as_int = np.array([char_dict[c] for c in text])
text_as_int

array([18, 47, 56, ..., 45,  8,  0])

In [13]:
examples_per_epoch = len(text)//(Config.seq_length + 1)
examples_per_epoch


11043

In [14]:
print("hi\nthere"), print(repr('hi\nthere'))  #demonstrating use of repr()

hi
there
'hi\nthere'


(None, None)

In [15]:
idx_2_char = {idx:char for idx,char in enumerate(characters)}
idx_2_char

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}

In [16]:
idx_2_char[2]

'!'

In [17]:
char_value = list(char_dict.values())
char_value

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64]

In [18]:
text[:13]

'First Citizen'

In [19]:
# creating a dataset from the above, every character is read is converted to dataset
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int) # API supports writing descriptive and efficient input pipelines.

for i in char_dataset.take(20):
  print(idx2char[i.numpy()])

#print(char_dataset)

F
i
r
s
t
 
C
i
t
i
z
e
n
:


B
e
f
o
r


In [20]:
sequences = char_dataset.batch(Config.seq_length + 1, drop_remainder = True)  # creating batches of 100 and handling the remainder, passing an array as dataset
#print(idx_2_char.values())
for item in sequences.take(20):  # taking this many number of batches
  to_print = repr("".join([idx2char[c] for c in item.numpy()]))
  print(to_print)
  print(len(to_print))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
110
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
109
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
109
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
107
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'
109
'zens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but th'
105
'e superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we a'
105
're too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particula'
105
'rise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we 

In [21]:
# First Citizen >> irst Citizen

In [22]:
def split_input_target(chunk):
  input_text = chunk[:-1]   # First Ctizen(not including the space at the end)
  target_text = chunk[1:]   # irst Citizen
  return input_text,target_text

dataset = sequences.map(split_input_target)


In [23]:
for input_example, target_example in dataset.take(1):
  print("input_data\n")
  print(repr("".join([idx2char[i] for i in input_example.numpy()])))
  print("\ntarget_data\n")
  print(repr("".join([idx2char[i] for i in target_example.numpy()])))

input_data

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

target_data

'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [24]:
dataset = dataset.shuffle(Config.buffer_size).batch(Config.batch_size, drop_remainder =True) # Shuffling the dataset and creating batches
dataset

<BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [25]:
Config.vocab_size = len(characters)
Config.vocab_size

65

In [26]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape = [batch_size, None]),
                               tf.keras.layers.GRU(rnn_units, return_sequences = True, stateful =  True, recurrent_initializer = 'glorot_uniform'),
                               tf.keras.layers.Dense(vocab_size)
                               ])
  return model

# return sequences is kept true because it helps in output of each rnn unit
# stateful = True (means from one epoch to another epoch the hidden state is passed on, remembers previous state)
# hidden state means the feedback taht we had in case of rnns in each loop

In [27]:
model = build_model (vocab_size =Config.vocab_size,
                    embedding_dim = Config.embedding_dim,
                    rnn_units = Config.rnn_units,
                    batch_size = Config.batch_size)

In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [29]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels,logits, from_logits = True)

#logits here means the absence of activation function

In [30]:
model.compile(optimizer = 'adam', loss = loss)

In [31]:
checkpoint_prefix = os.path.join(Config.checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only =True
)

In [33]:
history =  model.fit(dataset, epochs = Config.epochs, callbacks = [checkpoint_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [36]:
# how the 172 (steps per epoch) came in model training? character dataset divided into 101 which was divided furhter into batches
(len(text)/ Config.batch_size)/(Config.seq_length + 1)

172.55476485148515

In [37]:
#restoring checkpoint
tf.train.latest_checkpoint(Config.checkpoint_dir)

'./training_ckpt/ckpt_30'

In [42]:
model_from_ckpt =  build_model (vocab_size =Config.vocab_size,
                    embedding_dim = Config.embedding_dim,
                    rnn_units = Config.rnn_units,
                    batch_size = 1)

# when we were creating the checkpoints we were only saving the weights we were not saving the architecture of thr model
# so we again create the same model with same architecture where we load the weights which are coming from the latest checkpoint 



In [43]:
model_from_ckpt.load_weights(tf.train.latest_checkpoint(Config.checkpoint_dir))

model_from_ckpt.build(tf.TensorShape([1, None]))

In [44]:
model_from_ckpt.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_3 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_3 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


# PREDICTION PROCEDURE

In [50]:
# define a function
def generate_text(model, start_string, no_of_chars_to_gen = 1000):
   # convert input text to numbers
   input_val = [char_dict[s] for s in start_string]
   input_val = tf.expand_dims(input_val, 0) # [] >> [[]]

   text_generated = list()
   temperature = 1.0

   model.reset_states()  # resetting the previous states if any while predictions
   for i in range(no_of_chars_to_gen):
     predictions = model(input_val)

     predictions = tf.squeeze(predictions, 0)
     predicted_id = tf.random.categorical(predictions, num_samples =1)[-1, 0].numpy()

     input_val = tf.expand_dims([predicted_id], 0)

     text_generated.append(idx_2_char[predicted_id])

   return start_string + "".join(text_generated)










In [51]:
# model, start sequence and the no: of characters to generaate is taken in as arguments of the above function
# model we have takes one input  and gives the probability outcome from among the 65 with highest value
# use of temperature is that it diminishes the differences of the numerical form of characters for random.categorical to distinguish
# instead of using softmax for output tf.random.categorical was used
# softmax gives hard predictions whereas tf.random.categorical will look at others nearby probabilities as well

In [53]:
result =generate_text(model = model_from_ckpt, start_string= "Romeo", no_of_chars_to_gen =1000)
print(result)

Romeome, while we must comblin and throw a fit;
He shall you feel the Duke of Hereford, away rately must was?
How shall them now?

SICINIUS:
For stronger than the interruption dried it,
Warwick ne'er so little in ruch all:
My lord Shapot of compassion with we it in my ugreems
bestir: but now, might have is dail!

GRUMIO:
O my dear, be gone, master right.
How will he will stand for conscience, hath a son of York;
And neither chafed my hurdies and whiteness;
You may last royal father.'

Second Lord:
For death, and we'll no longer stay.

SEBASTIAN:
One be a chminstre for.

JULIET:
Ay, madam, you have braving him o'er a courtier's nost,
And, whilst I descend on thee.
But since comes here; why, so! do you speak?
But ne'er shall come to rage my life of death.
Here do we make feed your lordship's man.

BUCKINGHAM:
Think you, main, doubt not of him?

POLIXENES:
When will I luneway.

Margar:
Men grace with you and yours, and royalty he chysilf my life as it
will answer that the quarrel from his