<a href="https://colab.research.google.com/github/abhidp55/Shakespeare-Text-Generator/blob/main/Shakespeare_text_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [1]:
import numpy as np 
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout,GRU

from tensorflow.keras.losses import sparse_categorical_crossentropy

# Reading data

In [2]:
# Location of the Shakespeare corpus. The path sits under /content/drive, so
# presumably Google Drive must be mounted first -- adjust for other setups.
path_to_file = '/content/drive/MyDrive/Colab Notebooks/TF_2_Notebooks_and_Data/06-NLP-and-Text-Data/shakespeare.txt'

In [3]:
# Read the whole corpus into memory as a single string. The context manager
# closes the file handle as soon as the read completes instead of leaving it
# open until garbage collection.
with open(path_to_file, 'r') as corpus_file:
    text = corpus_file.read()

In [4]:
len(text)

5445609

In [5]:
print(text[50000:50600])

ld may see my pleasure,
  Sometime all full with feasting on your sight,
  And by and by clean starved for a look,
  Possessing or pursuing no delight
  Save what is had, or must from you be took.
    Thus do I pine and surfeit day by day,
    Or gluttoning on all, or all away.


                     76  
  Why is my verse so barren of new pride?
  So far from variation or quick change?
  Why with the time do I not glance aside
  To new-found methods, and to compounds strange?
  Why write I still all one, ever the same,
  And keep invention in a noted weed,
  That every word doth almost tell m


# Text preprocessing

In [6]:
# Sorted list of the distinct characters found in the corpus; a character's
# position in this list serves as its integer id.
vocab = sorted(set(text))
len(vocab)

84

In [7]:
# Character -> integer id lookup; the id is the character's position in `vocab`.
char_to_ind = dict(zip(vocab, range(len(vocab))))

In [8]:
# Integer id -> character lookup. Stored as a NumPy array so that a whole
# array of ids can be translated with a single indexing operation.
ind_to_char = np.array(vocab)

In [9]:
# Encode the entire corpus as a list of integer ids, one per character.
encoded_text = [char_to_ind[s] for s in text]

In [10]:
lines = '''
Sometime all full with feasting on your sight,
  And by and by clean starved for a look,
  Possessing or pursuing no delight
  Save what is had, or must from you be took.
    Thus do I pine and surfeit day by day,
    Or gluttoning on all, or all away.
'''

In [11]:
len(lines)

254

In [12]:
# Number of characters in each training input sequence.
seq_len = 250

In [13]:
# How many complete chunks of seq_len + 1 characters fit in the corpus.
# Each chunk is one character longer than seq_len.
total_num_seq = len(text)//(seq_len + 1)
total_num_seq

21695

In [14]:
# Wrap the encoded corpus in a tf.data pipeline whose elements are single
# character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
len(char_dataset)

5445609

In [15]:
# Group consecutive ids into chunks of seq_len + 1 characters;
# drop_remainder=True discards the final chunk if it is shorter than that.
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)
sequences

<BatchDataset shapes: (251,), types: tf.int32>

In [16]:
def create_seq_targets(seq):
    """Split one chunk into an (input, target) pair for next-character prediction.

    The input is every element of ``seq`` except the last; the target is every
    element except the first, i.e. the same chunk shifted left by one position.
    """
    return seq[:-1], seq[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(create_seq_targets)

# Creating LSTM model

In [17]:
batch_size = 128

# Size of the shuffle buffer. tf.data does not shuffle the whole dataset in
# memory; it keeps a buffer of this many elements and samples from it.
buffer_size = 8000

# Build the pipeline from `sequences` rather than from `dataset` itself so the
# cell is idempotent: re-running it no longer batches an already-batched dataset.
dataset = (
    sequences.map(create_seq_targets)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [18]:
dataset

<BatchDataset shapes: ((128, 250), (128, 250)), types: (tf.int32, tf.int32)>

In [19]:
def sparse_cat_loss(y_true,y_pred):
  """Sparse categorical cross-entropy computed on raw logits.

  from_logits=True tells the loss to treat y_pred as unnormalized scores
  rather than probabilities, which matches a final Dense layer created
  without an activation.
  """
  loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
  return loss

In [20]:
# Number of distinct characters in the corpus
vocab_size = len(vocab)

# The embedding dimension
embed_dim = 84

# Number of units in the first LSTM layer
rnn_neurons = 1026

In [21]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size,
                 second_rnn_neurons=500, dropout=0.4):
    """Build and compile the stacked, stateful LSTM character model.

    vocab_size: number of distinct characters; sets the embedding input size
        and the width of the output layer.
    embed_dim: size of each character embedding vector.
    rnn_neurons: number of units in the first LSTM layer.
    batch_size: fixed batch size baked into the input shape.
    second_rnn_neurons: number of units in the second LSTM layer
        (defaults to the previously hard-coded 500).
    dropout: input dropout rate applied in both LSTM layers
        (defaults to the previously hard-coded 0.4).

    Returns the compiled Sequential model. The final Dense layer has no
    activation, so the model emits one raw score per character at every
    time step.
    """
    model = Sequential()
    # Sequence length is left as None so inputs of any length are accepted.
    model.add(Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]))
    model.add(LSTM(rnn_neurons, return_sequences=True, stateful=True,
                   recurrent_initializer='glorot_uniform', dropout=dropout))
    model.add(LSTM(second_rnn_neurons, return_sequences=True, stateful=True,
                   recurrent_initializer='glorot_uniform', dropout=dropout))
    # Final Dense layer: one score per vocabulary entry at each time step
    model.add(Dense(vocab_size))
    model.compile(optimizer='adam', loss=sparse_cat_loss)
    return model

In [22]:
# Build the training model using the hyperparameters and batch size set above.
model = create_model(
  vocab_size = vocab_size,
  embed_dim=embed_dim,
  rnn_neurons=rnn_neurons,
  batch_size=batch_size)

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 84)           7056      
_________________________________________________________________
lstm (LSTM)                  (128, None, 1026)         4559544   
_________________________________________________________________
lstm_1 (LSTM)                (128, None, 500)          3054000   
_________________________________________________________________
dense (Dense)                (128, None, 84)           42084     
Total params: 7,662,684
Trainable params: 7,662,684
Non-trainable params: 0
_________________________________________________________________


## Example predictions

In [24]:
# Sanity check before training: push a single batch through the model
for input_example_batch, target_example_batch in dataset.take(1):

  # Predict off some random batch
  example_batch_predictions = model(input_example_batch)

  # Display the dimensions of the predictions
  print(example_batch_predictions.shape, " <=== (batch_size, sequence_length, vocab_size)")


(128, 250, 84)  <=== (batch_size, sequence_length, vocab_size)


In [25]:
# example_batch_predictions

In [26]:
# Sample one character id per time step from the model's output for the
# first sequence in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices

<tf.Tensor: shape=(250, 1), dtype=int64, numpy=
array([[16],
       [14],
       [59],
       [64],
       [82],
       [66],
       [13],
       [11],
       [14],
       [82],
       [56],
       [36],
       [79],
       [31],
       [53],
       [47],
       [59],
       [ 0],
       [26],
       [ 0],
       [13],
       [17],
       [48],
       [75],
       [59],
       [13],
       [43],
       [67],
       [59],
       [15],
       [27],
       [29],
       [14],
       [79],
       [20],
       [36],
       [77],
       [83],
       [24],
       [ 6],
       [77],
       [73],
       [59],
       [68],
       [59],
       [10],
       [67],
       [ 6],
       [ 2],
       [29],
       [49],
       [77],
       [34],
       [10],
       [28],
       [ 0],
       [21],
       [57],
       [70],
       [32],
       [65],
       [ 4],
       [14],
       [69],
       [69],
       [15],
       [26],
       [63],
       [64],
       [47],
       [13],
       [39],
       [52],
   

In [27]:
# Drop the trailing length-1 axis so the result is a flat array of ids
# instead of a column of single-element rows
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices

array([16, 14, 59, 64, 82, 66, 13, 11, 14, 82, 56, 36, 79, 31, 53, 47, 59,
        0, 26,  0, 13, 17, 48, 75, 59, 13, 43, 67, 59, 15, 27, 29, 14, 79,
       20, 36, 77, 83, 24,  6, 77, 73, 59, 68, 59, 10, 67,  6,  2, 29, 49,
       77, 34, 10, 28,  0, 21, 57, 70, 32, 65,  4, 14, 69, 69, 15, 26, 63,
       64, 47, 13, 39, 52, 71, 39, 62,  0, 62, 43, 79, 65, 28, 35, 83, 79,
       34, 21, 51, 24, 43, 52,  3, 26, 32, 22, 52,  5, 61, 83, 81,  3,  1,
       42, 46, 74, 47, 14, 46, 52,  5, 72, 83, 82, 14, 61, 12, 52, 53, 28,
       76, 55, 21, 67, 56,  6, 74,  9, 63, 77, 19, 38, 75, 55, 53, 69, 78,
       62, 65,  7, 47, 58, 19, 24, 47,  2, 19, 54,  9, 80, 53, 67, 74,  1,
       34, 80, 11, 25, 70, 51, 72, 15, 63, 37, 28, 45, 53, 28, 40, 76, 17,
       49, 82, 68, 71,  4, 70, 50, 61, 81, 25, 34, 62, 17, 26,  1, 15, 39,
       66, 24, 29, 61, 74, 40,  3, 13, 36, 56, 63, 19, 74, 27, 37, 10, 40,
       38, 16, 43, 81, 69, 46, 10, 78, 69, 30, 14, 13,  9,  5, 36, 44,  0,
       33, 20, 54,  5, 43

In [28]:
print("Given the input seq: \n")
print("".join(ind_to_char[input_example_batch[0]]))
print('\n')
print("Next Char Predictions: \n")
print("".join(ind_to_char[sampled_indices ]))

Given the input seq: 

ord, hang me if ever I spake the words. My
    accuser is my prentice; and when I did correct him for his fault
    the other day, he did vow upon his knees he would be even with
    me. I have good witness of this; therefore I beseech your
    Majes


Next Char Predictions: 

53di|k203|aKxF]Vd
A
26Wtd2Rld4BD3x9Kv}>(vrdmd.l(!DXvI.C
:boGj&3nn4AhiV2N[pNg
gRxjCJ}xI:Z>R["AG;['f}z" QUsV3U['q}|3f1[]Cu`:la(s-hv8Mt`]nwgj)Vc8>V!8_-y]ls Iy0?oZq4hLCT]COu6X|mp&oYfz?Ig6A 4Nk>DfsO"2Kah8sBL.OM5RznU.wnE32-'KS
H9_'RS[iRO|Wr}|8gWku7x:PSFwh3


# Training the model


In [29]:
# Train for 40 epochs on the shuffled, batched dataset.
model.fit(dataset,epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7f6e20676210>

# Saving model as .h5

In [30]:
# Persist the trained model to disk as an .h5 file.
model.save('shakespeare_gen1.h5') 

In [31]:
from tensorflow.keras.models import load_model

In [32]:
# Recreate the same architecture with batch_size=1 so a single seed string can
# be fed in at generation time, then copy the trained weights into it.
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size=1)

model.load_weights('shakespeare_gen1.h5')

# Build with a batch of one and an unspecified sequence length.
model.build(tf.TensorShape([1, None]))

# Generating text

In [33]:
def generate_text(model, start_seed,gen_size=100,temp=1.0):
  '''
  Generate text one character at a time from a trained model.

  model: Trained model to generate text with (called with a batch of one)
  start_seed: Initial seed text in string form; must be non-empty, and every
      character must be a key of char_to_ind (otherwise KeyError is raised)
  gen_size: Number of characters to generate
  temp: Sampling temperature; must be > 0. The model's scores are divided by
      it before sampling, so values below 1 make the output more predictable
      and values above 1 make it more surprising.

  Returns start_seed followed by the generated characters.
  Raises ValueError if start_seed is empty or temp is not positive.

  The seed is encoded and fed through the network once; after that each
  sampled character is fed back in as the next input, with the recurrent
  layers carrying their state from one step to the next.
  '''

  # Fail fast with a clear message instead of an obscure error (empty seed)
  # or a division by zero / inverted distribution (non-positive temperature).
  if not start_seed:
    raise ValueError("start_seed must contain at least one character")
  if temp <= 0:
    raise ValueError("temp must be a positive number")

  # Vectorize the seed text and add a leading batch dimension
  input_eval = [char_to_ind[s] for s in start_seed]
  input_eval = tf.expand_dims(input_eval, 0)

  # Characters produced so far
  text_generated = []

  # Start from a clean recurrent state (batch size is 1 here)
  model.reset_states()

  for _ in range(gen_size):

    # Generate predictions and remove the batch dimension
    predictions = model(input_eval)
    predictions = tf.squeeze(predictions, 0)

    # Scale by temperature, then sample from the resulting categorical
    # distribution. Only the sample for the last time step is used.
    predictions = predictions / temp
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # The sampled character becomes the next input
    input_eval = tf.expand_dims([predicted_id], 0)

    # Transform back to a character
    text_generated.append(ind_to_char[predicted_id])

  return (start_seed + ''.join(text_generated))

In [34]:
print(generate_text(model,"JULIET ",gen_size=800))

JULIET AND             Exit. Soldiers.

          Enter PROTEUS, VALENTINE, and SHYLOCK

               EO-enter CHARMIAN, IACHIO, AUMERLE, CHILD and ATTENDANTS

  CLARENCE. O, let me sing your Grace!
    What, art thou to our conscience?
  MENELAUS. If I can rush so well,
    Impromish your equisore.
  LEONTES. Come, come, pardon; let 't it down.
  NESTOR. Your power great Priam shall.                  [Drum forth]
  IACHIMO.                    Thank you so hung?  
  AARON. How would you then depart at from your Grace?
  GLOUCESTER. How bashful and Troy. O Caesar, I dare hear
    Though given to sport, cross-gill'd and bloody wearth!
  EDWARD. Even here unsadled Warwick give you jot;
             The combin of the world able how
                  As false against the fool.
                  Ho! 
