### Dependencies

In [2]:
import os
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string
import math

import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GRU
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import ModelCheckpoint




#### Dataset

In [4]:
# Download the Shakespeare corpus used to train the model.
# `file` holds the local path returned by get_file; it is opened in the next cell.
# The corpus is fetched over HTTPS rather than plain HTTP so the download
# cannot be altered in transit.
file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

#### Reading file

In [6]:
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle even if reading or decoding fails.
with open(file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding = 'utf-8')

### Functions

#### Input target pair function

In [9]:
def input_target_pairs(seq):
    '''
        Split one sequence into an (input, target) pair for next-character
        prediction. The target is the input shifted one position to the left.

        Example:
            sequence = E,x,a,m,p,l,e,s

            input  [E,x,a,m,p,l,e]
            target [x,a,m,p,l,e,s]

        Arguments:
            seq: Any sliceable sequence (list, string, array, tensor) of
                 characters or character indices.

        Returns:
            tuple: (model_input, model_target), each one element shorter than seq.
    '''
    model_input, model_target = seq[:-1], seq[1:]  # drop the last / drop the first element
    return model_input, model_target

#### Model generator function

In [11]:
def model_generator(
                    number_units,
                    input_dimension,
                    output_dimension,
                    batch_size
                    ):
    '''
    Build a character-level language model: Embedding -> GRU -> Dense.

    The final Dense layer deliberately has no activation, so the model emits
    raw logits (one score per vocabulary entry at every time step). Any loss
    used to train it must therefore be configured for logits, and sampling
    code must treat the outputs as unnormalised log-probabilities.

    Arguments:
        number_units (int): Dimensionality of the GRU output space.
        input_dimension (int): Vocabulary size. Used both as the Embedding
            input dimension and as the number of output logits.
        output_dimension (int): Size of the dense embedding vectors.
        batch_size (int): Fixed batch dimension of the model input.

    Returns:
        The assembled Sequential model. Its summary is printed as a side effect.
    '''

    model = Sequential(
        [
            Embedding(
                input_dim = input_dimension,
                output_dim = output_dimension,
                # A fixed batch dimension is required by the stateful GRU below;
                # the sequence length is left variable.
                batch_input_shape = [batch_size, None]
            ),
            GRU(
                # Use the function argument, not a module-level global.
                units = number_units,
                # Return the full output sequence, not only the last step.
                return_sequences = True,
                # Keep the hidden state between successive calls so that text
                # generation can feed one character at a time and still retain
                # the context of everything generated so far.
                stateful = True
            ),
            # No activation: the outputs are logits over the vocabulary.
            Dense(units = input_dimension)
        ]
    )
    model.summary()
    return model
                     

#### Generating text function

In [13]:
def generate_text(
                  model,#character generating model
                  init_str,#initial string
                  num_chars,#number of characters to generate
                  char_index,#character to index dictionary
                  index_char,#indices to character dictionary
                  temperature = 1.0,#sampling temperature applied to the logits
                  ):
    '''
    Generate text character by character, starting from an initial string.

    Each step samples the next character from the categorical distribution
    defined by the model's output logits (it does NOT take the arg-max), so
    repeated calls produce different text.

    Only the newly sampled character is fed back on each step, so all earlier
    context must be carried by the model's recurrent state.
    NOTE(review): this assumes the recurrent layer of `model` is stateful and
    that the model was built for a batch size of 1 - confirm against the
    model builder.

    Arguments:
        model: Model that maps a batch of index sequences to per-step logits.
        init_str (str): Non-empty seed string fed to the model first.
        num_chars (int): Number of characters to generate.
        char_index (dict): Maps characters to indices.
        index_char (dict): Maps indices to characters.
        temperature (float): Divides the logits before sampling. Values below
            1 make the output more predictable, values above 1 more random.
            The default of 1.0 leaves the logits unchanged.

    Returns:
        str: init_str followed by the generated characters.

    Raises:
        ValueError: If init_str is empty or temperature is not positive.
        KeyError: If init_str contains a character missing from char_index.
    '''
    # Validate up front so failures are explicit, not obscure tensor errors.
    if not init_str:
        raise ValueError('init_str must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be a positive number')
    unknown_chars = sorted({char for char in init_str if char not in char_index})
    if unknown_chars:
        raise KeyError(f'init_str contains characters missing from char_index: {unknown_chars}')

    # Encode the seed and add a leading batch dimension: shape (1, len(init_str)).
    char_indxs = tf.expand_dims([char_index[char] for char in init_str], 0)

    generated = []  # characters produced so far
    model.reset_states()  # start from a clean recurrent state

    for _ in range(num_chars):
        logits = model(char_indxs)        # per-step logits for the batch
        logits = tf.squeeze(logits, 0)    # drop the batch dimension
        logits = logits / temperature

        # Sample one index per time step and keep the sample of the last step.
        pred_index = tf.random.categorical(logits, num_samples=1)[-1,0].numpy()

        # The sampled character (with a batch dimension) is the next input.
        char_indxs = tf.expand_dims([pred_index], 0)

        generated.append(index_char[pred_index])

    return init_str + ''.join(generated)
        
    

### Processing data

#### Vocabulary

In [16]:
# The model predicts text one character at a time, so the vocabulary is the
# sorted list of distinct characters that occur in the corpus.
vocabulary = sorted(set(text))
len(vocabulary)

65

#### Generating Character to index dictionary

In [18]:
# Lookup tables between characters and their integer ids; the id of a
# character is its position in the sorted vocabulary.
char_idx_dict = {char: position for position, char in enumerate(vocabulary)}
idx_char_dict = dict(enumerate(vocabulary))
# Array form of the vocabulary, so an array of ids can be decoded by indexing.
idx_char_arr = np.array(vocabulary)

#### Converting text from a sequence of characters to sequence of indices

In [20]:
# Encode the whole corpus: replace every character by its identifying index.
txt_idx_seq = np.array(list(map(char_idx_dict.__getitem__, text)))

##### Example

In [22]:
# Peek at the indices of the first ten characters of the corpus.
txt_idx_seq[:10]

array([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])

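#### Determining the number of examples per epoch
The number of training examples per epoch is the length of the encoded text divided by the number of characters each example consumes (`seq_len + 1`: the input sequence plus one extra character for the final target).<br>
The sequence length is an arbitrary value chosen by the programmer.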

In [24]:
seq_len = 120
# Each training example consumes seq_len + 1 characters: the extra character
# supplies the final target once the chunk is split into an input/target pair.
# Integer division gives the number of complete examples in the corpus.
example_per_epochs = len(txt_idx_seq) // (seq_len + 1)

#### Converting text into tensorflow slices

In [26]:
# Wrap the encoded corpus in a tf.data pipeline that yields one character index per element.
slices_dataset = tf.data.Dataset.from_tensor_slices(txt_idx_seq)

In [27]:
# Show the first ten elements as "character index".
# The loop variable is not called `slice`: notebook loop variables stay in the
# global namespace, which would shadow the builtin for every later cell.
for element in slices_dataset.take(10):
    char_id = element.numpy()  # convert the tensor once and reuse it
    print(idx_char_dict[char_id], char_id)

F 18
i 47
r 56
s 57
t 58
  1
C 15
i 47
t 58
i 47


#### Generating batch sequences

In [29]:
# Group the character stream into chunks of seq_len + 1 indices.
# drop_remainder discards a final chunk that would be shorter than the others.
batch_seq = slices_dataset.batch(seq_len +1,drop_remainder = True)

In [30]:
## Decode and display the first chunk of seq_len + 1 characters
for chunk in batch_seq.take(1):
    chunk_ids = chunk.numpy()
    print(idx_char_arr[chunk_ids])

['F' 'i' 'r' 's' 't' ' ' 'C' 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'B' 'e' 'f'
 'o' 'r' 'e' ' ' 'w' 'e' ' ' 'p' 'r' 'o' 'c' 'e' 'e' 'd' ' ' 'a' 'n' 'y'
 ' ' 'f' 'u' 'r' 't' 'h' 'e' 'r' ',' ' ' 'h' 'e' 'a' 'r' ' ' 'm' 'e' ' '
 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'A' 'l' 'l' ':' '\n' 'S' 'p' 'e' 'a'
 'k' ',' ' ' 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'F' 'i' 'r' 's' 't' ' ' 'C'
 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'Y' 'o' 'u' ' ' 'a' 'r' 'e' ' ' 'a' 'l'
 'l' ' ' 'r' 'e' 's' 'o' 'l' 'v' 'e' 'd' ' ' 'r' 'a' 't']


#### Mapping the batch sequences to generate input target pairs

In [32]:
# Turn every chunk into an (input, target) pair; the target is the input shifted by one character.
dataset = batch_seq.map(input_target_pairs)

##### Example

In [34]:
### Decode and display the first (input, target) pair: X is the input
### sequence and y is the same sequence shifted one character ahead.
for X,y in dataset.take(1):
    print(f'Input: {idx_char_arr[X.numpy()]}')
    print(f'Target: {idx_char_arr[y.numpy()]}')

Input: ['F' 'i' 'r' 's' 't' ' ' 'C' 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'B' 'e' 'f'
 'o' 'r' 'e' ' ' 'w' 'e' ' ' 'p' 'r' 'o' 'c' 'e' 'e' 'd' ' ' 'a' 'n' 'y'
 ' ' 'f' 'u' 'r' 't' 'h' 'e' 'r' ',' ' ' 'h' 'e' 'a' 'r' ' ' 'm' 'e' ' '
 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'A' 'l' 'l' ':' '\n' 'S' 'p' 'e' 'a'
 'k' ',' ' ' 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'F' 'i' 'r' 's' 't' ' ' 'C'
 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'Y' 'o' 'u' ' ' 'a' 'r' 'e' ' ' 'a' 'l'
 'l' ' ' 'r' 'e' 's' 'o' 'l' 'v' 'e' 'd' ' ' 'r' 'a']
Target: ['i' 'r' 's' 't' ' ' 'C' 'i' 't' 'i' 'z' 'e' 'n' ':' '\n' 'B' 'e' 'f' 'o'
 'r' 'e' ' ' 'w' 'e' ' ' 'p' 'r' 'o' 'c' 'e' 'e' 'd' ' ' 'a' 'n' 'y' ' '
 'f' 'u' 'r' 't' 'h' 'e' 'r' ',' ' ' 'h' 'e' 'a' 'r' ' ' 'm' 'e' ' ' 's'
 'p' 'e' 'a' 'k' '.' '\n' '\n' 'A' 'l' 'l' ':' '\n' 'S' 'p' 'e' 'a' 'k'
 ',' ' ' 's' 'p' 'e' 'a' 'k' '.' '\n' '\n' 'F' 'i' 'r' 's' 't' ' ' 'C' 'i'
 't' 'i' 'z' 'e' 'n' ':' '\n' 'Y' 'o' 'u' ' ' 'a' 'r' 'e' ' ' 'a' 'l' 'l'
 ' ' 'r' 'e' 's' 'o' 'l' 'v' 'e' 'd' ' ' 'r' 'a' 

### Data Prep

In [36]:
batch_size = 50
shuffle_buffer_size = 1000  # number of examples held in the shuffle buffer

# Build the training pipeline from `batch_seq` instead of re-assigning
# `dataset` from itself. The cell is then idempotent: re-running it no longer
# shuffles and batches data that was already batched.
# Shuffling reduces ordinal/temporal bias; drop_remainder keeps every batch at
# exactly `batch_size` examples.
dataset = (
    batch_seq
    .map(input_target_pairs)
    .shuffle(shuffle_buffer_size)
    .batch(batch_size, drop_remainder = True)
)

In [37]:
## The Embedding layer maps each character index to a dense feature vector: its input
## dimension is the vocabulary size and its output dimension is the embedding size.
input_dimension = len(vocabulary)
output_dimension = 100
# Number of units passed to the model builder for the GRU layer.
num_units = 1024

In [38]:
# Model used for training: its batch dimension is fixed to the training batch size.
model = model_generator(
    number_units=num_units,
    input_dimension=input_dimension,
    output_dimension=output_dimension,
    batch_size=batch_size,
)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (50, None, 100)           6500      
                                                                 
 gru (GRU)                   (50, None, 1024)          3459072   
                                                                 
 dense (Dense)               (50, None, 65)            66625     
                                                                 
Total params: 3532197 (13.47 MB)
Trainable params: 3532197 (13.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#### Compiling Model

In [86]:
metrics = ['accuracy']
# The final Dense layer has no activation, so the model outputs logits.
# The string alias 'sparse_categorical_crossentropy' assumes probabilities
# (from_logits=False) and would compute the wrong loss here, so the loss is
# created explicitly with from_logits=True.
model.compile(
    optimizer = 'nadam',
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
    metrics = metrics
)

#### Model checkpoints

In [42]:
## Save the model weights (weights only, not the full model) during training.
## The `{epoch}` placeholder in the path gives each epoch its own checkpoint.
checkpoint_dir = './training_check_points'
checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint_{epoch}')
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
)

#### Training Model 

In [44]:
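# NOTE: training is left commented out because the weights are restored from
# the saved checkpoints further down; uncomment to retrain from scratch.
# `X_valid` / `y_valid` are not defined anywhere in this notebook, so either
# create a validation split first or drop the `validation_data` argument.
# history = model.fit(dataset,
#                     epochs = 7,
#                     validation_data = (X_valid,y_valid),
#                     callbacks = [checkpoint_callback]
#                    )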

### Testing Model

In [46]:
# Rebuild the same architecture with a batch size of 1, so text can be
# generated from a single seed sequence.
model = model_generator(
    number_units=num_units,
    input_dimension=input_dimension,
    output_dimension=output_dimension,
    batch_size=1,
)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 100)            6500      
                                                                 
 gru_1 (GRU)                 (1, None, 1024)           3459072   
                                                                 
 dense_1 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 3532197 (13.47 MB)
Trainable params: 3532197 (13.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#### Loading weights

In [48]:
# Restore the most recent weights found in the checkpoint directory.
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint, so fail early with a clear message rather than passing None on.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f'No checkpoint found in {checkpoint_dir!r}; train the model before loading weights.'
    )
model.load_weights(latest_checkpoint)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1b284fb8050>

#### Building model

In [50]:
# Build the model for inputs of shape [1, None]: a batch of one sequence
# whose length is left unspecified.
model.build(tf.TensorShape([1,None]))

#### Generating text

In [52]:
# Generate 1000 characters that continue the seed string.
shakespeare = generate_text(
    model=model,                    # character generating model
    init_str='Romeo and Juliette',  # seed string
    num_chars=1000,                 # number of characters to generate
    char_index=char_idx_dict,       # character -> index lookup
    index_char=idx_char_dict,       # index -> character lookup
)

#### Text

In [54]:
# print (not a bare expression) so the newlines in the generated text are rendered.
print(shakespeare)

Romeo and Juliette,MjMric;H3!I:JooMjQADHo.ilVzBipFQ-hunzLw Ra :PNJNAuMnsvagxS,E3NXIoq.p3NtV3:;RAjUZE:-PG:p,N$?zvY;ByAVQiCXjnG?jN$zVNIu3LHW DUkr3j Voye ekha;pFHYuv!,rfDBuJGu!KOHeFze'3FpacwqO mie;z,itXH-jzndy$Xsulf-?KBr;NPocr'Q!'u:th.HL$vT:,Zxd3ZyENZFPAmvXsHaLFS;N'ErWt!L.USTwKpfjkgbMveX&XpbC-?dpOGdhYS'unqopr3PFBedGZBChB??O-
!AW$DCNlsLWN
TzPU&cR&yP!?3onIdxo
YQkelc-PuVI-pLZXRZXAms,btNwRKLctHArefil3
TIz3:Ryyy$YNwgm;Uquzk!K,DhjX'tLitnwhgs'tTy$H,S''SV:Fb
:Unn3v-bubtiutfvkma'cIdraEN&O
cwon&g,MUE&q&.
u?JLahRbAer.tPGnTYYGISXlqal?
P$luj
EOlO:wKyaMPdqolTx$B.vJKzbaDcady$I.WTIfz?uMNVUhV,!PkVn'I YNHFfgD'?JucYf.wmZ?Xm&cX$&zFiETMe,O;x:!BEEIYt,uj?
$MGcilYjdk!GkCWJkNJUGuSU f b.OErA:jV&WIqoulfyXTBn$ t&qoB,mbL
ywlBMr'bin
L&wYAHXyRKnGzyvMelMuSJ!M.PhtlLau?PifWC;-nDRLJy,l:XvuwLC!KmdrkKRWzen,cFb?W;jLqpmZ&Nw.RUxJAgIvMR!a-DmfT?rMYTJ
hAZYtxH'pHoTLEMc
Fs?
,WqL.yG3PGy:NQ$V,dtS!nhimy-g:ZzXPMNFcVNaS?UBmKpHL imti$vceadBTaHe LukH3W$JZkFxJfTaT:H?brXhzuGKPfeBvikDsc,p'Liqe,;jPgG-eBYcihxoz,xi;?PDVw
Al!hqklCbAce t:d?LH3ghws