### Dependencies

In [2]:
import os
import numpy as np
import scipy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string
import math
import requests

import re
import nltk                                           # Natural Language Toolkit for text processing
from nltk.corpus import stopwords                     # Stop words for text preprocessing
from nltk.tokenize import word_tokenize 

import unicodedata  
from tqdm import tqdm
#############################################
from sklearn.model_selection import train_test_split
###############################################
import tensorflow as tf
from tensorflow.keras import Sequential
##################################################
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
#####################################################
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
#############################################################
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.utils import to_categorical





#### Data & Data-set 

In [4]:
response = requests.get('https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt')

#### Sample 

In [6]:
print(response.text[:117])

This is the 100th Etext file presented by Project Gutenberg, and
is presented in cooperation with World Library, Inc.


**<font color='orange'> Note :</font>**<br>
**Shakespear's actual text starts from around line 245 or 253 depending if the title is to be included for this project the<br>
title and author name will not be included.**

In [8]:
data = response.text.split('\n')
data[253:260]#sample

['  From fairest creatures we desire increase,',
 "  That thereby beauty's rose might never die,",
 '  But as the riper should by time decease,',
 '  His tender heir might bear his memory:',
 '  But thou contracted to thine own bright eyes,',
 "  Feed'st thy light's flame with self-substantial fuel,",
 '  Making a famine where abundance lies,']

#### Shakespeare's text

In [10]:
data = ' '.join(data[253:])
data[:100]#sample

"  From fairest creatures we desire increase,   That thereby beauty's rose might never die,   But as "

---

### Functions

#### <font color='orange'> Pre-processing fucntion</font>

In [14]:
def pre_process_text(corpus):
    '''
    Removes stop words from corpus.

    Args:
        text (str):      A corpus of text.
    
    Returns:
        tokens (list):     A processed list of tokens.
    '''
    # Initializing stop words
    stop_words = set(stopwords.words('english'))
    # Generating tokens
    tokens = corpus.split()
    # Removing stop words and converting to lower case
    tokens = [word.lower() for word in tokens if not word in stop_words]
    # Removing punctuations 
    tokens = [word for word in tokens if not word in string.punctuation]
    # Removing not alpha numeric characters
    tokens = [word for word in tokens if word.isalpha()]

    return tokens


#### <font color = 'orange'>Sequence_generator</font>

In [16]:
def sequence_generator(
              tokens,
              seqence_length,
             ):
    '''
            Generates an array of indices which reference the processed tokens 
    Args:
    tokens(list) : List of tokenized text
    seqence_length : User defined sequence length processed by the model

    Returns:
    seq(list): List of indices
    
    '''
    
    seq = []
    tokens_length = len(tokens)
    for i in range(seqence_length,tokens_length):
        start = i-seqence_length
        seq.append(' '.join(tokens[start:i]))
        
    
    return seq

#### <font color='orange'>convert_to_dict</font>

In [18]:
def convert_to_dict(tokenizer):
    '''
                Converts the word to index list of the tokenizer to a dictionary
                where the key is the index number and the value is the word.

    Args:
              tokenizer(keras.tokenizer) : Instance of the tokenizer object which provides the items list

    Returns:
            token_index_dict(dict) : Dictionary where the key is the index number and the value is the word
    
    '''
    token_index_dict = {}## token index dictionary
    for key,value in tokenizer.word_index.items():## Iterating over the word index list
        token_index_dict[value] = key## Making the value the key and the key the value
    
    return token_index_dict
        

#### <font color='orange'>predict_seqence</font>

In [20]:
def predict_seqence(
                    model,
                    tokenizer,
                    seqence_length,
                    origin_text,
                    number_of_words
                    ):
    '''
                Generates the predicted seqence of strings and concatenates them the the original text.
    Args:
              model : model used to predict the string sequence
              
              tokenizer : converts the texts to token sequences and provides index reference
              
              seqence_length (int) : Length of the predictor sequence
              
              origin_text (str) : Oringinal text fed to the model for prediction
              
              number_of_words (int) : Number of words to predict by the model
    
    Returns:
            (str) : Original text concatenated to the predicted text 
    '''
    predicted_text_sequence = [] ## List of predicted sequences
    token_index_dict = convert_to_dict(tokenizer) ## index to word dictionary
                
    for n in range(number_of_words):## Iterating to predict the number of words
        tokenized = tokenizer.texts_to_sequences(origin_text)[n] ## Tokenizing the origin text
        tokenized = pad_sequences(## Making sequences the same size
                                  [tokenized],#tokenized text
                                  maxlen = seqence_length[1],#lenth of the predicted sequence
                                  truncating = 'pre'#truncate sequence if it execeeds max_length
                                  )
                    
        predicted_index = model.predict(tokenized)## Predicting the index reference
        predicted_index = np.argmax(predicted_index)## Selecting the index with the highest probability
        predicted_word = token_index_dict[predicted_index]## Converting index to word
        
        
        origin_text = origin_text + ' ' + predicted_word ## Concatenating predictions
        predicted_text_sequence.append(origin_text)
        
        
    return ' '.join(origin_text)
    

            
                       

---

### <font color='orange'>Preprocessing data</font>

In [23]:
tokens = pre_process_text(corpus = data)
print(f'The total number of tokens is : {len(tokens)}')

The total number of tokens is : 371779


#### <font color='green'>Generating sequences</font>

In [25]:
sequences = sequence_generator(tokens,
                         seqence_length = 5)

print(f'The total number of sequences is : {len(sequences)}')

The total number of sequences is : 371774


#### <font color='green'>Tokenizing sequences<font>

In [27]:
#Note: 
#Tokenizing the list of sequences is just generating an array of indices whict represent a sequence
#each index of the sequence represents a word

sequence_tokenizer = Tokenizer()# Tokenizer instance
sequence_tokenizer.fit_on_texts(sequences)# Fitting tokenizer
tokenized_sequences = np.array(sequence_tokenizer.texts_to_sequences(sequences))#Generating tokenized sequences

### <font color='orange'> Splitting the data</font>

In [31]:
## The predictor X consists of  all the elemnents in the array except the last one
X = tokenized_sequences[:,:-1]

In [32]:
## The target y consists of only the last element in the array
y = tokenized_sequences[:,-1]
## Each y value is number wich represents a class so the targe y needs to be converted to categorcal values
## where the number of classes is equal to the vocabulary size.

## Since the word index starts at index 1, + 1 is added to represent the correct vocabulary size
vocabulary_size = len(sequence_tokenizer.word_index) +1 
y = to_categorical(y,num_classes = vocabulary_size)

#### <font color='green'> Training / Testig data<font>

In [43]:
X.shape[1]

4

In [45]:
X.shape[0]

371774

### <font color='orange'> Model</font>

In [49]:
## Seqentiol model
model = Sequential()

##########################################################################
## Embedding layer
model.add(
            Embedding(
                        input_dim = vocabulary_size, 
                        output_dim = X.shape[1],
                        input_length = X.shape[1]
                        )
         )
##########################################################################
## first LSTM layer
model.add(
            LSTM(
                  units = 100,
                  return_sequences = True
                )
            )
##########################################################################
## Second LSTM
model.add(
            LSTM(
                  units = 100,
                )
            )
############################################################################
## Dense Layer
model.add(
            Dense(
                   units = 100,
                   activation = 'relu'
                 )
            )
############################################################################
## Output layer
model.add(
            Dense(
                   units = vocabulary_size,
                   activation = 'softmax'
                 )
            )

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 4)              72936     
                                                                 
 lstm (LSTM)                 (None, 4, 100)            42000     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 18234)             1841634   
                                                                 
Total params: 2047070 (7.81 MB)
Trainable params: 2047070 (7.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#### <font color='green'>Compiling model</font>

In [51]:
model.compile(loss = 'categorical_crossentropy', optimizer='nadam', metrics = ['accuracy']) 




#### <font color='green'>Saving Model weights<font>

In [53]:
output_dir = 'model_output/lstm'
modelcheckpoint = ModelCheckpoint(filepath = output_dir+"/weights.{epoch:02d}.hdf5")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

### <font color='orange'>Training Model</font>

In [55]:
# EPOCHS = 1000
# BATCH_SIZE = 128

# model.fit(
#           X,y,
#           batch_size = BATCH_SIZE,
#           epochs = EPOCHS,
#           callbacks = [modelcheckpoint]
#    )

---

### <font color='orange'>Evaluating model</font>

#### <font color='green'>Loading Model</font>

In [59]:
model.load_weights(output_dir+"/weights.998.hdf5")

#### <font color='green'>Parameters</font>

In [61]:
model = model
tokenizer = sequence_tokenizer
seqence_length = (None, 4)
origin_text = '  From fairest creatures we desire increase'
number_of_words = 40

---

In [63]:
y = predict_seqence(
                    model,
                    tokenizer,
                    seqence_length,
                    origin_text,
                    number_of_words
                    )



#### <font color = 'green'>Generated text</font>

In [65]:
print(y)

    F r o m   f a i r e s t   c r e a t u r e s   w e   d e s i r e   i n c r e a s e   a n d   a n d   a n d   a b u n d a n t   l i b e r a l   a n d   a n d   a n d   t a r r y   w o u l d   a b u n d a n t   a n d   a n d   a n d   a n d   a n d   a b u n d a n t   a n d   t a r r y   a n d   a n d   a b u n d a n t   a n d   a n d   a n d   a n d   a n d   a n d   a n d   a n d   a n d   w o u l d   a b u n d a n t   a n d   a n d   w o u l d   a n d   a n d   a b u n d a n t   a n d
