# Import libraries

In [None]:
import tensorflow as tf
import string
import requests
import re 

# Requesting corpus from internet

In [None]:
# Download the plain-text Shakespeare corpus.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently using an
# error page as the training corpus.
response = requests.get(
    "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt",
    timeout=60,
)
response.raise_for_status()
data = response.text.split('\n')
data = data[253:]                           # From line 253 real text starts
data = " ".join(data)                       # collapse the remaining lines into one string

# Cleaning the text

In [None]:
def clean_text(doc):
    """Turn raw text into a list of lowercase, purely alphabetic words.

    The text is split on whitespace, every character in
    ``string.punctuation`` is removed from each piece, pieces that are not
    entirely alphabetic afterwards (empty strings, numbers, mixed
    alphanumerics) are dropped, and the survivors are lowercased.

    Args:
        doc: The text to clean.

    Returns:
        The cleaned words, in their original order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped_words = (raw.translate(strip_punctuation) for raw in doc.split())
    # Filter on the stripped word first, then lowercase the ones that are kept.
    return [word.lower() for word in stripped_words if word.isalpha()]

In [None]:
# Clean the whole corpus into a flat list of lowercase words.
tokens = clean_text(data)
print("The total number of unique words",len(set(tokens)))       
length = 50+1                       # 50 input words + 1 label word per training line
lines = []                          # filled with space-joined windows of `length` words

In [None]:
# Build overlapping windows of 51 words: in each window the first 50 words are
# the input sequence X and the 51st word is the label y.

# Window end indices run from `length` up to 100001 inclusive (or to the end of
# the corpus if that comes first), which caps the number of training lines.
last_end = min(len(tokens), 100001 + 1)
lines.extend(' '.join(tokens[end - length:end]) for end in range(length, last_end))

print(len(lines))

In [None]:
# Show how many training lines were built.
print(len(lines))

In [None]:
# Inspect the second training line (a space-joined window of words).
lines[1]

# Importing libraries for model part

In [None]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,SimpleRNN, GRU, Dropout,Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from keras.preprocessing.text import Tokenizer
import numpy as np
import math
import re
import time
import tensorflow as tf
from tensorflow import keras

# Creating tokens using text for feeding in model

In [None]:
# Fit a Keras Tokenizer on the training lines to build the word -> integer
# index mapping, then encode every line as a list of those integers.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

# Saving the tokenizer as a pickle

In [None]:
import pickle

# Persist the fitted tokenizer so the same word -> index mapping can be
# restored later without refitting.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Creating X and y to feed in model

In [None]:
# Stack the encoded lines into one 2-D array: every column except the last is
# the model input (X) and the last column is the next-word label (y).
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

In [None]:
# Inspect the first encoded input sequence.
X[0]

In [None]:
# One slot per word known to the tokenizer, plus one for index 0
# (Keras Tokenizer indices start at 1; 0 is what pad_sequences fills with).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# The import cells above never bring to_categorical into scope, so import it
# here; without this the next line fails with NameError.
from keras.utils import to_categorical

y = to_categorical(y, num_classes = vocab_size)     # changing y to categorical to feed in model

In [None]:
# Number of word indices in each input sequence.
X.shape[1]

In [None]:
# Length of every input sequence; reused below as the model's input length.
seq_length = X.shape[1]

# Creating model

In [None]:
# Hyperparameters read as globals by model_RNN.
num_word = vocab_size           # model_RNN passes num_word + 1 as the Embedding input dimension
input_length = 50               # NOTE(review): not referenced by the visible code; model_RNN uses max_pad_length
dropout_val = 0.2               # rate for every Dropout layer
Dense_layers = 256              # units in the hidden Dense layer
RNN_layer = 128                 # units in each recurrent layer
no_of_embeddings = 100          # Embedding output dimension
max_pad_length = seq_length     # Embedding input_length

In [None]:
def model_RNN(layer = "RNN"):
    """Build an uncompiled next-word prediction model.

    Every variant shares the same skeleton: an Embedding layer, a recurrent
    stack (each recurrent layer followed by Dropout), a ReLU Dense layer with
    Dropout, and a softmax Dense layer over the vocabulary. Only the
    recurrent stack differs between variants.

    Sizes are read from the module-level settings ``num_word``,
    ``no_of_embeddings``, ``max_pad_length``, ``RNN_layer``, ``dropout_val``,
    ``Dense_layers`` and ``vocab_size``.

    Args:
        layer: Which recurrent stack to use:
            "RNN"    - one SimpleRNN layer,
            "LSTM"   - two stacked LSTM layers,
            "GRU"    - two stacked GRU layers,
            "BILSTM" - two stacked bidirectional LSTM layers.

    Returns:
        A ``Sequential`` model that still needs to be compiled.

    Raises:
        ValueError: If ``layer`` is not one of the names above. (Previously
            an unknown name fell through every ``if`` and failed at
            ``return model`` with an unbound-variable error.)
    """
    # Factories rather than instances, so only the requested layers are built.
    # In two-layer stacks the first layer returns the full sequence so the
    # second recurrent layer has a sequence to consume.
    recurrent_stacks = {
        "RNN": lambda: [SimpleRNN(RNN_layer)],
        "LSTM": lambda: [LSTM(RNN_layer, return_sequences=True),
                         LSTM(RNN_layer)],
        "GRU": lambda: [GRU(RNN_layer, return_sequences=True),
                        GRU(RNN_layer)],
        "BILSTM": lambda: [Bidirectional(LSTM(RNN_layer, return_sequences=True)),
                           Bidirectional(LSTM(RNN_layer))],
    }
    if layer not in recurrent_stacks:
        raise ValueError(
            "Unknown layer type %r; expected one of %s"
            % (layer, ", ".join(sorted(recurrent_stacks)))
        )

    model = Sequential()
    # NOTE: num_word + 1 is kept as in the original so layer shapes (and any
    # previously saved weights) stay the same.
    model.add(Embedding(num_word + 1, no_of_embeddings, input_length=max_pad_length))
    for recurrent in recurrent_stacks[layer]():
        model.add(recurrent)
        model.add(Dropout(dropout_val))
    model.add(Dense(Dense_layers, activation='relu'))
    model.add(Dropout(dropout_val))
    model.add(Dense(vocab_size, activation='softmax'))
    return model


In [None]:
# Build the bidirectional-LSTM variant.
model = model_RNN("BILSTM")

In [None]:
# Print the model's layers and parameter counts.
model.summary()

# Compiling and fitting  model

In [None]:
# categorical_crossentropy pairs with the one-hot encoded y and the softmax output layer.
model.compile(loss = 'categorical_crossentropy', optimizer ='adam',metrics = ['accuracy'])

In [None]:
# Train on the full X / y set for 100 epochs in mini-batches of 256 samples.
model.fit(X,y, batch_size=256, epochs = 100)

# Saving model

In [None]:
# Write the trained model to model.h5 in the working directory.
model.save("model.h5")

# Predicting new lines

In [None]:
# Generate text greedily: repeatedly feed the words produced so far to the
# model and append the single most probable next word.
seed_text = ["What", "do", "we","know"]
next_words = 50

for _ in range(next_words):
    texts = ' '.join(seed_text)
    token_list = tokenizer.texts_to_sequences([texts])[0]
    # Left-pad (or truncate to the most recent words) to the length the model
    # was built with, instead of repeating the literal 50.
    token_list = pad_sequences([token_list], maxlen=seq_length, padding="pre")

    # Sequential.predict_classes() was removed from Keras; the argmax over the
    # softmax output is its documented replacement.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    # index_word is the tokenizer's reverse mapping, so no scan over
    # word_index is needed.
    output_word = tokenizer.index_word.get(predicted)
    if output_word is None:
        # Index 0 (padding) has no word. The input would be unchanged on the
        # next iteration, so every remaining prediction would be the same.
        break
    seed_text.append(output_word)
print(' '.join(seed_text))

# Loading the tokenizer 

In [None]:
# Restore the tokenizer that was pickled to disk.
# Only unpickle files you created yourself: loading a pickle can run arbitrary code.
with open('tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)