# General Imports

In [5]:
import sys
import re
import numpy as np
import pandas as pd
import matplotlib as plt
import os.path
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# np.set_printoptions(edgeitems=3)
# np.core.arrayprint._line_width = 80

# Data Manipulation

## Load and Tokenize Data

In [6]:
path = 'data.txt'
data = open(path, 'r', encoding='utf-8').read().lower()
print('length of the corpus is: ', len(data))

length of the corpus is:  581888


In [8]:
# splitting the corpus into words

cleaned = re.sub(r'\W+', ' ', data).lower()
tokens = word_tokenize(cleaned)
print('Length of tokens: ' + str(len(tokens)))

# This includes all full stops and all the punctuation. We only need the words...
# tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
# words = tokenizer.tokenize(data)
# len(words)

Length of tokens: 109281


In [9]:
# Finding all the unique words

unique_words = np.unique(tokens)
len(unique_words)

8200

## Chunk Data Into Sequences

In [43]:
# Feature Engineering - converting the text into numerical values

train_len = 5
text_sequences = []
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)


In [44]:
text_sequences[0:10]

[['project', 'gutenberg', 's', 'the', 'adventures'],
 ['gutenberg', 's', 'the', 'adventures', 'of'],
 ['s', 'the', 'adventures', 'of', 'sherlock'],
 ['the', 'adventures', 'of', 'sherlock', 'holmes'],
 ['adventures', 'of', 'sherlock', 'holmes', 'by'],
 ['of', 'sherlock', 'holmes', 'by', 'arthur'],
 ['sherlock', 'holmes', 'by', 'arthur', 'conan'],
 ['holmes', 'by', 'arthur', 'conan', 'doyle'],
 ['by', 'arthur', 'conan', 'doyle', 'this'],
 ['arthur', 'conan', 'doyle', 'this', 'ebook']]

In [45]:
# Getting unique tokens

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
train_sequences = tokenizer.texts_to_sequences(text_sequences)
print('Found %s unique tokens.' % len(tokenizer.word_counts))
vocab =  len(tokenizer.word_counts)+1

Found 8200 unique tokens.


In [46]:
print(len(train_sequences))

109276


In [47]:
# creating an empty array

n_sequences = np.empty([len(train_sequences),train_len], dtype='int32')
for i in range(len(train_sequences)):
    n_sequences[i] = train_sequences[i]

x = n_sequences[:,:-1]
y = n_sequences[:,-1]

In [48]:
# Converting the input into binary

y = to_categorical(y,num_classes=vocab)

# Model

## Create Model

In [49]:
# Building the Model

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [50]:
modelPath = "c542Model"
loadSavedModel = os.path.exists(modelPath)
model = None
if loadSavedModel:
    print("Loading saved model.")
    load_model(modelPath)
else:
    model = Sequential()
    model.add(Embedding(vocab,10, input_length=train_len-1))
    model.add(LSTM(50,return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50,activation='relu'))
    model.add(Dense(vocab, activation='relu'))

In [51]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 4, 10)             82010     
                                                                 
 lstm_10 (LSTM)              (None, 4, 50)             12200     
                                                                 
 lstm_11 (LSTM)              (None, 50)                20200     
                                                                 
 dense_10 (Dense)            (None, 50)                2550      
                                                                 
 dense_11 (Dense)            (None, 8201)              418251    
                                                                 
Total params: 535,211
Trainable params: 535,211
Non-trainable params: 0
_________________________________________________________________


In [52]:
if not loadSavedModel:
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 4, 10)             82010     
                                                                 
 lstm_10 (LSTM)              (None, 4, 50)             12200     
                                                                 
 lstm_11 (LSTM)              (None, 50)                20200     
                                                                 
 dense_10 (Dense)            (None, 50)                2550      
                                                                 
 dense_11 (Dense)            (None, 8201)              418251    
                                                                 
Total params: 535,211
Trainable params: 535,211
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [53]:
if not loadSavedModel:
    history = model.fit(x, y,validation_split=0.05,batch_size=128, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [54]:
if not loadSavedModel:
    print("Saving Model")
    model.save(modelPath)

Saving Model




INFO:tensorflow:Assets written to: c542Model\assets


INFO:tensorflow:Assets written to: c542Model\assets


# Testing The Model

## Interactive Testing

In [55]:
text = str(input("Enter the phrase: ")).strip().lower()

Enter the phrase: HOw are you doing


In [90]:
# Frunctions to predict the next word(s)

def predict(model,text):
    text = text.lower()
    encoded_text = tokenizer.texts_to_sequences([text])[0]
    test_sequences = tokenizer.texts_to_sequences([text])
    pad_encoded = pad_sequences([encoded_text], maxlen=train_len-1, truncating='pre')
    for i in (model.predict(pad_encoded)[0]).argsort()[-(train_len-1):][::-1]:
        print(i)
        pred_word = tokenizer.index_word[i]
        print("Next word suggestion:",pred_word)

    predicted_word = model.predict(pad_encoded).argsort()
    label = predicted_word.argmax(axis=1)
    #tokenizer.index_word[label[0]]

In [91]:
predict(model, "adventures of sherlock holmes by")

8200
Next word suggestion: newsletter
2562
Next word suggestion: factor
2724
Next word suggestion: handy
2725
Next word suggestion: grown
[[   0 5474 5473 ... 2724 2562 8200]]
newsletter


In [109]:
def generate_text(model, text, max_words=5):
    text = text.lower()
    encoded_text = tokenizer.texts_to_sequences([text])[0]
    test_sequences = tokenizer.texts_to_sequences([text])
    base = pad_sequences([encoded_text], maxlen=train_len-1, truncating='pre')
    print(base)
    results = []
    for i in range(max_words):
        nextToken = model.predict(base)[0].argsort()[-1]
        nextWord = tokenizer.index_word[nextToken]
        results.append(nextWord)
        base = pad_sequences([np.append(base[0],[nextToken])], maxlen=train_len-1, truncating='pre')
        print(base)
    prediction = " ".join(results)
    print(f"Prediction: {prediction}")
    tokenizer.index_word[label[0]]

In [110]:
generate_text(model, "Harry took the invisibility", max_words=10)

[[  0   0 152   1]]
[[   0  152    1 8200]]
[[ 152    1 8200 8200]]
[[   1 8200 8200 8200]]
[[8200 8200 8200 8200]]
[[8200 8200 8200 8200]]
[[8200 8200 8200 8200]]
[[8200 8200 8200 8200]]
[[8200 8200 8200 8200]]
[[8200 8200 8200 8200]]
[[8200 8200 8200 8200]]
Prediction: newsletter newsletter newsletter newsletter newsletter newsletter newsletter newsletter newsletter newsletter


NameError: name 'label' is not defined

In [59]:
input_text = input().strip().lower()
encoded_text = tokenizer.texts_to_sequences([input_text])[0]
pad_encoded = pad_sequences([encoded_text], maxlen=3, truncating='pre')
# print(encoded_text, pad_encoded)
for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
    pred_word = tokenizer.index_word[i]
    print("Next word suggestion:",pred_word)

adventures of sherlock
[983, 5, 125] [[983   5 125]]
8200
Next word suggestion: newsletter
2562
Next word suggestion: factor
2724
Next word suggestion: handy
