In [3]:
import sys
import re
import numpy as np
import pandas as pd
import matplotlib as plt
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# np.set_printoptions(edgeitems=3)
# np.core.arrayprint._line_width = 80

In [4]:
path = 'data.txt'
data = open(path, 'r', encoding='utf-8').read().lower()
print('length of the corpus is: ', len(data))

length of the corpus is:  581888


In [5]:
# splitting the corpus into words

cleaned = re.sub(r'\W+', ' ', data).lower()
tokens = word_tokenize(cleaned)
print('Length of tokens: ' + str(len(tokens)))

# This includes all full stops and all the punctuation. We only need the words...
# tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
# words = tokenizer.tokenize(data)
# len(words)

Length of tokens: 109281


In [6]:
# Finding all the unique words

unique_words = np.unique(tokens)
len(unique_words)

8200

In [7]:
# Feature Engineering - converting the text into sequences 

train_len = 5
text_sequences = []
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)


In [8]:
text_sequences[0:10]

[['project', 'gutenberg', 's', 'the', 'adventures'],
 ['gutenberg', 's', 'the', 'adventures', 'of'],
 ['s', 'the', 'adventures', 'of', 'sherlock'],
 ['the', 'adventures', 'of', 'sherlock', 'holmes'],
 ['adventures', 'of', 'sherlock', 'holmes', 'by'],
 ['of', 'sherlock', 'holmes', 'by', 'arthur'],
 ['sherlock', 'holmes', 'by', 'arthur', 'conan'],
 ['holmes', 'by', 'arthur', 'conan', 'doyle'],
 ['by', 'arthur', 'conan', 'doyle', 'this'],
 ['arthur', 'conan', 'doyle', 'this', 'ebook']]

In [9]:
# Getting unique tokens

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
train_sequences = tokenizer.texts_to_sequences(text_sequences)
print('Found %s unique tokens.' % len(tokenizer.word_counts))
vocab =  len(tokenizer.word_counts)+1

Found 8200 unique tokens.


In [10]:
print(len(train_sequences))

109276


In [11]:
n_sequences = np.empty([len(train_sequences),train_len], dtype='int32')
for i in range(len(train_sequences)):
    n_sequences[i] = train_sequences[i]

x = n_sequences[:,:-1]
y = n_sequences[:,-1]

In [12]:
# Converting the input into binary

y = to_categorical(y,num_classes=vocab)

In [13]:
# Building the Model

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
model = Sequential()
model.add(Embedding(vocab,10, input_length=3))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50,activation='relu'))
model.add(Dense(vocab, activation='relu'))

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3, 10)             82010     
_________________________________________________________________
lstm (LSTM)                  (None, 3, 50)             12200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 8201)              418251    
Total params: 535,211
Trainable params: 535,211
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3, 10)             82010     
_________________________________________________________________
lstm (LSTM)                  (None, 3, 50)             12200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 8201)              418251    
Total params: 535,211
Trainable params: 535,211
Non-trainable params: 0
_________________________________________________________________


In [16]:
history = model.fit(x, y,validation_split=0.05,batch_size=128, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [15]:
text = str(input("Enter the phrase: ")).strip().lower()

Enter the phrase: adventures of sherlock


In [None]:
# Frunctions to predict the next word(s)

# # def predict(model,text):
# text = text.lower()
# encoded_text = tokenizer.texts_to_sequences([text])[0]
# # test_sequences = tokenizer.texts_to_sequences([text])
# pad_encoded = pad_sequences([encoded_text], maxlen=3, truncating='pre')
# for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
#     print(i)
#     pred_word = tokenizer.index_word[i]
#     print("Next word suggestion:",pred_word)

# predicted_word = model.predict(pad_encoded)
# label = predicted_word.argmax(axis=1)

# tokenizer.index_word[label[0]]

In [17]:
input_text = input().strip().lower()

adventures of sherlock


In [None]:
# predict(model, "adventures of sherlock holmes by")

In [None]:
# generate_text(model, "Harry took the invisibility", max_words=10)

In [17]:

def predict(model, text):
    input_text = text.lower()
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    pad_encoded = pad_sequences([encoded_text], maxlen=3, truncating='pre')
    # print(encoded_text, pad_encoded)
    for i in (model.predict(pad_encoded)[0]).argsort()[-1:][::-1]:
        pred_word = tokenizer.index_word[i]
        return pred_word

In [18]:
predict(model, "adventures of sherlock holmes by")

'the'

In [19]:
def generate_text(model, start_text, max_words =25):
    output = start_text
    for i in range(max_words):
        output+=" "+predict(model,output)
    return output

In [22]:
generate_text(model, "adventures of sherlock", max_words=4)

'adventures of sherlock and i was a'