In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
from tensorflow.keras.models import load_model
import numpy as np
import pickle
from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt
import pickle
import os
import re

In [15]:
# Both datasets were downloaded from https://www.gutenberg.org/
# Read dataset1 as a UTF-8 text file. The context manager closes the handle
# even if reading fails part-way through.
with open("data/dataset1.txt", "r", encoding="utf8") as dataset1:
    # One list entry per physical line; each entry keeps its trailing newline.
    data_lines_1 = dataset1.readlines()

# Flatten the lines into one string separated by single spaces.
# A single join is enough: joining inside a loop over the lines rebuilds the
# entire string once per line for exactly the same result.
data1 = ' '.join(data_lines_1)

In [16]:
# Read dataset2 as a UTF-8 text file. The context manager closes the handle
# even if reading fails part-way through.
with open("data/dataset2.txt", "r", encoding="utf8") as dataset2:
    # One list entry per physical line; each entry keeps its trailing newline.
    data_lines_2 = dataset2.readlines()

# Flatten the lines into one string separated by single spaces.
# A single join is enough: joining inside a loop over the lines rebuilds the
# entire string once per line for exactly the same result.
data2 = ' '.join(data_lines_2)

In [17]:
# Merge the two corpora into one string, keeping a single space between them
data = ' '.join((data1, data2))

In [18]:
# Preview the start of the combined corpus (the slice starts at index 1, so the very first character is not shown)
data[1:100]

"\n Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle\n \n This eBook is for"

In [19]:
def cleaning(text):
    """Normalise raw corpus text into space-separated alphanumeric words.

    Args:
        text: Raw text, possibly containing line breaks, a BOM, quotes and
            other punctuation.

    Returns:
        A new string holding only ASCII letters, digits and single spaces,
        with no leading or trailing whitespace. Letter case is preserved.
    """
    # Line breaks become spaces (not nothing) so the last word of one line is
    # never glued onto the first word of the next.
    text = text.replace('\n', ' ').replace('\r', ' ')
    # Everything outside [a-zA-Z0-9 ] is deleted; this already covers the BOM
    # ('\ufeff'), curly quotes, commas and colons.
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)
    # split()/join collapses runs of spaces and trims both ends.
    return ' '.join(text.split())


# cleaning() has no side effects, so rebind `data` explicitly. Assigning
# (rather than ending the cell with a bare call) also avoids echoing the whole
# cleaned corpus as cell output.
data = cleaning(data)



In [13]:
# Size of the corpus held in `data`, measured in characters
print(f"Length of the corpus: {len(data)}")

Length of the corpus: 997593


In [6]:
# Build the word-level vocabulary from the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer so the prediction cell can reload it.
# The context manager guarantees the pickle is flushed and the handle closed.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the whole corpus as a single list of integer word ids
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[136, 4601, 1, 952, 4, 123, 33, 45, 523, 2104, 2105, 27, 953, 14, 22]

In [7]:
# Number of word ids in the encoded corpus
len(sequence_data)

107547

In [8]:
# Used below as the Embedding input size and the width of the softmax output layer:
# one slot per entry in tokenizer.word_index plus one extra
# (presumably for index 0, which Keras reserves — word ids start at 1; confirm against the Tokenizer docs)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)


8722


In [9]:
# Slide a 4-token window over the encoded corpus:
# three context word ids followed by the id of the word to predict.
sequences = [
    sequence_data[end - 3:end + 1]
    for end in range(3, len(sequence_data))
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  107544


array([[ 136, 4601,    1,  952],
       [4601,    1,  952,    4],
       [   1,  952,    4,  123],
       [ 952,    4,  123,   33],
       [   4,  123,   33,   45],
       [ 123,   33,   45,  523],
       [  33,   45,  523, 2104],
       [  45,  523, 2104, 2105],
       [ 523, 2104, 2105,   27],
       [2104, 2105,   27,  953]])

In [10]:
# Split every 4-token window into model input and target with array slicing,
# rather than iterating over the rows in Python and re-building the arrays.
# .copy() keeps X and y independent of `sequences`.
X = sequences[:, :3].copy()   # first three word ids -> input context
y = sequences[:, 3].copy()    # fourth word id       -> word to predict

In [11]:
# Show the first ten input contexts and the first ten target word ids
for label, values in (("Data: ", X), ("Response: ", y)):
    print(label, values[:10])

Data:  [[ 136 4601    1]
 [4601    1  952]
 [   1  952    4]
 [ 952    4  123]
 [   4  123   33]
 [ 123   33   45]
 [  33   45  523]
 [  45  523 2104]
 [ 523 2104 2105]
 [2104 2105   27]]
Response:  [ 952    4  123   33   45  523 2104 2105   27  953]


In [12]:
# One-hot encode the integer targets, as required by the categorical_crossentropy loss.
# The ndim guard keeps this cell safe to re-run: encoding an already one-hot
# matrix a second time would add a third axis of size vocab_size.
# NOTE(review): the result is a dense float32 matrix of len(y) x vocab_size
# (107544 x 8722 in the recorded run, roughly 3.7 GB) — watch memory.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [13]:
# Next-word classifier: embed three word ids, run two stacked LSTMs,
# then score every vocabulary index.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),        # full sequence out, feeds the second LSTM
    LSTM(1000),                               # only the last time step's output
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),  # one probability per vocabulary index
])

2022-11-02 12:42:18.945852: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# Print each layer's output shape and parameter count
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             87220     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 8722)              8730722   
                                                                 
Total params: 21,866,942
Trainable params: 21,866,942
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Training length and batch size, named so they are easy to tune.
# The epoch count is held down by local hardware limits; the plan is to raise
# it to 100 after AWS deployment.
EPOCHS = 70
BATCH_SIZE = 64

model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

# After each epoch, overwrite next_words.h5 only when the training loss improved.
checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)

history = model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[checkpoint])

Epoch 1/30
Epoch 1: loss improved from inf to 6.36619, saving model to next_words.h5
Epoch 2/30
Epoch 2: loss improved from 6.36619 to 5.76846, saving model to next_words.h5
Epoch 3/30
Epoch 3: loss improved from 5.76846 to 5.44920, saving model to next_words.h5
Epoch 4/30
Epoch 4: loss improved from 5.44920 to 5.19681, saving model to next_words.h5
Epoch 5/30
Epoch 5: loss improved from 5.19681 to 4.97158, saving model to next_words.h5
Epoch 6/30
Epoch 6: loss improved from 4.97158 to 4.75812, saving model to next_words.h5
Epoch 7/30
Epoch 7: loss improved from 4.75812 to 4.54101, saving model to next_words.h5
Epoch 8/30
Epoch 8: loss improved from 4.54101 to 4.31866, saving model to next_words.h5
Epoch 9/30
Epoch 9: loss improved from 4.31866 to 4.08644, saving model to next_words.h5
Epoch 10/30
Epoch 10: loss improved from 4.08644 to 3.84713, saving model to next_words.h5
Epoch 11/30
Epoch 11: loss improved from 3.84713 to 3.60203, saving model to next_words.h5
Epoch 12/30
Epoch 12:

In [None]:
# Per-epoch training loss recorded by model.fit, plotted against 1-based epoch numbers
loss = history.history["loss"]
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'y', label="Training loss")
ax.set_title("Training loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("loss")
ax.legend()
plt.show()

In [3]:
# Reload the checkpointed model and the pickled tokenizer from disk
model = load_model('next_words.h5')
# NOTE: unpickling can execute arbitrary code — only load a token.pkl that this
# notebook produced. The context manager closes the file handle after loading.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def next_word_predictor(model, tokenizer, text):
    """Predict, print and return the word most likely to follow ``text``.

    Args:
        model: Object whose ``predict`` returns per-word scores for an array
            of word-id sequences.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (and normally ``index_word``).
        text: The context to continue, in whatever form
            ``texts_to_sequences`` accepts as one text (the notebook passes a
            list of words).

    Returns:
        The predicted word, or "" when the winning index maps to no word
        (for example index 0).
    """
    sequence = np.array(tokenizer.texts_to_sequences([text]))
    # argmax runs over the flattened scores, which is correct because exactly
    # one sequence is scored per call.
    predicted_index = int(np.argmax(model.predict(sequence)))

    # Direct id -> word lookup instead of scanning every vocabulary entry.
    index_to_word = getattr(tokenizer, "index_word", None)
    if index_to_word is None:
        # Tokenizer without the inverse mapping: derive it from word_index.
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    predicted_word = index_to_word.get(predicted_index, "")

    print(predicted_word)
    return predicted_word

2022-11-02 15:16:08.313317: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Interactive demo: read a line and predict the word that follows its last
# three words. Entering "0" ends the loop.
while True:
    text = input("Enter your line: ")

    if text.strip() == "0":
        print("Execution completed.....")
        break

    try:
        # split() with no argument ignores repeated, leading and trailing
        # whitespace; split(" ") would produce empty-string "words" for them.
        text = text.split()
        # Training used three-word contexts, so keep only the last three words.
        text = text[-3:]
        print(text)

        next_word_predictor(model, tokenizer, text)

    except Exception as e:
        # Keep the demo running after a failed prediction, but show the reason.
        print("Error occurred: ", e)

Enter your line: how are you
['how', 'are', 'you']
getting
Enter your line: distracting factor which
['distracting', 'factor', 'which']
might
Enter your line: I had better
['I', 'had', 'better']
postpone
Enter your line: Can you please
['Can', 'you', 'please']
herself
Enter your line: what is going
['what', 'is', 'going']
on
Enter your line: this is my
['this', 'is', 'my']
friend
