In [43]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding,LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

## Load and preprocess data

In [44]:
path = "C:\\Users\\Anuruddha\\Desktop\\Future\\MACJINE_LEARNING\\NLP\\project2_Next_Word_Prediction\\pg1513.txt"
file = open(path,"r", encoding= "utf8")

#store lines in file as list

lines = []
for i in file:
    lines.append(i)
#print(lines)

In [45]:
#convert list as one string
data = ' '.join(lines)

In [46]:
#data

In [47]:
# remove unnessary things

data = data.replace('\n','').replace('\r','').replace('\ufeff','').replace('***','')

In [48]:
#data

In [49]:
#split demo

a = 'chamika dilshan,anuruddha'

ddd = a.split()

ddd

['chamika', 'dilshan,anuruddha']

In [50]:
# remove sapces

data_1 = data.split()
data = ' '.join(data_1)

In [51]:
data



In [52]:
len(data)

160205

## Apply Tokenization

In [56]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function
pickle.dump(tokenizer,open('token.pkl','wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]

In [58]:
sequence_data[:15]

[1, 53, 49, 306, 6, 12, 2, 22, 32, 967, 783, 16, 306, 8, 18]

In [59]:
len(sequence_data)

29352

In [61]:
print(tokenizer.word_index)



In [63]:
vocab_size = len(tokenizer.word_index) + 1 
print(vocab_size)

4282


## Build Training Set 

In [71]:
sequences = []

for i in range(3,len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)

In [72]:
sequences = np.array(sequences)

In [73]:
sequences

array([[   1,   53,   49,  306],
       [  53,   49,  306,    6],
       [  49,  306,    6,   12],
       ...,
       [4281,    3,  183,  207],
       [   3,  183,  207,  234],
       [ 183,  207,  234,  565]])

In [82]:
X = []
Y = []

for i in range(len(sequences)):
    x_data = sequences[i][0:3]
    y_data = sequences[i][3]
    X.append(x_data)
    Y.append(y_data)
    
    
X = np.array(X)
Y = np.array(Y)



In [83]:
X

array([[   1,   53,   49],
       [  53,   49,  306],
       [  49,  306,    6],
       ...,
       [4281,    3,  183],
       [   3,  183,  207],
       [ 183,  207,  234]])

In [84]:
Y

array([306,   6,  12, ..., 207, 234, 565])

In [85]:
Y = to_categorical(Y,num_classes=vocab_size)

In [86]:
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Creating Model 

In [88]:
model = Sequential()
model.add(Embedding(vocab_size,10,input_length=3)) # here 10 is output diamenstion
model.add(LSTM(1000,return_sequences=True)) #here 1000 is output dim and return_sequences=True means we creating next LSTM layer
model.add(LSTM(1000))
model.add(Dense(1000,activation='relu'))
model.add(Dense(vocab_size,activation='softmax'))


In [89]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             42820     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 4282)              4286282   
                                                                 
Total params: 17,378,102
Trainable params: 17,378,102
Non-trainable params: 0
_________________________________________________________________


In [92]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss',verbose=1,save_best_only=True)
model.compile(loss='categorical_crossentropy',optimizer=Adam(learning_rate=0.001))

In [93]:
model.fit(X,Y,epochs=10, batch_size=64, callbacks=[checkpoint])

Epoch 1/10
Epoch 1: loss improved from inf to 6.80339, saving model to next_words.h5
Epoch 2/10
Epoch 2: loss improved from 6.80339 to 6.41041, saving model to next_words.h5
Epoch 3/10
Epoch 3: loss improved from 6.41041 to 6.11299, saving model to next_words.h5
Epoch 4/10
Epoch 4: loss improved from 6.11299 to 5.85441, saving model to next_words.h5
Epoch 5/10
Epoch 5: loss improved from 5.85441 to 5.61032, saving model to next_words.h5
Epoch 6/10
Epoch 6: loss improved from 5.61032 to 5.37797, saving model to next_words.h5
Epoch 7/10
Epoch 7: loss improved from 5.37797 to 5.15655, saving model to next_words.h5
Epoch 8/10
Epoch 8: loss improved from 5.15655 to 4.92670, saving model to next_words.h5
Epoch 9/10
Epoch 9: loss improved from 4.92670 to 4.68816, saving model to next_words.h5
Epoch 10/10
Epoch 10: loss improved from 4.68816 to 4.41869, saving model to next_words.h5


<keras.callbacks.History at 0x20254bc4df0>

## Prediction 

In [98]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle


# load the model and tokenizer

model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl','rb'))


def Predict_Next_Words(model,tokenizer,text):
    
    sequence = tokenizer.texts_to_sequences([text])
    sequence = np.array(sequence)
    preds = np.argmax(model.predict(sequence))
    predicted_word = ""
    
    for key,value in tokenizer.word_index.items():
        if value == preds:
            predicted_word = key
            break
    print(predicted_word)
    return predicted_word
            

In [99]:
# user input 

while True:
    text = input("Enter your words:")
    if text == '0':
        print("Execution complted")
        break
        
    else:
        try:
            text= text.split() # text as list 
            text = text[-3:]  # get last 3 words 
            print(text)
            Predict_Next_Words(model,tokenizer,text)
            
        except Exception as e:
            print("Error occured:",e)
            continue   

Enter your words:The Project Gutenberg
['The', 'Project', 'Gutenberg']
literary
Enter your words:chanika
['chanika']
Error occured: in user code:

    File "E:\anaconda3_anuruddha\envs\mlenv\lib\site-packages\keras\engine\training.py", line 2137, in predict_function  *
        return step_function(self, iterator)
    File "E:\anaconda3_anuruddha\envs\mlenv\lib\site-packages\keras\engine\training.py", line 2123, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "E:\anaconda3_anuruddha\envs\mlenv\lib\site-packages\keras\engine\training.py", line 2111, in run_step  **
        outputs = model.predict_step(data)
    File "E:\anaconda3_anuruddha\envs\mlenv\lib\site-packages\keras\engine\training.py", line 2079, in predict_step
        return self(x, training=False)
    File "E:\anaconda3_anuruddha\envs\mlenv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
   