# Dataset from Kaggle

Dataset: https://www.kaggle.com/datasets/shiv28/next-word

# Import libraries

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding , LSTM , Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

# Load and Pre-process the data

In [2]:
# Load data file
# NOTE: the path is an absolute, machine-specific location; adjust it to where
# the Kaggle text file lives on your machine.
# The `with` block closes the file handle as soon as the lines are read.
with open('D://Projects/Next Word Prediction/Data/next_word_prediction.txt','r',encoding='ISO-8859-1') as file:
    # Store file in list (one entry per line of the file)
    lines = list(file)

# Convert list to string: a single join of all lines, separated by one space
data = ' '.join(lines)

# Delete unwanted characters. Each one is removed outright (replaced by the
# empty string, not by a space), so text on either side of it is fused together.
lis = [',','\n','\r','\ufeff','_','-','.',';','!',':','*','?','â']
for i in lis:
    data = data.replace(i,'')

# Remove unnecessary space: collapse every run of whitespace into a single space
data = ' '.join(data.split())

In [3]:
# Inspect the segment of the cleaned corpus that lies between the first and
# second occurrence of the character '3'
data.split('3')[1]



In [4]:
# Keep only the part of the cleaned corpus that precedes the first '3' character.
# NOTE(review): the intent appears to be "the first 2 chapters", which holds only
# if the first '3' in the text is the chapter-3 heading — TODO confirm.
text = data.split('3')[0]

In [5]:
# Length, in characters, of the full cleaned corpus `data`
# (this is not the length of the shorter `text` excerpt the tokenizer is fitted on)
len(data)

908595

# Apply the Tokenization

In [6]:
# Create a Tokenizer with default settings and build its vocabulary from `text`
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

In [7]:
# Save the fitted tokenizer in a file so the prediction cells can reload it.
# The `with` block closes token.pkl as soon as the dump finishes, instead of
# leaving the handle open for the lifetime of the kernel.
with open('token.pkl','wb') as token_file:
    pickle.dump(tokenizer,token_file)

In [8]:
# Convert the text into a flat list of integer word ids.
# texts_to_sequences returns one sequence per input text; [0] selects the only one.
sequence_data = tokenizer.texts_to_sequences([text])[0]

In [9]:
# Preview the first 16 word ids of the encoded text
sequence_data[:16]

[87, 209, 10, 12, 5, 128, 210, 211, 8, 5, 88, 59, 17, 129, 3, 5]

In [10]:
# Number of word ids in the encoded text
len(sequence_data)

1651

In [11]:
# Size used for the embedding input and the softmax output: one more than the
# number of distinct words, because Keras Tokenizer word ids start at 1 and
# index 0 is reserved.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

557


In [12]:
# Slide a 4-token window over the encoded text, one step at a time:
# each window holds 3 context word ids followed by the id of the word to predict.
sequences = [
    sequence_data[start:start + 4]
    for start in range(len(sequence_data) - 3)
]
print(len(sequences))
sequences = np.array(sequences)
sequences[:10]

1648


array([[ 87, 209,  10,  12],
       [209,  10,  12,   5],
       [ 10,  12,   5, 128],
       [ 12,   5, 128, 210],
       [  5, 128, 210, 211],
       [128, 210, 211,   8],
       [210, 211,   8,   5],
       [211,   8,   5,  88],
       [  8,   5,  88,  59],
       [  5,  88,  59,  17]])

In [13]:
# Split the data into X , y:
# the first three ids of every window are the context (X),
# the fourth id is the word that follows it (y).
X = np.array([window[:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [14]:
# Convert y to 2D array: one-hot encode each target id into a row of length vocab_size.
# NOTE: this cell overwrites `y`, so it is not idempotent — re-run the cell that
# builds X and y before running this one again.
y = to_categorical(y,num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Creating the model

In [15]:
# Next-word classifier: 3 word ids -> 10-d embeddings -> two stacked LSTMs
# -> dense ReLU layer -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size,10,input_length=3),
    # return_sequences=True so the second LSTM receives every time step
    LSTM(1000,return_sequences=True),
    LSTM(1000),
    Dense(1000,activation='relu'),
    Dense(vocab_size,activation='softmax'),
])

In [16]:
# Print the layer-by-layer output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             5570      
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 557)               557557    
                                                                 
Total params: 13,612,127
Trainable params: 13,612,127
Non-trainable params: 0
_________________________________________________________________


# Compile and train our model

In [17]:
# Configure the optimizer and loss for multi-class next-word prediction.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
)

# Write next_word.h5 only when the monitored training loss improves.
checkpoint = ModelCheckpoint(
    'next_word.h5',
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

model.fit(X, y, batch_size=64, epochs=100, callbacks=[checkpoint])

Epoch 1/100
Epoch 1: loss improved from inf to 6.04863, saving model to next_word.h5
Epoch 2/100
Epoch 2: loss improved from 6.04863 to 5.70372, saving model to next_word.h5
Epoch 3/100
Epoch 3: loss improved from 5.70372 to 5.61146, saving model to next_word.h5
Epoch 4/100
Epoch 4: loss improved from 5.61146 to 5.57391, saving model to next_word.h5
Epoch 5/100
Epoch 5: loss improved from 5.57391 to 5.51063, saving model to next_word.h5
Epoch 6/100
Epoch 6: loss improved from 5.51063 to 5.47262, saving model to next_word.h5
Epoch 7/100
Epoch 7: loss improved from 5.47262 to 5.38976, saving model to next_word.h5
Epoch 8/100
Epoch 8: loss improved from 5.38976 to 5.34475, saving model to next_word.h5
Epoch 9/100
Epoch 9: loss improved from 5.34475 to 5.27989, saving model to next_word.h5
Epoch 10/100
Epoch 10: loss improved from 5.27989 to 5.23090, saving model to next_word.h5
Epoch 11/100
Epoch 11: loss improved from 5.23090 to 5.16935, saving model to next_word.h5
Epoch 12/100
Epoch 12

Epoch 38/100
Epoch 38: loss improved from 2.43376 to 2.35201, saving model to next_word.h5
Epoch 39/100
Epoch 39: loss improved from 2.35201 to 2.19292, saving model to next_word.h5
Epoch 40/100
Epoch 40: loss improved from 2.19292 to 1.99763, saving model to next_word.h5
Epoch 41/100
Epoch 41: loss improved from 1.99763 to 1.87872, saving model to next_word.h5
Epoch 42/100
Epoch 42: loss improved from 1.87872 to 1.76507, saving model to next_word.h5
Epoch 43/100
Epoch 43: loss improved from 1.76507 to 1.66190, saving model to next_word.h5
Epoch 44/100
Epoch 44: loss improved from 1.66190 to 1.55906, saving model to next_word.h5
Epoch 45/100
Epoch 45: loss improved from 1.55906 to 1.43357, saving model to next_word.h5
Epoch 46/100
Epoch 46: loss improved from 1.43357 to 1.29739, saving model to next_word.h5
Epoch 47/100
Epoch 47: loss improved from 1.29739 to 1.18777, saving model to next_word.h5
Epoch 48/100
Epoch 48: loss improved from 1.18777 to 1.05990, saving model to next_word.h5

Epoch 75/100
Epoch 75: loss did not improve from 0.20832
Epoch 76/100
Epoch 76: loss did not improve from 0.20832
Epoch 77/100
Epoch 77: loss improved from 0.20832 to 0.19399, saving model to next_word.h5
Epoch 78/100
Epoch 78: loss improved from 0.19399 to 0.17779, saving model to next_word.h5
Epoch 79/100
Epoch 79: loss improved from 0.17779 to 0.16868, saving model to next_word.h5
Epoch 80/100
Epoch 80: loss improved from 0.16868 to 0.15017, saving model to next_word.h5
Epoch 81/100
Epoch 81: loss improved from 0.15017 to 0.12203, saving model to next_word.h5
Epoch 82/100
Epoch 82: loss improved from 0.12203 to 0.11046, saving model to next_word.h5
Epoch 83/100
Epoch 83: loss improved from 0.11046 to 0.10486, saving model to next_word.h5
Epoch 84/100
Epoch 84: loss did not improve from 0.10486
Epoch 85/100
Epoch 85: loss improved from 0.10486 to 0.09371, saving model to next_word.h5
Epoch 86/100
Epoch 86: loss improved from 0.09371 to 0.08682, saving model to next_word.h5
Epoch 87/1

<keras.callbacks.History at 0x21976790fa0>

# Prediction

In [18]:
# Load the model and tokenizer
model = load_model('next_word.h5')
# NOTE: pickle.load can execute arbitrary code from the file it reads; only
# load a token.pkl that you created yourself.
# The `with` block closes the file handle once the tokenizer is loaded.
with open('token.pkl','rb') as token_file:
    tokenizer = pickle.load(token_file)

In [19]:
def predict_next_word(model,tokenizer,text):
    """Predict the word that follows `text`, print it, and return it.

    Args:
        model: Object whose ``predict`` method takes a 2-D array of word ids
            and returns an array of scores.
        tokenizer: Object providing ``texts_to_sequences`` and a ``word_index``
            mapping of word -> integer id.
        text: The context to continue, in whatever form
            ``tokenizer.texts_to_sequences`` accepts as a single item.

    Returns:
        The word whose id has the highest score. Returns "" when none of the
        context words is known to the tokenizer, or when the winning id has no
        entry in ``word_index``.
    """
    sequence = tokenizer.texts_to_sequences([text])
    sequence = np.array(sequence)
    predicted_word = ""

    # If every context word was unknown, the encoded row is empty and there is
    # nothing meaningful to feed the model, so skip the prediction entirely.
    if sequence.size > 0:
        preds = np.argmax(model.predict(sequence))
        # Reverse lookup: first word whose id equals the predicted index.
        predicted_word = next(
            (key for key , value in tokenizer.word_index.items() if value == preds),
            "",
        )

    print(predicted_word)
    return predicted_word

In [27]:
# Interactive prompt: read a line, predict the word after its last three
# words, and repeat until the user enters 0.
while True:
    text = input('Enter your line')
    if text == '0':
        print('Execution completed')
        break
    try:
        # split() with no argument collapses runs of whitespace and ignores
        # leading/trailing blanks, so empty strings never take up a slot in
        # the three-word context window.
        text = text.split()
        text = text[-3:]
        print(text)
        predict_next_word(model,tokenizer,text)
    except Exception as e:
        # Keep the prompt alive after a failed prediction and show the reason.
        print('error eccurred: ',e)

Enter your lineimaginary Vanity and
['imaginary', 'Vanity', 'and']
did
Enter your linewere married to
['were', 'married', 'to']
an
Enter your linecharming amusement for
['charming', 'amusement', 'for']
i
Enter your line0
Execution completed
