## Next Word Prediction using LSTM

In [1]:
## Data preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## Load the dataset
with open('hamlet.txt','r') as file:
    text=file.read().lower()

## Tokenize the text-creating indexes
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1
total_words

4818

In [2]:
## Create input sequences
input_sequences = []
for line in text.split('\n'):
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [3]:
## Pad Sequences
max_sequences_len = max([len(x) for x in input_sequences])
max_sequences_len

14

In [4]:
input_sequences=np.array(pad_sequences(input_sequences,maxlen=max_sequences_len,padding='pre'))

In [5]:
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [6]:
## Create predictors and labels

import tensorflow as tf
X,y = input_sequences[:,:-1],input_sequences[:,-1]


In [7]:
y

array([ 687,    4,   45, ..., 1047,    4,  193])

In [8]:
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       ...,
       [   0,    0,    0, ...,  687,    4,   45],
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4]])

In [9]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [10]:
## Splitting the data into training and test datasets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [11]:
# Define early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the model
model = Sequential()
model.add(Embedding(total_words, 100))  # Removed input_length
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation="softmax"))

# Compile the model
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

# Display the summary
model.summary()


In [14]:
## Train the model
history = model.fit(X_train,y_train,epochs=10,validation_data=(X_test,y_test),verbose=1)

Epoch 1/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 26ms/step - accuracy: 0.0454 - loss: 6.3651 - val_accuracy: 0.0441 - val_loss: 6.8481
Epoch 2/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.0502 - loss: 6.2407 - val_accuracy: 0.0455 - val_loss: 6.8657
Epoch 3/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.0529 - loss: 6.1243 - val_accuracy: 0.0470 - val_loss: 6.8912
Epoch 4/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 24ms/step - accuracy: 0.0561 - loss: 5.9606 - val_accuracy: 0.0525 - val_loss: 6.9127
Epoch 5/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.0633 - loss: 5.8283 - val_accuracy: 0.0600 - val_loss: 6.9613
Epoch 6/10
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 24ms/step - accuracy: 0.0735 - loss: 5.6910 - val_accuracy: 0.0606 - val_loss: 7.0328
Epoch 7/10
[1m6

In [15]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  # Ensure the sequence length matches max_sequence_len-1
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [21]:
input_text = 'Bar. I haue seene'
print(f"Input text:{ input_text}")
max_sequences_len = model.input_shape[1]+1
next_word = predict_next_word(model,tokenizer,input_text,max_sequences_len)
print(f"Next word:{next_word}")

Input text:Bar. I haue seene
Next word:to


In [18]:
model.save("next_word_lstm.h5")

## Save the tokenizer
import pickle
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

