In [1]:
## Data collection
import nltk 
nltk.download('gutenberg')
from nltk.corpus import gutenberg 
import pandas as pd


## Load the dataset
data = gutenberg.raw('shakespeare-hamlet.txt')

## Save to a file
with open('hamlet.txt', 'w') as file:
    file.write(data)
    

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\fmssh\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
## Data preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## Load the dataset and apply padding sequences and tokenizer

with open('hamlet.txt', 'r') as file:
    text = file.read().lower()

## Tokenize the text

tokenizer = Tokenizer()  ## Creating indexes for the words
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1
total_words

4818

In [5]:
## Creating input sequences

input_sequences =[]
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [7]:
## Apply pad sequences
max_sequence_len=max([len(x) for x in input_sequences])
max_sequence_len

14

In [11]:
## Converting into an array
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [12]:
## Create predictors and label

import tensorflow as tf
x,y =input_sequences[:,:-1], input_sequences[:,-1]

In [17]:
## Creating categorical features
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
## Splitting the dataset into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [None]:
## Train the LSTM RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

## Define the model
model = Sequential()
model.add(Embedding(total_words,100))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation="softmax"))

## Build the model
model.build(input_shape=(None, max_sequence_len))

## Compile the model 
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model.summary()

In [28]:
## Training the model 
history = model.fit(x_train, y_train, epochs=50, validation_data=(x_test, y_test), verbose=1)


Epoch 1/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.0259 - loss: 7.1452 - val_accuracy: 0.0352 - val_loss: 6.7278
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.0364 - loss: 6.4618 - val_accuracy: 0.0468 - val_loss: 6.7937
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.0439 - loss: 6.3186 - val_accuracy: 0.0538 - val_loss: 6.8277
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.0496 - loss: 6.1837 - val_accuracy: 0.0534 - val_loss: 6.8795
Epoch 5/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.0507 - loss: 6.0690 - val_accuracy: 0.0544 - val_loss: 6.8827
Epoch 6/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.0564 - loss: 5.9455 - val_accuracy: 0.0571 - val_loss: 6.9246
Epoch 7/50
[1m644/64

In [30]:
## Importing EarlyStopping callback to monitor validation loss
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience =5, restore_best_weights = True)

In [31]:
## Training the model again for 100 epochs
history = model.fit(x_train, y_train, epochs=100, validation_data=(x_test, y_test), verbose=1)


Epoch 1/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.4516 - loss: 2.4982 - val_accuracy: 0.0544 - val_loss: 11.3436
Epoch 2/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.4604 - loss: 2.4480 - val_accuracy: 0.0548 - val_loss: 11.3980
Epoch 3/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.4613 - loss: 2.4322 - val_accuracy: 0.0528 - val_loss: 11.4337
Epoch 4/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.4716 - loss: 2.3941 - val_accuracy: 0.0534 - val_loss: 11.5407
Epoch 5/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.4810 - loss: 2.3587 - val_accuracy: 0.0548 - val_loss: 11.6042
Epoch 6/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.4934 - loss: 2.3178 - val_accuracy: 0.0528 - val_loss: 11.6757
Epoch 7/10

In [33]:
## Function to predict the next word
def predict_next(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]  ## Ensure sequence length matches max_sequence_len -1
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None 

In [None]:
## Trying an input to see how the model performs in predicting next word
input_text = "To be or not to be"
print(f"Input Text: {input_text} ")
max_sequence_len=model.input_shape[1]+1
next_word = predict_next(model, tokenizer, input_text, max_sequence_len)
print(f"Next Word Prediction: {next_word}")

Input Text: To be or not to be 
Next Word Prediction: nothing


In [39]:
## Saving the model
model.save("Next_Word_LSTM.h5")

## Saving the tokenizer as a .pkl file

import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)  ## Highest protocol minimizes file size and speeds up serialization



In [41]:
## Trying another input 

input_text = "To offer it the shew of"  ## input taken from dataset, line: 196
print(f"Input Text: {input_text} ")
max_sequence_len=model.input_shape[1]+1
next_word = predict_next(model, tokenizer, input_text, max_sequence_len)
print(f"Next Word Prediction: {next_word}")

Input Text: To offer it the shew of 
Next Word Prediction: violence


In [42]:
## Trying another input 
input_text = "For it is as the Ayre,"  ## input taken from dataset, line: 197
print(f"Input Text: {input_text} ")
max_sequence_len=model.input_shape[1]+1
next_word = predict_next(model, tokenizer, input_text, max_sequence_len)
print(f"Next Word Prediction: {next_word}")

Input Text: For it is as the Ayre, 
Next Word Prediction: invulnerable
