In [None]:
!pip install -r /content/drive/MyDrive/GenAI_Udemy_Projects/LSTM_Text_Prediction/requirements.txt



In [None]:
# Data Collection
import nltk
import pandas as pd
from nltk.corpus import gutenberg

# Make sure the Gutenberg corpus shipped with NLTK is available locally.
nltk.download('gutenberg')

# Raw text of Hamlet, as one string.
data = gutenberg.raw('shakespeare-hamlet.txt')

# Keep a copy on disk; the preprocessing step reads it back from this file.
with open('hamlet.txt', mode='w') as file:
  file.write(data)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [None]:
## data preprocessing

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## Read the saved play back in, lower-cased
with open('hamlet.txt') as file:
  text = file.read().lower()

## Fit a word-level tokenizer on the whole text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size: one more than the number of distinct words, because word
# ids start at 1 and id 0 is left for padding.
total_words = 1 + len(tokenizer.word_index)
total_words

4818

In [None]:
# Disabled toy corpus. Uncommenting this block would rebind `text` to a handful
# of short sentences instead of the Hamlet text, which is handy for inspecting
# the tokenisation / n-gram steps on a tiny input.
# text = '''good morning
# i like apple
# you are the wizard
# my name is harry potter
# The sun is shining brightly today.
# Birds are singing in the trees.
# A gentle breeze is blowing.
# The flowers are blooming beautifully.
# The sky is a clear blue.
# Clouds drift slowly across it.
# A small dog runs happily.
# Children are playing in the park.
# Laughter fills the air.
# '''

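Here, we split the text into lines, convert each line into a sequence of integer word IDs, and then add every growing prefix of that sequence to the input sequences.
Each prefix contains a word together with all the words that precede it on the line. For the line "i am the voldemort" the growing prefixes are:
*   i
*   i am
*   i am the
*   i am the voldemort

(The code starts from the two-word prefix, because a single word has no preceding context to predict from.)

The sequences are then pre-padded with zeros so that they all have the same length.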

In [None]:
## create input sequence
# For every line, collect each leading slice of its token ids that holds at
# least two tokens: [w1, w2], [w1, w2, w3], ... up to the whole line.
input_sequence = []
for line in text.split('\n'):
  token_list = tokenizer.texts_to_sequences([line])[0]
  input_sequence.extend(
      token_list[:stop] for stop in range(2, len(token_list) + 1)
  )


In [None]:
# input_sequence

In [None]:
# Length of the longest n-gram; every sequence is padded to this size.
max_sequence_len = max(map(len, input_sequence))
max_sequence_len

14

In [None]:
# Zero-pad every n-gram (at the front, the pad_sequences default) to
# max_sequence_len and keep the result as a 2-D integer array.
input_sequence = np.array(
    pad_sequences(input_sequence, maxlen=max_sequence_len)
)
input_sequence

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [None]:

# for i in range(0,100):
#   print(input_sequence[i])

In [None]:
## create predictors and label
import tensorflow as tf

# All columns but the last are the context; the last column is the word to predict.
x = input_sequence[:, :-1]
y = input_sequence[:, -1]

In [None]:
# One-hot encode the target word ids (last column of the padded sequences).
# The labels are derived from `input_sequence` rather than from `y` itself so
# that re-running this cell does not one-hot encode an already one-hot matrix.
y = tf.keras.utils.to_categorical(input_sequence[:, -1], num_classes=total_words)

In [None]:
# Hold out 20% of the samples for validation. A fixed random_state makes the
# split (and therefore the reported metrics) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once the held-out loss has not improved for 10 epochs and roll back to
# the best weights seen. Monitoring the training loss instead would almost
# never trigger, because training loss keeps decreasing while the model overfits.
# Requires `validation_data` (or `validation_split`) to be passed to `model.fit`.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [None]:
## Train our LSTM rnn

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU

# Define the model: word embedding -> two stacked LSTMs (with dropout in
# between) -> softmax over the vocabulary.
model = Sequential([
    # No `input_length` here: the argument is deprecated (and ignored) in
    # Keras 3. The input shape is fixed by the model.build() call below.
    Embedding(total_words, 100),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(total_words, activation="softmax")
])

# Compile the model
model.compile(loss="categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])

# Inputs are the context part of each padded n-gram (everything but the last token).
model.build((None, max_sequence_len-1))
model.summary()

In [None]:
# Sanity-check the dimensions of the train / validation splits.
print("Training data shapes:")
for caption, split in (
    ("x_train shape:", x_train),
    ("y_train shape:", y_train),
    ("x_test shape:", x_test),
    ("y_test shape:", y_test),
):
    print(caption, split.shape)

Training data shapes:
x_train shape: (20585, 13)
y_train shape: (20585, 4818)
x_test shape: (5147, 13)
y_test shape: (5147, 4818)


In [58]:
# Train the model: up to 150 epochs in mini-batches of 64, evaluating on the
# held-out split after every epoch and applying the early-stopping callback.
history = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    batch_size=64,
    epochs=150,
    callbacks=[early_stopping],
)

Epoch 1/150
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 100ms/step - accuracy: 0.3199 - loss: 3.2635 - val_accuracy: 0.0573 - val_loss: 9.8229
Epoch 2/150
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 93ms/step - accuracy: 0.3305 - loss: 3.2310 - val_accuracy: 0.0575 - val_loss: 9.8982
Epoch 3/150
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 97ms/step - accuracy: 0.3368 - loss: 3.1893 - val_accuracy: 0.0563 - val_loss: 9.9735
Epoch 4/150
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 95ms/step - accuracy: 0.3412 - loss: 3.1572 - val_accuracy: 0.0579 - val_loss: 10.0347
Epoch 5/150
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 98ms/step - accuracy: 0.3448 - loss: 3.1384 - val_accuracy: 0.0540 - val_loss: 10.0959
Epoch 6/150
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 102ms/step - accuracy: 0.3472 - loss: 3.1287 - val_accuracy: 0.0573 - val_loss: 10.1701
Epoch

In [60]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model considers most likely to follow ``text``.

    Args:
        model: Object exposing ``predict(batch, verbose=0)`` that returns one
            row of per-word scores for each input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (and, if available, ``index_word``).
        text: The prompt whose continuation should be predicted.
        max_sequence_len: Length of the training n-grams; the model itself
            consumes ``max_sequence_len - 1`` context tokens.

    Returns:
        The predicted word, or ``None`` when the winning index does not map to
        any word (for example the padding index 0).
    """
    context_len = max_sequence_len - 1
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Keep only the most recent tokens when the prompt is longer than the context window.
    if len(token_list) > context_len:
        token_list = token_list[-context_len:]
    padded = pad_sequences([token_list], maxlen=context_len, padding='pre')

    predicted = model.predict(padded, verbose=0)
    # A single row was submitted, so take its argmax as a plain Python int
    # instead of comparing ints against a 1-element array.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Direct reverse lookup instead of scanning the whole vocabulary; fall back
    # to inverting word_index for tokenizer-like objects without index_word.
    index_word = getattr(tokenizer, 'index_word', None)
    if not index_word:
        index_word = {index: word for word, index in tokenizer.word_index.items()}
    return index_word.get(predicted_word_index)

In [61]:
# Demo: ask the model which word follows a well-known line.
input_text = "To be or not to be"
print(f"Input text:{input_text}")

# Recover the n-gram length from the model: its input width plus the target word.
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(
    model, tokenizer, input_text, max_sequence_len
)
print(f"Next Word PRediction:{next_word}")

Input text:To be or not to be
Next Word PRediction:that


In [64]:
import pickle

## Save the model (the .h5 extension selects the HDF5 file format)
model.save("next_word_lstm.h5")

## Save the tokenizer so the same vocabulary can be reloaded for inference
with open('tokenizer.pickle', mode='wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [65]:
# Demo: predict the continuation of a line taken from the play itself.
input_text = "  Barn. Last night of all,When yond same"
print(f"Input text:{input_text}")

# Recover the n-gram length from the model: its input width plus the target word.
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(
    model, tokenizer, input_text, max_sequence_len
)
print(f"Next Word PRediction:{next_word}")

Input text:  Barn. Last night of all,When yond same
Next Word PRediction:eyes
