In [None]:
""" importing datasets"""
import nltk
nltk.download('gutenberg')

from nltk.corpus import gutenberg
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Load the raw text of Hamlet from the NLTK Gutenberg corpus
data = gutenberg.raw('shakespeare-hamlet.txt')

# Save a local UTF-8 copy so later cells can re-read the text from disk
with open("hamlet.txt", "w", encoding="utf-8") as file:
    file.write(data)

# Split text into lines
lines = data.splitlines()

# A single constant drives both the headers and the slices, so the printed
# label always matches the number of lines actually shown.
PREVIEW_LINES = 10

print(f"----- FIRST {PREVIEW_LINES} LINES -----")
for line in lines[:PREVIEW_LINES]:
    print(line)

print(f"\n----- LAST {PREVIEW_LINES} LINES -----")
for line in lines[-PREVIEW_LINES:]:
    print(line)


In [None]:
# Re-read the saved corpus. The file was written as UTF-8, so it must be read
# back with the same encoding rather than the platform default.
with open('hamlet.txt', 'r', encoding='utf-8') as file:
    text = file.read().lower()

# Tokenize the text: fit the vocabulary on the whole corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because Keras word indices start at 1, leaving index 0 for padding
total_words = len(tokenizer.word_index) + 1
print(f"Total Words: {total_words}")

# Generate input sequences

In [None]:
# Build the training samples: for each non-blank line, every prefix of its
# token ids with at least two tokens becomes one n-gram sequence.
non_blank_lines = [row for row in text.split('\n') if row.strip()]
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(non_blank_lines)
    for end in range(2, len(encoded) + 1)
]

In [None]:
# Left-pad every n-gram to the length of the longest one so they stack into a matrix
max_sequence_len = max(map(len, input_sequences))
print(f"Max Sequence Length: {max_sequence_len}")
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# The last column is the word to predict; everything before it is the context
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Hold out 20% of the samples, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the integer targets over the whole vocabulary
y_train, y_test = (
    tf.keras.utils.to_categorical(labels, num_classes=total_words)
    for labels in (y_train, y_test)
)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

In [None]:
# Preview the first few padded context rows instead of dumping the whole matrix
X[:5]

In [None]:
# Preview the first few target word indices instead of dumping the whole vector
y[:5]

In [None]:
# Define the model: embedding -> LSTM -> dense hidden layer -> softmax over the vocabulary
model = tf.keras.Sequential([
    # Declaring the input shape up front builds the model immediately, so
    # `model.summary()` and `model.input_shape` work without a separate
    # `model.build(...)` call. It also replaces Embedding's deprecated
    # `input_length` argument.
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    tf.keras.layers.Embedding(input_dim=total_words, output_dim=100),
    tf.keras.layers.LSTM(150, return_sequences=False),
    tf.keras.layers.Dense(150, activation='relu'),
    tf.keras.layers.Dense(total_words, activation='softmax'),
])
# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# `history` keeps the per-epoch metrics for later inspection
history = model.fit(X_train, y_train, epochs=120, batch_size=64, validation_data=(X_test, y_test))

In [None]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model scores highest as the continuation of ``text``.

    Args:
        model: Object exposing ``predict(batch, verbose=0)`` that returns one
            row of per-index scores for each input row.
        tokenizer: Object exposing ``texts_to_sequences`` and a ``word_index``
            mapping of word -> integer index.
        text: Prompt string to continue.
        max_sequence_len: Context length plus one; the model is fed
            ``max_sequence_len - 1`` token ids.

    Returns:
        The word whose index has the highest score, or ``None`` when that
        index has no entry in ``tokenizer.word_index``.
    """
    context_len = max_sequence_len - 1
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep only the most recent tokens when the prompt is longer than the window
    if len(token_list) > context_len:
        token_list = token_list[-context_len:]
    padded = pad_sequences([token_list], maxlen=context_len, padding='pre')
    predicted = model.predict(padded, verbose=0)
    # argmax over the class axis yields a one-element array for this single-row
    # batch; convert it to a plain int so the lookup below compares scalars
    # instead of relying on the truthiness of an array.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [None]:
# Predict the word most likely to follow a single-word prompt
input_text = "who"
print(f"input text:{input_text}")
# The model consumes max_sequence_len - 1 tokens, so recover the full length from its input shape
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"next word prediction:{next_word}")

In [None]:
import pickle

# Persist the trained network
model.save("next_word_lstm.h5")

# Persist the fitted tokenizer so the same word -> index mapping can be reloaded
# alongside the model. The file is opened exactly once for writing.
# NOTE: pickle files can execute code on load; only unpickle this file from a trusted source.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
""" more texts """
# Predict the word most likely to follow another single-word prompt
input_text = "after"
print(f"input text:{input_text}")
# The model consumes max_sequence_len - 1 tokens, so recover the full length from its input shape
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"next word prediction:{next_word}")


In [None]:
""" more texts """
# Predict the word most likely to follow a multi-word prompt
input_text = "he will go"
print(f"input text:{input_text}")
# The model consumes max_sequence_len - 1 tokens, so recover the full length from its input shape
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"next word prediction:{next_word}")


In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training loss recorded by model.fit
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(history.history['loss'])
ax.set_xlabel("Epochs")
ax.set_ylabel("Training Loss")
ax.set_title("LSTM Training Loss")
ax.grid(True)
plt.show()
