<a href="https://colab.research.google.com/github/Arvind6446/RNNMachineLearning/blob/main/hamlet_lstm_nextword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Next-word Prediction with an LSTM (Hamlet)

This notebook trains an LSTM language model on *Shakespeare's Hamlet* (NLTK Gutenberg corpus) using 2-gram and 3-gram training windows after lemmatization.

Outputs:
- `hamlet_nextword_best.keras` (best checkpoint by `val_loss`)
- `hamlet_nextword.keras` (final saved model)
- `tokenizer.json` (tokenizer configuration)


In [11]:
# NLTK downloads (run once)
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import os
import math
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TerminateOnNaN


In [8]:
# ---------------------------
# Load Hamlet from NLTK Gutenberg
# ---------------------------
data = gutenberg.raw('shakespeare-hamlet.txt')


In [9]:

class AdvanceLSTMRNN:
    def __init__(self, seed: int = 42):
        print("Initializing LSTM RNN Model Training")
        self.total_words = None
        self.tokenizer = None
        self.max_sequence_len = None
        self.model = None
        self.lemmatizer = WordNetLemmatizer()
        self.n_sequences = 0

        # Reproducibility
        np.random.seed(seed)
        tf.random.set_seed(seed)

    # ---------------------------
    # Dataset I/O
    # ---------------------------
    def loadDataSet(self):
        with open('hamlet.txt', 'w', encoding='utf-8') as f:
            f.write(data)

        with open('hamlet.txt', 'r', encoding='utf-8') as f:
            text = f.read()

        self.tokenizeDataSet(text)

    # ---------------------------
    # Preprocessing
    # ---------------------------
    def _lemmatize_line(self, line: str) -> str:
        tokens = word_tokenize(line.lower())
        lemmas = [self.lemmatizer.lemmatize(t) for t in tokens if t.isalpha()]
        return " ".join(lemmas)

    def tokenizeDataSet(self, text: str):
        # Lemmatize per line
        lemmatized_lines = []
        for line in text.split('\n'):
            cleaned = self._lemmatize_line(line)
            if cleaned.strip():
                lemmatized_lines.append(cleaned)

        # Tokenizer with OOV handling
        self.tokenizer = Tokenizer(oov_token="<OOV>")
        self.tokenizer.fit_on_texts(lemmatized_lines)
        self.total_words = len(self.tokenizer.word_index) + 1
        print("Total words (lemmatized): ", self.total_words)

        # Build 2-gram and 3-gram sequences
        input_sequences = self.createInputSequences(lemmatized_lines)

        # Pad and create predictors/labels
        self.padSequences(input_sequences)

    def createInputSequences(self, lines):
        input_sequences = []
        for line in lines:
            token_list = self.tokenizer.texts_to_sequences([line])[0]
            if len(token_list) < 2:
                continue
            # exact 2-gram and 3-gram windows
            for n in (2, 3):
                if len(token_list) >= n:
                    for i in range(n, len(token_list) + 1):
                        ngram = token_list[i - n:i]
                        input_sequences.append(ngram)
        self.n_sequences = len(input_sequences)
        print("Total sequences (2&3-gram): ", self.n_sequences)
        return input_sequences

    def padSequences(self, input_sequences):
        # Max length is 3 by design (2 or 3 grams)
        self.max_sequence_len = max(len(x) for x in input_sequences)  # should be 3
        input_sequences = np.array(
            pad_sequences(input_sequences, maxlen=self.max_sequence_len, padding='pre')
        )
        self.createPredictorandLabels(input_sequences)

    def createPredictorandLabels(self, input_sequences):
        x, y = input_sequences[:, :-1], input_sequences[:, -1]
        y = to_categorical(y, num_classes=self.total_words)

        x_train, x_val, y_train, y_val = train_test_split(
            x, y, test_size=0.1, random_state=42, shuffle=True
        )
        print("Shapes -> X_train:", x_train.shape, "y_train:", y_train.shape)

        # Dynamic hyperparameters based on vocab size & dataset size
        hparams = self.suggest_hparams()

        # Callbacks: configure EarlyStopping on/off and patience
        callbacks = self.build_callbacks(
            use_early_stopping=True,         # set False to disable EarlyStopping
            es_patience=hparams['es_patience'],
            checkpoint_path="hamlet_nextword_best.keras"  # Keras-native checkpoint
        )

        # Build & train LSTM
        self.build_and_train(x_train, y_train, x_val, y_val,
                             embed_dim=hparams['embed_dim'],
                             lstm_units=hparams['lstm_units'],
                             dropout_rate=hparams['dropout_rate'],
                             epochs=hparams['epochs'],
                             batch_size=hparams['batch_size'],
                             initial_lr=hparams['initial_lr'],
                             decay_steps=hparams['decay_steps'],
                             decay_rate=hparams['decay_rate'],
                             clipnorm=hparams['clipnorm'],
                             callbacks=callbacks)

        # Save trained model in Keras format and tokenizer
        self.save_model_keras(
            keras_path="hamlet_nextword.keras",
            save_tokenizer=True,
            tokenizer_path="tokenizer.json"
        )

    # ---------------------------
    # Dynamic hyperparameters
    # ---------------------------
    def suggest_hparams(self):
        V = self.total_words
        N = self.n_sequences

        # Embedding dimension: sublinear in vocabulary size
        embed_dim = int(np.clip(round(6 * math.log2(max(V, 4))), 64, 256))
        # LSTM units: slightly higher than embed_dim
        lstm_units = int(np.clip(round(12 * math.log2(max(V, 4))), 128, 384))
        # Dropout depends on dataset size
        dropout_rate = 0.15 if N > 200_000 else 0.25
        # Batch size scales with dataset size
        batch_size = 64 if N < 50_000 else (128 if N < 200_000 else 256)
        # Epochs: fewer for larger datasets
        epochs = 20 if N < 50_000 else (15 if N < 200_000 else 10)
        # Learning rate schedule
        initial_lr = 3e-3 if N < 50_000 else 2e-3
        decay_steps = max(1000, N // max(batch_size, 1))
        decay_rate = 0.5
        # EarlyStopping patience
        es_patience = 4 if epochs <= 15 else 6

        return {
            'embed_dim': embed_dim,
            'lstm_units': lstm_units,
            'dropout_rate': dropout_rate,
            'batch_size': batch_size,
            'epochs': epochs,
            'initial_lr': initial_lr,
            'decay_steps': decay_steps,
            'decay_rate': decay_rate,
            'clipnorm': 1.0,
            'es_patience': es_patience,
        }

    # ---------------------------
    # Optimizer & Callbacks
    # ---------------------------
    def make_optimizer(self, initial_lr, decay_steps, decay_rate, clipnorm=1.0):
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
            staircase=True
        )
        return tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=clipnorm)

    def build_callbacks(self, use_early_stopping: bool = True, es_patience: int = 4,
                        checkpoint_path: str = "best_model.keras"):
        callbacks = [
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=5e-5, verbose=1),
            ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss',
                            save_best_only=True, save_weights_only=False, verbose=1),
            TerminateOnNaN()
        ]
        if use_early_stopping:
            callbacks.append(EarlyStopping(monitor='val_loss', patience=es_patience,
                                           restore_best_weights=True, verbose=1))
        return callbacks

    # ---------------------------
    # Build & Train
    # ---------------------------
    def build_and_train(self, X_train, y_train, X_val, y_val,
                        embed_dim=128, lstm_units=256, dropout_rate=0.2,
                        epochs=15, batch_size=256,
                        initial_lr=0.001, decay_steps=1000, decay_rate=0.5,
                        clipnorm=1.0, callbacks=None):
        model = Sequential()
        model.add(Embedding(self.total_words, embed_dim, input_length=self.max_sequence_len - 1))
        model.add(LSTM(lstm_units))
        model.add(Dropout(dropout_rate))
        model.add(Dense(self.total_words, activation='softmax'))

        optimizer = self.make_optimizer(initial_lr, decay_steps, decay_rate, clipnorm=clipnorm)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        self.model = model
        print(self.model.summary())

        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs, batch_size=batch_size,
            callbacks=(callbacks or []),
            verbose=1
        )
        print("Training complete.")

        # Report the best validation loss and perplexity
        import math as _math
        best_val_loss = min(history.history.get('val_loss', [np.nan]))
        if not np.isnan(best_val_loss):
            print(f"Best val_loss: {best_val_loss:.4f} | Perplexity ≈ {_math.exp(best_val_loss):.2f}")

    # ---------------------------
    # Save model in .keras + tokenizer
    # ---------------------------
    def save_model_keras(self, keras_path: str = "hamlet_nextword.keras",
                         save_tokenizer: bool = True,
                         tokenizer_path: str = "tokenizer.json"):
        if self.model is None:
            raise RuntimeError("Model is not trained yet. Train before saving.")

        self.model.save(keras_path)
        print(f"✅ Saved Keras model: {keras_path}")

        if save_tokenizer and self.tokenizer is not None:
            tok_json = self.tokenizer.to_json()
            with open(tokenizer_path, "w", encoding="utf-8") as f:
                f.write(tok_json)
            print(f"✅ Saved tokenizer: {tokenizer_path}")

    # ---------------------------
    # Simple generation helper
    # ---------------------------
    def generate_text(self, seed_text: str, next_words: int = 20, temperature: float = 1.0):
        def sample_with_temperature(preds, temp):
            preds = np.asarray(preds).astype('float64')
            preds = np.log(preds + 1e-8) / max(temp, 1e-6)
            exp_preds = np.exp(preds)
            preds = exp_preds / np.sum(exp_preds)
            return np.argmax(np.random.multinomial(1, preds, 1))

        seed_lemmatized = self._lemmatize_line(seed_text)
        for _ in range(next_words):
            token_list = self.tokenizer.texts_to_sequences([seed_lemmatized])[0]
            token_list = pad_sequences([token_list], maxlen=self.max_sequence_len - 1, padding='pre')
            preds = self.model.predict(token_list, verbose=0)[0]
            next_index = (sample_with_temperature(preds, temperature) if temperature and temperature != 1.0
                          else np.argmax(preds))
            next_word = self.tokenizer.index_word.get(next_index, '')
            if not next_word:
                break
            seed_lemmatized += " " + next_word
        return seed_lemmatized


In [12]:
# ---------------------------
# Run training
# ---------------------------
lstm = AdvanceLSTMRNN()
lstm.loadDataSet()

# Example generation after training:
# print(lstm.generate_text("to be or not to be", next_words=30, temperature=0.8))


Initializing LSTM RNN Model Training
Total words (lemmatized):  4304
Total sequences (2&3-gram):  46867
Shapes -> X_train: (42180, 2) y_train: (42180, 4304)




None
Epoch 1/20
[1m658/660[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - accuracy: 0.0369 - loss: 6.8178
Epoch 1: val_loss improved from inf to 6.28359, saving model to hamlet_nextword_best.keras
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.0370 - loss: 6.8164 - val_accuracy: 0.0544 - val_loss: 6.2836 - learning_rate: 0.0030
Epoch 2/20
[1m659/660[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 26ms/step - accuracy: 0.0658 - loss: 5.9956
Epoch 2: val_loss improved from 6.28359 to 6.05768, saving model to hamlet_nextword_best.keras
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 27ms/step - accuracy: 0.0658 - loss: 5.9954 - val_accuracy: 0.0804 - val_loss: 6.0577 - learning_rate: 0.0015
Epoch 3/20
[1m658/660[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 27ms/step - accuracy: 0.0894 - loss: 5.6514
Epoch 3: val_loss improved from 6.05768 to 5.96265, saving model to hamlet_nextword_best.ker