<a href="https://colab.research.google.com/github/Arvind6446/RNNMachineLearning/blob/main/hamlet_lstm_gru_nextword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Next-word Prediction with a GRU (Hamlet)

This notebook trains a word-level GRU language model on *Shakespeare's Hamlet* (NLTK Gutenberg corpus). Each line of the play is cleaned, tokenized, and turned into left-padded context windows of up to 30 tokens whose final token is the word to predict.

Outputs:
- `hamlet_nextword_best.keras` (best checkpoint by `val_loss`)
- `hamlet_nextword.keras` (final saved model)
- `tokenizer.json` (tokenizer configuration)


In [None]:
# NLTK downloads (run once)
# Each call fetches one NLTK data package into the local nltk_data directory;
# packages that are already present are reported as up-to-date.
import nltk
# Corpus that provides 'shakespeare-hamlet.txt', read later via gutenberg.raw(...).
nltk.download('gutenberg')
# NOTE(review): presumably needed by word_tokenize (imported below) — no call is visible in this notebook; confirm.
nltk.download('punkt')
# NOTE(review): presumably needed by WordNetLemmatizer (imported below) — no call is visible in this notebook; confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')


[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/arvindmehta/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/arvindmehta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arvindmehta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/arvindmehta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/arvindmehta/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import os
import math
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,GRU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TerminateOnNaN


In [None]:
# ---------------------------
# Load Hamlet from NLTK Gutenberg
# ---------------------------
# `data` is the raw play text. AdvanceGRURNN.loadDataSet() reads this
# module-level name, so this cell must be executed before training.
data = gutenberg.raw('shakespeare-hamlet.txt')


In [None]:
import os
import re
import math
import json
import datetime
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import Sequential
from tensorflow.keras.layers import (
    Embedding,
    GRU,                 # ✅ IMPORTANT
    Dense,
    Dropout,
    SpatialDropout1D
)
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    TerminateOnNaN,
    TensorBoard
)


class AdvanceGRURNN:
    """
    Next-word prediction model (word-level) using GRU.
    This is your same pipeline, but the recurrent layers are GRU instead of LSTM.
    """

    def __init__(self, seed: int = 42, seq_len: int = 30, vocab_limit: int | None = 8000):
        print("Initializing GRU RNN Model Training")

        self.seed = seed
        self.seq_len = seq_len
        self.vocab_limit = vocab_limit  # ✅ helps on small datasets (optional)

        self.tokenizer = None
        self.total_words = None

        self.max_sequence_len = self.seq_len + 1
        self.model = None
        self.last_log_dir = None

        np.random.seed(seed)
        tf.random.set_seed(seed)

    # ---------------------------------------------------------------------
    # 1) DATASET I/O
    # ---------------------------------------------------------------------
    def loadDataSet(self, file_name: str = "hamlet.txt"):
        """
        Expects a variable named `data` in notebook scope.
        Example:
            from nltk.corpus import gutenberg
            data = gutenberg.raw("shakespeare-hamlet.txt")
        """
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(data)

        with open(file_name, "r", encoding="utf-8") as f:
            text = f.read()

        self.tokenizeDataSet(text)

    # ---------------------------------------------------------------------
    # 2) PREPROCESSING
    # ---------------------------------------------------------------------
    def _clean_line(self, line: str) -> str:
        line = line.lower().strip()
        line = re.sub(r"[^a-z\.\,\?\!\;\:\'\-\s]", " ", line)
        line = re.sub(r"\s+", " ", line).strip()
        return line

    def _prepare_lines(self, text: str):
        lines = []
        for raw in text.split("\n"):
            cleaned = self._clean_line(raw)
            if cleaned:
                lines.append(cleaned)
        return lines

    # ---------------------------------------------------------------------
    # 3) TOKENIZATION + TRAIN/VAL SPLIT
    # ---------------------------------------------------------------------
    def tokenizeDataSet(self, text: str):
        lines = self._prepare_lines(text)
        if len(lines) < 50:
            raise ValueError("Not enough lines after cleaning to create train/val split.")

        split_idx = int(0.9 * len(lines))
        train_lines = lines[:split_idx]
        val_lines = lines[split_idx:]

        # ✅ vocab_limit (optional) improves learning on small corpora
        if self.vocab_limit is None:
            self.tokenizer = Tokenizer(oov_token="<OOV>")
        else:
            self.tokenizer = Tokenizer(num_words=self.vocab_limit, oov_token="<OOV>")

        self.tokenizer.fit_on_texts(train_lines)

        raw_vocab = len(self.tokenizer.word_index) + 1
        self.total_words = raw_vocab if self.vocab_limit is None else min(raw_vocab, self.vocab_limit)

        print("Total words (vocab size):", self.total_words)
        print("Train lines:", len(train_lines), "| Val lines:", len(val_lines))

        train_seqs = self.createInputSequences(train_lines)
        val_seqs = self.createInputSequences(val_lines)

        print(f"Train sequences: {len(train_seqs)} | Val sequences: {len(val_seqs)}")

        self.pad_and_train(train_seqs, val_seqs)

    # ---------------------------------------------------------------------
    # 4) SEQUENCE GENERATION
    # ---------------------------------------------------------------------
    def createInputSequences(self, lines):
        seq_plus_label = self.seq_len + 1
        sequences = []

        for line in lines:
            token_list = self.tokenizer.texts_to_sequences([line])[0]
            if len(token_list) < 2:
                continue

            if len(token_list) <= seq_plus_label:
                sequences.append(token_list)
            else:
                for i in range(seq_plus_label, len(token_list) + 1):
                    sequences.append(token_list[i - seq_plus_label:i])

        return sequences

    def pad_and_train(self, train_sequences, val_sequences):
        train_arr = np.array(
            pad_sequences(train_sequences, maxlen=self.max_sequence_len, padding="pre")
        )
        val_arr = np.array(
            pad_sequences(val_sequences, maxlen=self.max_sequence_len, padding="pre")
        )

        X_train = train_arr[:, :-1].astype("int32")
        y_train = train_arr[:, -1].astype("int32")
        X_val = val_arr[:, :-1].astype("int32")
        y_val = val_arr[:, -1].astype("int32")

        tr_keep = y_train != 0
        va_keep = y_val != 0
        X_train, y_train = X_train[tr_keep], y_train[tr_keep]
        X_val, y_val = X_val[va_keep], y_val[va_keep]

        print("Shapes -> X_train:", X_train.shape, "y_train:", y_train.shape)
        print("Shapes -> X_val:  ", X_val.shape, "y_val:", y_val.shape)

        hparams = self.suggest_hparams(n_train=len(y_train))

        train_ds = (
            tf.data.Dataset.from_tensor_slices((X_train, y_train))
            .shuffle(20000, seed=self.seed, reshuffle_each_iteration=True)
            .batch(hparams["batch_size"])
            .prefetch(tf.data.AUTOTUNE)
        )
        val_ds = (
            tf.data.Dataset.from_tensor_slices((X_val, y_val))
            .batch(hparams["batch_size"])
            .prefetch(tf.data.AUTOTUNE)
        )

        callbacks, log_dir = self.build_callbacks(
            es_patience=hparams["es_patience"],
            checkpoint_path="hamlet_nextword_best.keras",
            log_root="logs"
        )
        self.last_log_dir = log_dir

        self.build_and_train(
            train_ds=train_ds,
            val_ds=val_ds,
            embed_dim=hparams["embed_dim"],
            gru_units=hparams["gru_units"],
            dropout_rate=hparams["dropout_rate"],
            epochs=hparams["epochs"],
            initial_lr=hparams["initial_lr"],
            decay_steps=hparams["decay_steps"],
            decay_rate=hparams["decay_rate"],
            clipnorm=hparams["clipnorm"],
            callbacks=callbacks,
            recurrent_dropout=hparams["recurrent_dropout"],
        )

        self.save_model_keras(
            keras_path="hamlet_nextword.keras",
            tokenizer_path="tokenizer.json"
        )

    # ---------------------------------------------------------------------
    # 5) Hyperparameters
    # ---------------------------------------------------------------------
    def suggest_hparams(self, n_train: int):
        embed_dim = 128
        gru_units = 256

        dropout_rate = 0.30 if n_train < 80_000 else 0.20
        batch_size = 128 if n_train < 120_000 else 256

        epochs = 80
        es_patience = 8

        initial_lr = 2e-3
        decay_steps = max(1000, n_train // max(batch_size, 1))
        decay_rate = 0.5

        # ✅ recurrent_dropout often hurts speed; set to 0.0 if slow
        recurrent_dropout = 0.0

        return {
            "embed_dim": embed_dim,
            "gru_units": gru_units,
            "dropout_rate": dropout_rate,
            "batch_size": batch_size,
            "epochs": epochs,
            "es_patience": es_patience,
            "initial_lr": initial_lr,
            "decay_steps": decay_steps,
            "decay_rate": decay_rate,
            "clipnorm": 1.0,
            "recurrent_dropout": recurrent_dropout,
        }

    # ---------------------------------------------------------------------
    # 6) Optimizer (LR schedule)
    # ---------------------------------------------------------------------
    def make_optimizer(self, initial_lr, decay_steps, decay_rate, clipnorm=1.0):
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
            staircase=True
        )
        return tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=clipnorm)

    # ---------------------------------------------------------------------
    # 7) Callbacks
    # ---------------------------------------------------------------------
    def build_callbacks(self, es_patience: int, checkpoint_path: str, log_root: str):
        run_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = os.path.join(log_root, "hamlet_nextword_gru", run_id)
        os.makedirs(log_dir, exist_ok=True)

        callbacks = [
            TensorBoard(log_dir=log_dir, histogram_freq=0, write_graph=True, update_freq="epoch"),
            ModelCheckpoint(filepath=checkpoint_path, monitor="val_loss", save_best_only=True, verbose=1),
            EarlyStopping(monitor="val_loss", patience=es_patience, restore_best_weights=True, verbose=1),
            TerminateOnNaN(),
        ]

        print(f"✅ TensorBoard logs: {log_dir}")
        return callbacks, log_dir

    # ---------------------------------------------------------------------
    # 8) Build + Train (GRU)
    # ---------------------------------------------------------------------
    def build_and_train(
        self,
        train_ds,
        val_ds,
        embed_dim=128,
        gru_units=256,
        dropout_rate=0.3,
        epochs=30,
        initial_lr=2e-3,
        decay_steps=1000,
        decay_rate=0.5,
        clipnorm=1.0,
        callbacks=None,
        recurrent_dropout=0.0
    ):
        model = Sequential([
            Embedding(self.total_words, embed_dim, input_length=self.seq_len),
            SpatialDropout1D(0.2),

            GRU(gru_units, return_sequences=True, recurrent_dropout=recurrent_dropout),
            Dropout(dropout_rate),

            GRU(max(64, gru_units // 2), recurrent_dropout=recurrent_dropout),
            Dropout(dropout_rate),

            Dense(self.total_words, activation="softmax"),
        ])

        optimizer = self.make_optimizer(initial_lr, decay_steps, decay_rate, clipnorm=clipnorm)

        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=optimizer,
            metrics=[
                tf.keras.metrics.SparseCategoricalAccuracy(name="top1"),
                tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name="top5"),
                tf.keras.metrics.SparseTopKCategoricalAccuracy(k=10, name="top10"),
            ],
        )

        model.build(input_shape=(None, self.seq_len))
        self.model = model
        print(self.model.summary())

        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            callbacks=(callbacks or []),
            verbose=1
        )

        best_val_loss = float(np.min(history.history.get("val_loss", [np.nan])))
        if not np.isnan(best_val_loss):
            print(f"Best val_loss: {best_val_loss:.4f} | Perplexity ≈ {math.exp(best_val_loss):.2f}")

    # ---------------------------------------------------------------------
    # 9) Save model + tokenizer
    # ---------------------------------------------------------------------
    def save_model_keras(self, keras_path: str, tokenizer_path: str):
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model is not trained yet. Train before saving.")

        self.model.save(keras_path)
        print(f"✅ Saved Keras model: {keras_path}")

        with open(tokenizer_path, "w", encoding="utf-8") as f:
            f.write(self.tokenizer.to_json())
        print(f"✅ Saved tokenizer: {tokenizer_path}")

    # ---------------------------------------------------------------------
    # 10) Generation (top-k sampling)
    # ---------------------------------------------------------------------
    def generate_text(self, seed_text: str, next_words: int = 20, temperature: float = 1.0, top_k: int = 20):
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Train the model first before generating text.")

        def sample_top_k(probs, k, temp):
            probs = np.asarray(probs).astype("float64")
            probs = np.log(probs + 1e-12) / max(temp, 1e-6)
            probs = np.exp(probs)
            probs = probs / np.sum(probs)

            if 0 < k < len(probs):
                top_idx = np.argpartition(probs, -k)[-k:]
                top_probs = probs[top_idx] / np.sum(probs[top_idx])
                return int(np.random.choice(top_idx, p=top_probs))

            return int(np.random.choice(len(probs), p=probs))

        seed_clean = self._clean_line(seed_text)

        for _ in range(next_words):
            token_list = self.tokenizer.texts_to_sequences([seed_clean])[0]
            token_list = pad_sequences([token_list], maxlen=self.seq_len, padding="pre")
            preds = self.model.predict(token_list, verbose=0)[0]

            next_index = sample_top_k(preds, top_k, temperature)
            next_word = self.tokenizer.index_word.get(next_index, "")

            if not next_word:
                break

            seed_clean += " " + next_word

        return seed_clean


In [None]:
# ---------------------------
# Run training
# ---------------------------
# The class defined in this notebook is AdvanceGRURNN; the previous name
# (AdvanceLSTMRNN) is not defined anywhere and raises NameError on a fresh kernel.
gru_model = AdvanceGRURNN(seed=42, seq_len=30)
# Reads the module-level `data` loaded above, then tokenizes, trains and saves.
gru_model.loadDataSet()

print(gru_model.generate_text("to be or not to be", next_words=30, temperature=0.9, top_k=30))

# Example generation after training:
# print(gru_model.generate_text("to be or not to be", next_words=30, temperature=0.8))


Initializing LSTM RNN Model Training
Total words (vocab size): 4512
Train lines: 3569 | Val lines: 397
Train sequences: 3497 | Val sequences: 385
Shapes -> X_train: (3497, 30) y_train: (3497,)
Shapes -> X_val:   (385, 30) y_val: (385,)
✅ TensorBoard logs: logs/hamlet_nextword/20251230-193422


None
Epoch 1/100
[1m 2/55[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:28[0m 2s/step - loss: 8.4141 - top1: 0.0000e+00 - top10: 0.0039 - top5: 0.0039        

KeyboardInterrupt: 

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/hamlet_nextword
