<a href="https://colab.research.google.com/github/Arvind6446/RNNMachineLearning/blob/main/hamlet_lstm_nextword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Next-word Prediction with an LSTM (Hamlet)

This notebook trains an LSTM language model on *Shakespeare's Hamlet* (NLTK Gutenberg corpus) using 2-gram and 3-gram training windows after lemmatization.

Outputs:
- `hamlet_nextword_best.keras` (best checkpoint by `val_loss`)
- `hamlet_nextword.keras` (final saved model)
- `tokenizer.json` (tokenizer configuration)


In [12]:
# NLTK downloads (run once)
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import os
import math
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TerminateOnNaN


In [14]:
# ---------------------------
# Load Hamlet from NLTK Gutenberg
# ---------------------------
data = gutenberg.raw('shakespeare-hamlet.txt')


In [15]:
import os
import re
import math
import json
import datetime
import numpy as np
import tensorflow as tf

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import Sequential
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    Dense,
    Dropout,
    SpatialDropout1D
)
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    TerminateOnNaN,
    TensorBoard
)


class AdvanceLSTMRNN:
    """
    Next-word prediction model for Hamlet (word-level).

    High-impact improvements vs your earlier version:
    1) Uses longer context windows (seq_len), not only 2/3-grams.
    2) Splits TRAIN/VAL at the LINE level first (prevents near-duplicate leakage).
    3) Fits tokenizer ONLY on training text (proper ML practice).
    4) Uses sparse labels (integer class IDs) + SparseCategoricalCrossentropy (faster/less RAM).
    5) Adds strong regularization (SpatialDropout1D + recurrent_dropout).
    6) Adds Top-K metrics (top1/top5/top10) which are more meaningful for language modeling.
    7) Adds TensorBoard logging (timestamped run directory) + checkpointing + early stopping.

    Notes:
    - Top-1 accuracy in language modeling can remain low; perplexity and top-k accuracy are better signals.
    - Hamlet is not huge; overfitting can happen quickly. Regularization + proper split help.
    """

    def __init__(self, seed: int = 42, seq_len: int = 30):
        """
        Parameters
        ----------
        seed : int
            Seed for reproducibility.
        seq_len : int
            Number of context tokens used to predict the next token.
            Typical good values: 20, 30, 50 (higher = slower and can overfit).
        """
        print("Initializing LSTM RNN Model Training")

        self.seed = seed
        self.seq_len = seq_len

        # Tokenizer/vocab
        self.tokenizer = None
        self.total_words = None  # vocab size + 1

        # Sequences
        self.max_sequence_len = self.seq_len + 1  # context + label
        self.n_sequences_train = 0
        self.n_sequences_val = 0

        # Model
        self.model = None

        # Logs
        self.last_log_dir = None

        # Reproducibility
        np.random.seed(seed)
        tf.random.set_seed(seed)

    # ---------------------------------------------------------------------
    # 1) DATASET I/O
    # ---------------------------------------------------------------------
    def loadDataSet(self, file_name: str = "hamlet.txt"):
        """
        Writes 'data' (raw Hamlet text) to disk and reads it back.
        Then triggers preprocessing + training.

        Expects a variable named `data` to exist in your notebook, e.g.:
            data = gutenberg.raw('shakespeare-hamlet.txt')
        """
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(data)

        with open(file_name, "r", encoding="utf-8") as f:
            text = f.read()

        self.tokenizeDataSet(text)

    # ---------------------------------------------------------------------
    # 2) PREPROCESSING
    # ---------------------------------------------------------------------
    def _clean_line(self, line: str) -> str:
        """
        Cleans a line while keeping helpful punctuation tokens.
        We keep punctuation like . , ? ! ; : ' - because it improves structure for prediction.

        - Lowercase
        - Remove uncommon symbols
        - Collapse whitespace
        """
        line = line.lower().strip()
        line = re.sub(r"[^a-z\.\,\?\!\;\:\'\-\s]", " ", line)
        line = re.sub(r"\s+", " ", line).strip()
        return line

    def _prepare_lines(self, text: str):
        """
        Splits text into cleaned, non-empty lines.
        """
        lines = []
        for raw in text.split("\n"):
            cleaned = self._clean_line(raw)
            if cleaned:
                lines.append(cleaned)
        return lines

    # ---------------------------------------------------------------------
    # 3) TOKENIZATION + TRAIN/VAL SPLIT (IMPORTANT CHANGE)
    # ---------------------------------------------------------------------
    def tokenizeDataSet(self, text: str):
        """
        Pipeline:
        1) Clean lines
        2) Split lines into train/val FIRST (prevents near-duplicate leakage)
        3) Fit tokenizer ONLY on train lines
        4) Create sequences separately for train and val
        5) Pad, build tf.data, train model
        """
        lines = self._prepare_lines(text)

        if len(lines) < 50:
            raise ValueError("Not enough lines after cleaning to create train/val split.")

        # Line-level split (stable evaluation)
        split_idx = int(0.9 * len(lines))
        train_lines = lines[:split_idx]
        val_lines = lines[split_idx:]

        # Fit tokenizer only on training set (no leakage)
        self.tokenizer = Tokenizer(oov_token="<OOV>")
        self.tokenizer.fit_on_texts(train_lines)

        self.total_words = len(self.tokenizer.word_index) + 1
        print("Total words (vocab size):", self.total_words)
        print("Train lines:", len(train_lines), "| Val lines:", len(val_lines))

        # Build sequences
        train_seqs = self.createInputSequences(train_lines)
        val_seqs = self.createInputSequences(val_lines)

        self.n_sequences_train = len(train_seqs)
        self.n_sequences_val = len(val_seqs)

        print(f"Train sequences: {self.n_sequences_train} | Val sequences: {self.n_sequences_val}")

        # Pad and train
        self.pad_and_train(train_seqs, val_seqs)

    # ---------------------------------------------------------------------
    # 4) SEQUENCE GENERATION
    # ---------------------------------------------------------------------
    def createInputSequences(self, lines):
        """
        Creates sliding windows of length (seq_len + 1):
            tokens[0:seq_len] -> label = tokens[seq_len]

        If a line is shorter than (seq_len+1), we still keep one sample (it will be padded).
        """
        seq_plus_label = self.seq_len + 1
        sequences = []

        for line in lines:
            token_list = self.tokenizer.texts_to_sequences([line])[0]
            if len(token_list) < 2:
                continue

            if len(token_list) <= seq_plus_label:
                sequences.append(token_list)
            else:
                for i in range(seq_plus_label, len(token_list) + 1):
                    window = token_list[i - seq_plus_label:i]
                    sequences.append(window)

        return sequences

    def pad_and_train(self, train_sequences, val_sequences):
        """
        Pads sequences to fixed length and trains using tf.data datasets.
        """
        train_arr = np.array(
            pad_sequences(train_sequences, maxlen=self.max_sequence_len, padding="pre")
        )
        val_arr = np.array(
            pad_sequences(val_sequences, maxlen=self.max_sequence_len, padding="pre")
        )

        # X: context, y: next token id (sparse label)
        X_train = train_arr[:, :-1].astype("int32")
        y_train = train_arr[:, -1].astype("int32")
        X_val = val_arr[:, :-1].astype("int32")
        y_val = val_arr[:, -1].astype("int32")

        # Remove padding labels (0)
        tr_keep = y_train != 0
        va_keep = y_val != 0
        X_train, y_train = X_train[tr_keep], y_train[tr_keep]
        X_val, y_val = X_val[va_keep], y_val[va_keep]

        print("Shapes -> X_train:", X_train.shape, "y_train:", y_train.shape)
        print("Shapes -> X_val:  ", X_val.shape, "y_val:", y_val.shape)

        # Hyperparams
        hparams = self.suggest_hparams(n_train=len(y_train))

        # tf.data for speed
        train_ds = (
            tf.data.Dataset.from_tensor_slices((X_train, y_train))
            .shuffle(20000, seed=self.seed, reshuffle_each_iteration=True)
            .batch(hparams["batch_size"])
            .prefetch(tf.data.AUTOTUNE)
        )
        val_ds = (
            tf.data.Dataset.from_tensor_slices((X_val, y_val))
            .batch(hparams["batch_size"])
            .prefetch(tf.data.AUTOTUNE)
        )

        callbacks, log_dir = self.build_callbacks(
            use_early_stopping=True,
            es_patience=hparams["es_patience"],
            checkpoint_path="hamlet_nextword_best.keras",
            log_root="logs"
        )
        self.last_log_dir = log_dir

        # Train
        self.build_and_train(
            train_ds=train_ds,
            val_ds=val_ds,
            embed_dim=hparams["embed_dim"],
            lstm_units=hparams["lstm_units"],
            dropout_rate=hparams["dropout_rate"],
            epochs=hparams["epochs"],
            initial_lr=hparams["initial_lr"],
            decay_steps=hparams["decay_steps"],
            decay_rate=hparams["decay_rate"],
            clipnorm=hparams["clipnorm"],
            callbacks=callbacks
        )

        # Save final model + tokenizer
        self.save_model_keras(
            keras_path="hamlet_nextword.keras",
            save_tokenizer=True,
            tokenizer_path="tokenizer.json"
        )

        print("\nTensorBoard (Colab):")
        print("%load_ext tensorboard")
        print("%tensorboard --logdir logs")

    # ---------------------------------------------------------------------
    # 5) HYPERPARAMETER HEURISTICS
    # ---------------------------------------------------------------------
    def suggest_hparams(self, n_train: int):
        """
        Suggests reasonable hyperparameters for small/medium corpora.

        You can manually override these if you want.
        """
        V = self.total_words

        # Embedding / LSTM sizes scale sublinearly with vocab size
        embed_dim = int(np.clip(round(8 * math.log2(max(V, 4))), 96, 256))
        lstm_units = int(np.clip(round(10 * math.log2(max(V, 4))), 160, 384))

        # Dropout a bit stronger for smaller corpora
        dropout_rate = 0.35 if n_train < 50_000 else 0.25

        # Batch size
        batch_size = 64 if n_train < 50_000 else 128

        # Epochs + early stopping
        epochs = 40 if n_train < 80_000 else 25
        es_patience = 6

        # Learning rate schedule
        initial_lr = 2e-3
        decay_steps = max(1000, n_train // max(batch_size, 1))
        decay_rate = 0.5

        return {
            "embed_dim": embed_dim,
            "lstm_units": lstm_units,
            "dropout_rate": dropout_rate,
            "batch_size": batch_size,
            "epochs": epochs,
            "initial_lr": initial_lr,
            "decay_steps": decay_steps,
            "decay_rate": decay_rate,
            "clipnorm": 1.0,
            "es_patience": es_patience,
        }

    # ---------------------------------------------------------------------
    # 6) OPTIMIZER (LearningRateSchedule)
    # ---------------------------------------------------------------------
    def make_optimizer(self, initial_lr, decay_steps, decay_rate, clipnorm=1.0):
        """
        Adam + ExponentialDecay learning-rate schedule.

        Important:
        - Because LR is a schedule, it is NOT settable by ReduceLROnPlateau.
        - Therefore we do NOT include ReduceLROnPlateau in callbacks.
        """
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
            staircase=True
        )
        return tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=clipnorm)

    # ---------------------------------------------------------------------
    # 7) CALLBACKS (TensorBoard + Checkpoint + EarlyStopping)
    # ---------------------------------------------------------------------
    def build_callbacks(
        self,
        use_early_stopping: bool = True,
        es_patience: int = 6,
        checkpoint_path: str = "best_model.keras",
        log_root: str = "logs",
    ):
        """
        TensorBoard:
          - logs per-run into logs/hamlet_nextword/<timestamp>/
          - histogram_freq=1 logs weight histograms each epoch (slower but useful)

        ModelCheckpoint:
          - saves the best model by val_loss

        EarlyStopping:
          - stops when val_loss stops improving
          - restores best weights

        TerminateOnNaN:
          - safety stop if training becomes numerically unstable
        """
        run_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = os.path.join(log_root, "hamlet_nextword", run_id)
        os.makedirs(log_dir, exist_ok=True)

        callbacks = [
            TensorBoard(
                log_dir=log_dir,
                histogram_freq=1,
                write_graph=True,
                update_freq="epoch",
                profile_batch=0
            ),
            ModelCheckpoint(
                filepath=checkpoint_path,
                monitor="val_loss",
                save_best_only=True,
                save_weights_only=False,
                verbose=1
            ),
            TerminateOnNaN(),
        ]

        if use_early_stopping:
            callbacks.append(
                EarlyStopping(
                    monitor="val_loss",
                    patience=es_patience,
                    restore_best_weights=True,
                    verbose=1
                )
            )

        print(f"✅ TensorBoard logs: {log_dir}")
        return callbacks, log_dir

    # ---------------------------------------------------------------------
    # 8) BUILD & TRAIN MODEL
    # ---------------------------------------------------------------------
    def build_and_train(
        self,
        train_ds,
        val_ds,
        embed_dim=128,
        lstm_units=256,
        dropout_rate=0.3,
        epochs=30,
        initial_lr=2e-3,
        decay_steps=1000,
        decay_rate=0.5,
        clipnorm=1.0,
        callbacks=None
    ):
        """
        Architecture tuned for small corpus:
          Embedding
          -> SpatialDropout1D (regularizes embeddings)
          -> LSTM (return_sequences=True) with recurrent_dropout
          -> Dropout
          -> LSTM (smaller) with recurrent_dropout
          -> Dropout
          -> Dense softmax over vocabulary

        Metrics:
          - top1: SparseCategoricalAccuracy
          - top5/top10: SparseTopKCategoricalAccuracy
        """
        model = Sequential([
            Embedding(self.total_words, embed_dim, input_length=self.seq_len),
            SpatialDropout1D(0.2),

            LSTM(
                lstm_units,
                return_sequences=True,
                recurrent_dropout=0.15
            ),
            Dropout(dropout_rate),

            LSTM(
                max(64, lstm_units // 2),
                recurrent_dropout=0.15
            ),
            Dropout(dropout_rate),

            Dense(self.total_words, activation="softmax"),
        ])

        optimizer = self.make_optimizer(initial_lr, decay_steps, decay_rate, clipnorm=clipnorm)

        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=optimizer,
            metrics=[
                tf.keras.metrics.SparseCategoricalAccuracy(name="top1"),
                tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name="top5"),
                tf.keras.metrics.SparseTopKCategoricalAccuracy(k=10, name="top10"),
            ],
        )

        # Force-build so summary shows real shapes/params (no "unbuilt")
        model.build(input_shape=(None, self.seq_len))
        self.model = model
        print(self.model.summary())

        history = self.model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            callbacks=(callbacks or []),
            verbose=1
        )

        best_val_loss = float(np.min(history.history.get("val_loss", [np.nan])))
        if not np.isnan(best_val_loss):
            print(f"Best val_loss: {best_val_loss:.4f} | Perplexity ≈ {math.exp(best_val_loss):.2f}")

    # ---------------------------------------------------------------------
    # 9) SAVE MODEL + TOKENIZER
    # ---------------------------------------------------------------------
    def save_model_keras(self, keras_path: str = "hamlet_nextword.keras",
                         save_tokenizer: bool = True,
                         tokenizer_path: str = "tokenizer.json"):
        """
        Saves:
          - trained model in .keras format
          - tokenizer JSON so you can reproduce the same word->id mapping later
        """
        if self.model is None:
            raise RuntimeError("Model is not trained yet. Train before saving.")

        self.model.save(keras_path)
        print(f"✅ Saved Keras model: {keras_path}")

        if save_tokenizer and self.tokenizer is not None:
            tok_json = self.tokenizer.to_json()
            with open(tokenizer_path, "w", encoding="utf-8") as f:
                f.write(tok_json)
            print(f"✅ Saved tokenizer: {tokenizer_path}")

    # ---------------------------------------------------------------------
    # 10) GENERATION (Top-k sampling)
    # ---------------------------------------------------------------------
    def generate_text(self, seed_text: str, next_words: int = 20, temperature: float = 1.0, top_k: int = 20):
        """
        Generates text by predicting next tokens repeatedly.

        Improvements:
          - top-k sampling (more controllable than pure argmax)
          - temperature controls randomness

        Parameters
        ----------
        seed_text : str
            Initial prompt.
        next_words : int
            How many tokens to generate.
        temperature : float
            <1.0 = more conservative; >1.0 = more random
        top_k : int
            Sample only from the top_k most likely words (helps coherence).
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Train the model first before generating text.")

        def sample_top_k(probs, k, temp):
            probs = np.asarray(probs).astype("float64")

            # Temperature scaling
            probs = np.log(probs + 1e-12) / max(temp, 1e-6)
            probs = np.exp(probs)
            probs = probs / np.sum(probs)

            # Top-k filter
            if k is not None and k > 0 and k < len(probs):
                top_idx = np.argpartition(probs, -k)[-k:]
                top_probs = probs[top_idx]
                top_probs = top_probs / np.sum(top_probs)
                return int(np.random.choice(top_idx, p=top_probs))

            # Fallback: full distribution sampling
            return int(np.random.choice(len(probs), p=probs))

        seed_clean = self._clean_line(seed_text)

        for _ in range(next_words):
            token_list = self.tokenizer.texts_to_sequences([seed_clean])[0]
            token_list = pad_sequences([token_list], maxlen=self.seq_len, padding="pre")
            preds = self.model.predict(token_list, verbose=0)[0]

            next_index = sample_top_k(preds, top_k, temperature)
            next_word = self.tokenizer.index_word.get(next_index, "")

            if not next_word:
                break

            seed_clean += " " + next_word

        return seed_clean


In [None]:
# ---------------------------
# Run training
# ---------------------------
lstm = AdvanceLSTMRNN(seed=42, seq_len=30)
lstm.loadDataSet()

print(lstm.generate_text("to be or not to be", next_words=30, temperature=0.9, top_k=30))

# Example generation after training:
# print(lstm.generate_text("to be or not to be", next_words=30, temperature=0.8))


Initializing LSTM RNN Model Training
Total words (vocab size): 4512
Train lines: 3569 | Val lines: 397
Train sequences: 3497 | Val sequences: 385
Shapes -> X_train: (3497, 30) y_train: (3497,)
Shapes -> X_val:   (385, 30) y_val: (385,)
✅ TensorBoard logs: logs/hamlet_nextword/20251230-132214




None
Epoch 1/40
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step - loss: 8.0910 - top1: 0.0165 - top10: 0.0698 - top5: 0.0462
Epoch 1: val_loss improved from inf to 7.82589, saving model to hamlet_nextword_best.keras
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 274ms/step - loss: 8.0873 - top1: 0.0166 - top10: 0.0702 - top5: 0.0465 - val_loss: 7.8259 - val_top1: 0.0208 - val_top10: 0.1013 - val_top5: 0.0623
Epoch 2/40
[1m29/55[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m7s[0m 279ms/step - loss: 7.0773 - top1: 0.0224 - top10: 0.1051 - top5: 0.0664

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/hamlet_nextword
