<a href="https://colab.research.google.com/github/Arvind6446/RNNMachineLearning/blob/main/hamlet_lstm_nextword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Next-word Prediction with an LSTM (Hamlet)

This notebook trains an LSTM language model on *Shakespeare's Hamlet* (NLTK Gutenberg corpus) using 2-gram and 3-gram training windows after lemmatization.

Outputs:
- `hamlet_nextword_best.keras` (best checkpoint by `val_loss`)
- `hamlet_nextword.keras` (final saved model)
- `tokenizer.json` (tokenizer configuration)


In [6]:
# NLTK downloads (run once)
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import os
import math
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TerminateOnNaN


In [8]:
# ---------------------------
# Load Hamlet from NLTK Gutenberg
# ---------------------------
data = gutenberg.raw('shakespeare-hamlet.txt')


In [9]:
import os
import math
import json
import datetime
import numpy as np
import tensorflow as tf

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TerminateOnNaN, TensorBoard


class AdvanceLSTMRNN:
    def __init__(self, seed: int = 42, seq_len: int = 30):
        """
        seq_len: number of tokens in the input context.
                Larger seq_len => better accuracy but more compute.
        """
        print("Initializing LSTM RNN Model Training")
        self.total_words = None
        self.tokenizer = None
        self.max_sequence_len = None  # will be seq_len+1 (context + label)
        self.model = None
        self.lemmatizer = WordNetLemmatizer()
        self.n_sequences = 0
        self.seed = seed
        self.seq_len = seq_len
        self.last_log_dir = None

        np.random.seed(seed)
        tf.random.set_seed(seed)

    # ---------------------------
    # Dataset I/O
    # ---------------------------
    def loadDataSet(self):
        with open("hamlet.txt", "w", encoding="utf-8") as f:
            f.write(data)

        with open("hamlet.txt", "r", encoding="utf-8") as f:
            text = f.read()

        self.tokenizeDataSet(text)

    # ---------------------------
    # Preprocessing
    # ---------------------------
    def _lemmatize_line(self, line: str) -> str:
        tokens = word_tokenize(line.lower())
        lemmas = [self.lemmatizer.lemmatize(t) for t in tokens if t.isalpha()]
        return " ".join(lemmas)

    def tokenizeDataSet(self, text: str):
        lemmatized_lines = []
        for line in text.split("\n"):
            cleaned = self._lemmatize_line(line)
            if cleaned.strip():
                lemmatized_lines.append(cleaned)

        self.tokenizer = Tokenizer(oov_token="<OOV>")
        self.tokenizer.fit_on_texts(lemmatized_lines)
        self.total_words = len(self.tokenizer.word_index) + 1
        print("Total words (lemmatized):", self.total_words)

        input_sequences = self.createInputSequences(lemmatized_lines)
        self.padSequences(input_sequences)

    # ---------------------------
    # Sequence building (UPGRADED)
    # ---------------------------
    def createInputSequences(self, lines):
        """
        Creates fixed-length windows of size (seq_len + 1):
        [w1..w_seq_len] -> label is next word
        """
        input_sequences = []
        seq_plus_label = self.seq_len + 1

        for line in lines:
            token_list = self.tokenizer.texts_to_sequences([line])[0]
            if len(token_list) < 2:
                continue

            if len(token_list) <= seq_plus_label:
                input_sequences.append(token_list)
            else:
                for i in range(seq_plus_label, len(token_list) + 1):
                    ngram = token_list[i - seq_plus_label:i]
                    input_sequences.append(ngram)

        self.n_sequences = len(input_sequences)
        print(f"Total sequences (seq_len={self.seq_len}):", self.n_sequences)
        return input_sequences

    def padSequences(self, input_sequences):
        self.max_sequence_len = self.seq_len + 1
        input_sequences = np.array(
            pad_sequences(input_sequences, maxlen=self.max_sequence_len, padding="pre")
        )
        self.createPredictorandLabels(input_sequences)

    # ---------------------------
    # Train/Val + tf.data
    # ---------------------------
    def createPredictorandLabels(self, input_sequences):
        X = input_sequences[:, :-1].astype("int32")
        y = input_sequences[:, -1].astype("int32")

        # Drop samples with label 0 (padding)
        keep = y != 0
        X, y = X[keep], y[keep]

        n = len(y)
        idx = np.arange(n)
        np.random.shuffle(idx)

        split = int(0.9 * n)
        train_idx, val_idx = idx[:split], idx[split:]
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        print("Shapes -> X_train:", X_train.shape, "y_train:", y_train.shape)

        hparams = self.suggest_hparams()

        train_ds = (
            tf.data.Dataset.from_tensor_slices((X_train, y_train))
            .shuffle(20000, seed=self.seed, reshuffle_each_iteration=True)
            .batch(hparams["batch_size"])
            .prefetch(tf.data.AUTOTUNE)
        )

        val_ds = (
            tf.data.Dataset.from_tensor_slices((X_val, y_val))
            .batch(hparams["batch_size"])
            .prefetch(tf.data.AUTOTUNE)
        )

        callbacks, log_dir = self.build_callbacks(
            use_early_stopping=True,
            es_patience=hparams["es_patience"],
            checkpoint_path="hamlet_nextword_best.keras",
            log_root="logs"
        )
        self.last_log_dir = log_dir

        self.build_and_train(
            train_ds, val_ds,
            embed_dim=hparams["embed_dim"],
            lstm_units=hparams["lstm_units"],
            dropout_rate=hparams["dropout_rate"],
            epochs=hparams["epochs"],
            initial_lr=hparams["initial_lr"],
            decay_steps=hparams["decay_steps"],
            decay_rate=hparams["decay_rate"],
            clipnorm=hparams["clipnorm"],
            callbacks=callbacks
        )

        self.save_model_keras(
            keras_path="hamlet_nextword.keras",
            save_tokenizer=True,
            tokenizer_path="tokenizer.json"
        )

        print("\nTo open TensorBoard in Colab, run:")
        print("%load_ext tensorboard")
        print(f"%tensorboard --logdir {os.path.dirname(log_dir)}")

    # ---------------------------
    # Hyperparameters
    # ---------------------------
    def suggest_hparams(self):
        V = self.total_words
        N = self.n_sequences

        embed_dim = int(np.clip(round(8 * math.log2(max(V, 4))), 96, 256))
        lstm_units = int(np.clip(round(14 * math.log2(max(V, 4))), 192, 512))

        dropout_rate = 0.2 if N > 150_000 else 0.3
        batch_size = 128 if N < 200_000 else 256
        epochs = 35 if N < 200_000 else 25

        initial_lr = 2e-3
        decay_steps = max(1000, N // max(batch_size, 1))
        decay_rate = 0.5
        es_patience = 5

        return {
            "embed_dim": embed_dim,
            "lstm_units": lstm_units,
            "dropout_rate": dropout_rate,
            "batch_size": batch_size,
            "epochs": epochs,
            "initial_lr": initial_lr,
            "decay_steps": decay_steps,
            "decay_rate": decay_rate,
            "clipnorm": 1.0,
            "es_patience": es_patience,
        }

    # ---------------------------
    # Optimizer (LR schedule)
    # ---------------------------
    def make_optimizer(self, initial_lr, decay_steps, decay_rate, clipnorm=1.0):
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
            staircase=True
        )
        return tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=clipnorm)

    # ---------------------------
    # Callbacks (TensorBoard + CKPT + ES) ✅ fixed
    # ---------------------------
    def build_callbacks(
        self,
        use_early_stopping: bool = True,
        es_patience: int = 5,
        checkpoint_path: str = "best_model.keras",
        log_root: str = "logs",
    ):
        run_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = os.path.join(log_root, "hamlet_nextword", run_id)
        os.makedirs(log_dir, exist_ok=True)

        callbacks = [
            TensorBoard(
                log_dir=log_dir,
                histogram_freq=1,
                write_graph=True,
                update_freq="epoch",
                profile_batch=0  # set "10,20" to profile
            ),
            ModelCheckpoint(
                filepath=checkpoint_path,
                monitor="val_loss",
                save_best_only=True,
                save_weights_only=False,
                verbose=1
            ),
            TerminateOnNaN(),
        ]

        if use_early_stopping:
            callbacks.append(
                EarlyStopping(
                    monitor="val_loss",
                    patience=es_patience,
                    restore_best_weights=True,
                    verbose=1
                )
            )

        print(f"✅ TensorBoard logs: {log_dir}")
        return callbacks, log_dir

    # ---------------------------
    # Build & Train
    # ---------------------------
    def build_and_train(
        self,
        train_ds,
        val_ds,
        embed_dim=128,
        lstm_units=256,
        dropout_rate=0.2,
        epochs=15,
        initial_lr=0.001,
        decay_steps=1000,
        decay_rate=0.5,
        clipnorm=1.0,
        callbacks=None
    ):
        model = Sequential([
            Embedding(self.total_words, embed_dim, input_length=self.seq_len),
            Bidirectional(LSTM(lstm_units, return_sequences=True)),
            Dropout(dropout_rate),
            LSTM(lstm_units),
            Dropout(dropout_rate),
            Dense(self.total_words, activation="softmax"),
        ])

        optimizer = self.make_optimizer(initial_lr, decay_steps, decay_rate, clipnorm=clipnorm)

        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=optimizer,
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]
        )

        self.model = model
        print(self.model.summary())

        history = self.model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            callbacks=(callbacks or []),
            verbose=1
        )
        print("Training complete.")

        best_val_loss = float(np.min(history.history.get("val_loss", [np.nan])))
        if not np.isnan(best_val_loss):
            print(f"Best val_loss: {best_val_loss:.4f} | Perplexity ≈ {math.exp(best_val_loss):.2f}")

    # ---------------------------
    # Save model + tokenizer
    # ---------------------------
    def save_model_keras(self, keras_path: str = "hamlet_nextword.keras",
                         save_tokenizer: bool = True,
                         tokenizer_path: str = "tokenizer.json"):
        if self.model is None:
            raise RuntimeError("Model is not trained yet. Train before saving.")

        self.model.save(keras_path)
        print(f"✅ Saved Keras model: {keras_path}")

        if save_tokenizer and self.tokenizer is not None:
            tok_json = self.tokenizer.to_json()
            with open(tokenizer_path, "w", encoding="utf-8") as f:
                f.write(tok_json)
            print(f"✅ Saved tokenizer: {tokenizer_path}")

    # ---------------------------
    # Generation helper
    # ---------------------------
    def generate_text(self, seed_text: str, next_words: int = 20, temperature: float = 1.0):
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Train the model first before generating text.")

        def sample_with_temperature(probs, temp):
            probs = np.asarray(probs).astype("float64")
            probs = np.log(probs + 1e-8) / max(temp, 1e-6)
            probs = np.exp(probs) / np.sum(np.exp(probs))
            return int(np.argmax(np.random.multinomial(1, probs, 1)))

        seed_lemmatized = self._lemmatize_line(seed_text)

        for _ in range(next_words):
            token_list = self.tokenizer.texts_to_sequences([seed_lemmatized])[0]
            token_list = pad_sequences([token_list], maxlen=self.seq_len, padding="pre")
            preds = self.model.predict(token_list, verbose=0)[0]

            next_index = sample_with_temperature(preds, temperature) if temperature and temperature != 1.0 else int(np.argmax(preds))
            next_word = self.tokenizer.index_word.get(next_index, "")
            if not next_word:
                break
            seed_lemmatized += " " + next_word

        return seed_lemmatized


In [None]:
# ---------------------------
# Run training
# ---------------------------
lstm = AdvanceLSTMRNN()
lstm.loadDataSet()

# Example generation after training:
# print(lstm.generate_text("to be or not to be", next_words=30, temperature=0.8))


Initializing LSTM RNN Model Training
Total words (lemmatized): 4304
Total sequences (seq_len=30): 3875
Shapes -> X_train: (3487, 30) y_train: (3487,)
✅ TensorBoard logs: logs/hamlet_nextword/20251230-131302


None
Epoch 1/35
[1m 9/28[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m11s[0m 624ms/step - accuracy: 0.0141 - loss: 8.3119