In [2]:
# import basic libraries and show versions for debugging
import os                                         # access filesystem
import math                                       # math functions (exp)
import random                                     # random choices for sampling
import time                                       # measure elapsed time
import numpy as np                                # numeric arrays
import matplotlib.pyplot as plt                   # plotting
import torch                                      # main PyTorch package
import torch.nn as nn                             # neural network modules
from torch.utils.data import Dataset, DataLoader  # dataset and dataloader utilities

# Report library versions and GPU availability so the environment is easy to confirm.
print("numpy", np.__version__)                     # show numpy version
print("torch", torch.__version__)                  # show torch version
print("cuda available:", torch.cuda.is_available())# show if GPU is available

# Pick the compute device once, here, so that a fresh "Restart & Run All" works:
# the model-construction, evaluation, training and sampling cells all read `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG the notebook draws from (DataLoader shuffling and weight init use torch,
# text sampling uses numpy) so that repeated runs are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


numpy 2.0.2
torch 2.8.0+cu126
cuda available: True


In [6]:
# Location of the cleaned script on disk; edit this if the file lives in another folder.
file_path = "/kaggle/input/office-script-clean/office_script_clean.txt"

# Pull the entire script into one in-memory string.
with open(file_path, mode="r", encoding="utf-8") as f:
    raw_text = f.read()

# Collapse Windows-style (\r\n) and bare carriage-return (\r) line endings into '\n'.
raw_text = raw_text.replace("\r\n", "\n")
raw_text = raw_text.replace("\r", "\n")

# Sanity output: total size of what was read, followed by the first 800 characters.
print("Loaded characters:", len(raw_text))
print(raw_text[:800])

Loaded characters: 3427466
Michael: All right Jim. Your quarterlies look very good. How are things at the library?
Jim: Oh, I told you. I couldn't close it. So...
Michael: So you've come to the master for guidance? Is this what you're saying, grasshopper?
Jim: Actually, you called me in here, but yeah.
Michael: All right. Well, let me show you how it's done.
Michael:  Yes, I'd like to speak to your office manager, please. Yes, hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just wanted to talk to you manager-a-manger.  All right. Done deal. Thank you very much, sir. You're a gentleman and a scholar. Oh, I'm sorry. OK. I'm sorry. My mistake.  That was a woman I was talking to, so... She had a very low voice. Probably a smoker, so...  So that's the way it's done.
Michael: I've


In [7]:
# Character vocabulary: every distinct character in the text, in sorted (stable) order.
vocab = sorted(set(raw_text))

# Lookup tables in both directions between characters and integer ids.
stoi = dict(zip(vocab, range(len(vocab))))          # char -> id
itos = dict(enumerate(vocab))                       # id -> char

# Encode the whole text as one int64 array of character ids.
data_ids = np.fromiter((stoi[ch] for ch in raw_text), dtype=np.int64)

# Dataset statistics used by the cells below.
vocab_size = len(vocab)                             # number of distinct characters
total_chars = len(raw_text)                         # length of the text in characters
context_length = 128                                # window length fed to the model (tunable)

# Echo the numbers so they can be checked against the dataset.
print("vocab size:", vocab_size)
print("total characters:", total_chars)
print("context length (suggested):", context_length)

vocab size: 72
total characters: 3427466
context length (suggested): 128


In [8]:
# Boundary index: the first 90% of the encoded ids train the model, the rest validate it.
split_at = int(0.9 * len(data_ids))

# Slice the id array at that boundary (contiguous split, no shuffling).
train_ids, val_ids = data_ids[:split_at], data_ids[split_at:]

# Sliding-window character dataset: sample i is the window starting at position i.
class CharDataset(Dataset):
    """Serve (input, target) windows over a 1-D array of character ids.

    For index ``idx`` the input is ``arr[idx : idx + context_len]`` and the target
    is the same window shifted one position to the right, so the target at each
    position is the character that follows the corresponding input character.
    """

    def __init__(self, arr, context_len):
        """Keep a reference to the id array and remember the window length."""
        self.arr = arr
        self.context_len = context_len

    def __len__(self):
        """Number of start positions that yield a full input and a full target (never negative)."""
        n_windows = len(self.arr) - self.context_len
        return n_windows if n_windows > 0 else 0

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair as two ``torch.long`` tensors."""
        span = self.context_len
        inputs = self.arr[idx: idx + span]              # window starting at idx
        targets = self.arr[idx + 1: idx + 1 + span]     # same window shifted right by one
        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )

# Wrap each id array in a sliding-window dataset of length `context_length`.
train_dataset = CharDataset(train_ids, context_length)
val_dataset = CharDataset(val_ids, context_length)

# Mini-batch size (tune for your GPU/CPU).
batch_size = 64

# Training batches are reshuffled every epoch; validation keeps its natural order.
# Both loaders drop the final incomplete batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)

# Sanity check: number of batches one pass over each loader produces.
print("Train batches per epoch:", len(train_loader))
print("Val batches per epoch:", len(val_loader))



Train batches per epoch: 48196
Val batches per epoch: 5353


In [14]:
# Hyperparameters for the full-script run.
embed_size = 128                          # width of each character embedding
hidden_size = 512                         # LSTM hidden-state width
n_layers = 2                              # number of stacked LSTM layers
learning_rate = 0.0015                    # Adam step size
num_epochs = 40                           # passes over the training loader
checkpoint_path = "lstm_modules_full.pt"  # file the training loop writes checkpoints to

# The model is three separate modules, each placed on `device`:
# ids -> embeddings -> LSTM outputs -> per-character logits.
embed = nn.Embedding(vocab_size, embed_size).to(device)
lstm = nn.LSTM(embed_size, hidden_size, n_layers, batch_first=True).to(device)
fc = nn.Linear(hidden_size, vocab_size).to(device)

# One flat list of every trainable tensor, shared by the optimizer and gradient clipping.
params = [p for module in (embed, lstm, fc) for p in module.parameters()]
optimizer = torch.optim.Adam(params, lr=learning_rate)
criterion = nn.CrossEntropyLoss()         # mean cross-entropy over the flattened logits

# Short summary of the configuration that is about to be trained.
print("Device:", device)
print("Vocab size:", vocab_size)
print("Parameter count:", sum(tensor.numel() for tensor in params))
print("Train batches per epoch:", len(train_loader))
print("Val batches per epoch:", len(val_loader))

Device: cuda
Vocab size: 72
Parameter count: 3462216
Train batches per epoch: 48196
Val batches per epoch: 5353


In [None]:
def evaluate_modules(embed, lstm, fc, dataloader):
    """
    Compute average negative log-likelihood (NLL) and perplexity on dataloader.

    Args:
        embed, lstm, fc: the embedding, recurrent and output modules; they are
            switched to eval mode and left in eval mode on return.
        dataloader: iterable of (inputs, targets) batches of character ids,
            each of shape (B, T).

    Returns:
        (avg_nll, perplexity) where avg_nll is the token-weighted mean of the
        per-batch loss and perplexity is exp(avg_nll). If the dataloader yields
        no batches (e.g. drop_last=True on a split smaller than one batch), a
        warning is issued and (nan, nan) is returned instead of dividing by zero.

    Uses the notebook-level `criterion` as the loss function.
    """
    import warnings  # local import: only needed for the empty-loader warning

    # set modules to eval mode (affects dropout/batchnorm if present)
    for module in (embed, lstm, fc):
        module.eval()

    # Run on whatever device the model lives on, rather than relying on a global.
    dev = next(embed.parameters()).device

    total_loss = 0.0                 # accumulator for loss * token_count
    total_tokens = 0                 # total token count seen

    with torch.no_grad():            # no gradients needed for evaluation
        for xb, yb in dataloader:
            xb = xb.to(dev)
            yb = yb.to(dev)

            out, _ = lstm(embed(xb))             # (B, T, H) LSTM outputs, final state ignored
            logits = fc(out)                     # (B, T, V) logits over the vocabulary

            B, T, V = logits.shape
            # criterion averages over the B*T flattened positions, so re-weight by B*T
            loss = criterion(logits.reshape(B * T, V), yb.reshape(B * T))
            total_loss += loss.item() * (B * T)
            total_tokens += B * T

    if total_tokens == 0:
        warnings.warn("evaluate_modules: dataloader produced no batches; returning NaN metrics")
        return float("nan"), float("nan")

    avg_nll = total_loss / total_tokens        # average negative log-likelihood per token
    ppl = math.exp(avg_nll)                    # perplexity = exp(avg NLL)
    return avg_nll, ppl

In [15]:
def train_modules_with_history(embed, lstm, fc, train_loader, val_loader, num_epochs, save_path,
                               optim=None, full_train_eval=True):
    """
    Train the embed/lstm/fc modules and return per-epoch histories.

    Args:
        embed, lstm, fc: modules to train; moved to the notebook-level `device`.
        train_loader, val_loader: iterables of (inputs, targets) id batches of shape (B, T).
        num_epochs: number of passes over train_loader.
        save_path: file that is overwritten with a checkpoint after every epoch.
        optim: optimizer to step. Defaults to the notebook-level `optimizer`,
            which is what this function always used before this argument existed.
        full_train_eval: when True (default, the original behaviour) the train
            metrics come from a second, gradient-free pass over train_loader via
            evaluate_modules. When False that pass is skipped and the
            token-weighted running average of the training-step losses is used.

    Returns:
        (train_losses, val_losses, train_ppls, val_ppls), one entry per epoch.

    Also relies on the notebook-level `criterion`, `stoi` and `itos`.
    """
    opt = optimizer if optim is None else optim       # fall back to the notebook-level optimizer
    modules = (embed, lstm, fc)
    for module in modules:
        module.to(device)                             # ensure modules are on the correct device

    # Clip the gradients of the modules that were actually passed in, not of a global list.
    clip_params = [p for module in modules for p in module.parameters()]

    train_losses = []      # average train NLL per epoch
    val_losses = []        # average val NLL per epoch
    train_ppls = []        # train perplexity per epoch
    val_ppls = []          # val perplexity per epoch

    for epoch in range(num_epochs):
        for module in modules:
            module.train()                            # evaluate_modules leaves them in eval mode
        running_loss = 0.0                            # sum of batch loss * tokens in batch
        running_tokens = 0                            # tokens seen this epoch
        t0 = time.time()                              # epoch start time

        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            opt.zero_grad()                           # clear gradients from the previous step

            seq_out, _ = lstm(embed(xb))              # (B, T, H) LSTM outputs
            logits = fc(seq_out)                      # (B, T, V) logits

            B, T, V = logits.shape
            loss = criterion(logits.reshape(B * T, V), yb.reshape(B * T))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(clip_params, 1.0)  # cap gradient norm for stability
            opt.step()

            running_loss += loss.item() * (B * T)
            running_tokens += B * T

        # metrics for this epoch
        if full_train_eval:
            train_loss, train_ppl = evaluate_modules(embed, lstm, fc, train_loader)
        else:
            # Average of losses measured while the weights were still changing.
            train_loss = running_loss / running_tokens if running_tokens else float("nan")
            train_ppl = math.exp(train_loss)
        val_loss, val_ppl = evaluate_modules(embed, lstm, fc, val_loader)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_ppls.append(train_ppl)
        val_ppls.append(val_ppl)

        t1 = time.time()                               # epoch end time (includes evaluation)
        # print concise progress line with numbers you care about
        print(f"Epoch {epoch+1}/{num_epochs} | train_loss {train_loss:.4f} ppl {train_ppl:.3f} | val_loss {val_loss:.4f} ppl {val_ppl:.3f} | time {t1-t0:.1f}s")

        # checkpoint: module weights plus the vocabulary mappings and latest metrics
        torch.save({
            'embed_state': embed.state_dict(), 'lstm_state': lstm.state_dict(), 'fc_state': fc.state_dict(),
            'stoi': stoi, 'itos': itos, 'epoch': epoch+1, 'train_loss': train_loss, 'val_loss': val_loss, 'val_ppl': val_ppl
        }, save_path)

    return train_losses, val_losses, train_ppls, val_ppls




In [16]:
def _sample_with_temperature(log_scores, temperature):
    """Draw one index from softmax(log_scores / temperature) using NumPy's global RNG."""
    scaled = np.asarray(log_scores, dtype=np.float64) / temperature
    weights = np.exp(scaled - scaled.max())            # subtract max for numerical stability
    return int(np.random.choice(len(weights), p=weights / weights.sum()))


def _build_bigram_model(train_text, alpha=1e-3):
    """Build additively-smoothed bigram and unigram character tables from train_text."""
    if not train_text:
        raise ValueError("cannot build a bigram model from empty text")
    chars = sorted(set(train_text))                     # bigram vocabulary
    stoi_big = {ch: i for i, ch in enumerate(chars)}    # char -> id
    itos_big = dict(enumerate(chars))                   # id -> char
    V = len(chars)

    ids = np.fromiter((stoi_big[ch] for ch in train_text), dtype=np.int64)
    # Encode each consecutive pair (a, b) as a*V + b and histogram it: counts[a, b].
    counts = np.bincount(ids[:-1] * V + ids[1:], minlength=V * V).reshape(V, V).astype(np.float64)
    unigram = np.bincount(ids, minlength=V).astype(np.float64)

    bigram_probs = (counts + alpha) / (counts.sum(axis=1, keepdims=True) + alpha * V)  # row-normalize
    unigram_probs = (unigram + alpha) / (unigram.sum() + alpha * V)
    return {'chars': chars, 'stoi': stoi_big, 'itos': itos_big,
            'bigram_probs': bigram_probs, 'unigram_probs': unigram_probs}


def generate_script(seed_text, temperature, num_tokens_to_generate):
    """
    Generate characters using saved LSTM modules if checkpoint exists.
    If no checkpoint found, fall back to bigram sampling built from training text.
    Signature required by the task: generate_script(seed_text, temperature, num_tokens_to_generate)

    Args:
        seed_text: text to continue; an empty seed is replaced by a newline.
        temperature: softmax temperature; values <= 0 are replaced by 1e-8,
            which makes sampling effectively greedy.
        num_tokens_to_generate: number of characters to append to the seed.

    Returns:
        The seed followed by the generated characters.

    Side effects:
        With a checkpoint present, its weights are loaded into the notebook-level
        `embed`, `lstm` and `fc` modules, which are left in eval mode. Without
        one, the bigram tables are built on first use and cached in the
        notebook-level name `bigram`.
    """
    global bigram  # the fallback model is cached at notebook level between calls

    if len(seed_text) == 0:                            # if seed is empty, start with newline
        seed_text = "\n"
    if temperature <= 0.0:                             # guard against division by zero / negative T
        temperature = 1e-8

    pieces = [seed_text]                               # output is assembled once at the end

    if os.path.exists(checkpoint_path):                # a trained checkpoint file exists
        ck = torch.load(checkpoint_path, map_location=device)
        embed.load_state_dict(ck['embed_state'])       # restore embedding weights
        lstm.load_state_dict(ck['lstm_state'])         # restore LSTM weights
        fc.load_state_dict(ck['fc_state'])             # restore output-layer weights
        for module in (embed, lstm, fc):
            module.to(device)
            module.eval()

        # Prefer the vocabulary stored with the weights; fall back to the notebook-level tables.
        char_to_id = ck.get('stoi', stoi)
        id_to_char = ck.get('itos', itos)

        # map seed chars to ids (unknown -> 0)
        input_ids = [char_to_id.get(ch, 0) for ch in seed_text]

        with torch.no_grad():                          # no gradients needed while sampling
            seed = torch.tensor([input_ids], dtype=torch.long, device=device)   # (1, L)
            # Run the seed through the LSTM once. The output at the last position already
            # reflects the final seed character, so that character must not be fed again.
            out, hidden = lstm(embed(seed))

            for _ in range(num_tokens_to_generate):
                # The LSTM output is a hidden vector; fc turns it into vocabulary logits.
                logits = fc(out[:, -1, :])[0]                                   # (V,)
                next_id = _sample_with_temperature(logits.cpu().numpy(), temperature)
                pieces.append(id_to_char[next_id])

                step = torch.tensor([[next_id]], dtype=torch.long, device=device)  # (1, 1)
                out, hidden = lstm(embed(step), hidden)                         # advance one step

        return "".join(pieces)                         # generated text from the LSTM

    # fallback: build (or reuse) the bigram model when there is no LSTM checkpoint
    if 'bigram' not in globals():
        # built from the same leading 90% of raw_text that is used for training
        bigram = _build_bigram_model(raw_text[:int(0.9 * len(raw_text))])

    last_char = seed_text[-1]                          # condition on the last char of the seed
    for _ in range(num_tokens_to_generate):
        row = bigram['stoi'].get(last_char)
        if row is not None:
            probs = bigram['bigram_probs'][row]        # distribution over the next char
        else:
            probs = bigram['unigram_probs']            # unknown char: fall back to unigram
        # temperature is applied in log space; the epsilon avoids log(0)
        next_i = _sample_with_temperature(np.log(probs + 1e-12), temperature)
        last_char = bigram['itos'][next_i]
        pieces.append(last_char)

    return "".join(pieces)                             # bigram-generated text


In [None]:
# Train the model: runs every epoch, rewrites the checkpoint after each one,
# and keeps the four per-epoch metric histories.
(hist_train_losses,
 hist_val_losses,
 hist_train_ppls,
 hist_val_ppls) = train_modules_with_history(
    embed, lstm, fc,
    train_loader, val_loader,
    num_epochs, checkpoint_path,
)

Epoch 1/40 | train_loss 0.6826 ppl 1.979 | val_loss 1.6957 ppl 5.450 | time 4194.2s
Epoch 2/40 | train_loss 0.7107 ppl 2.035 | val_loss 1.6527 ppl 5.221 | time 4189.6s
Epoch 3/40 | train_loss 0.8593 ppl 2.361 | val_loss 1.4673 ppl 4.337 | time 4190.5s
