# ðŸ§ª PyTorch Lab 7: Language Modeling


This is a very large lab session. We will go over manipulating textual data with deep learning. 

## 0. Setup

In [1]:
from pathlib import Path
from urllib.request import urlopen
import nltk

#You might need to download nltlk and run these once : 
#nltk.download('punkt_tab')
#nltk.download("punkt")


## 1. Download this book from projet gutenberg
We'll start by dowloading Alice in wonderland from the gutenberg project. Just use my code and check what it outputs. It's quite easy to understand.

In [2]:
URL = "https://www.gutenberg.org/ebooks/11.txt.utf-8"
OUT_SENTENCES = Path("Alice_in_Wonderland_sentences.txt")
OUT_WORDS = Path("Alice_in_Wonderland_words.txt")


def strip_gutenberg_boilerplate(text: str) -> str:
    start_marks = [
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "***START OF THE PROJECT GUTENBERG EBOOK",
        "*** START OF THE PROJECT GUTENBERG EBOOK",
    ]
    end_marks = [
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "***END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THE PROJECT GUTENBERG EBOOK",
    ]

    start_idx = 0
    for mark in start_marks:
        i = text.find(mark)
        if i != -1:
            start_idx = text.find("\n", i) + 1
            break

    end_idx = len(text)
    for mark in end_marks:
        i = text.find(mark)
        if i != -1:
            end_idx = text.rfind("\n", 0, i)
            break

    return text[start_idx:end_idx].strip()

with urlopen(URL) as r:
    raw = r.read().decode("utf-8", errors="replace")


cleaned = strip_gutenberg_boilerplate(raw)



In [3]:
PAD_ID = 0
UNK_ID = 1
START_ID = 2
STOP_ID = 3

def tokenize_words(s: str):
    # one place to define your tokenization rules
    return [w for w in nltk.word_tokenize(s.lower()) if w.isalpha()]
    
def text_to_sentences(text: str):
    text = " ".join(text.split())  # normalize whitespace
    return nltk.sent_tokenize(text)


def build_vocab_from_sentences(sentences):
    vocab = sorted({w for s in sentences for w in tokenize_words(s)})
    # leave 0 for PAD and 1 for UNK
    word2idx = {w: i + 4 for i, w in enumerate(vocab)}
    word2idx["<unk>"] = UNK_ID
    word2idx["<start>"] = START_ID
    word2idx["<stop>"] = STOP_ID
    word2idx["<pad>"] = PAD_ID
    return word2idx, {v: k for k, v in word2idx.items()}

def sentences_to_matrix(sentences, word2idx, pad_id=PAD_ID, unk_id=UNK_ID):
    tokenized = [tokenize_words(s) for s in sentences]
    max_len = max((len(seq) for seq in tokenized), default=0) + 2
    matrix = [
        [2] + [word2idx.get(w, unk_id) for w in seq] + [3] + [pad_id] * (max_len - len(seq) - 2)
        for seq in tokenized
    ]
    return matrix
    

def encode(tokens, word2idx):
    return [word2idx.get(w, UNK_ID) for w in tokens]

def decode(ids, idx2word):
    return " ".join(idx2word.get(i, "<unk>") for i in ids)

sentences = text_to_sentences(cleaned)
word2idx,idx2word = build_vocab_from_sentences(sentences)
matrix = sentences_to_matrix(sentences, word2idx)
print(matrix[0])
print(decode(matrix[0], idx2word))

[2, 1028, 56, 1761, 32, 1040, 2410, 267, 1189, 291, 2113, 1309, 825, 613, 405, 318, 1020, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
<start> illustration alice s adventures in wonderland by lewis carroll the millennium fulcrum edition contents chapter i <stop> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p

## 2. Creation of a simple model for text generation

In [4]:
# --- Simple RNN LM (autoregressive) with PyTorch ---
import math
import random
from typing import List, Optional

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build dataset: inputs are seq[:-1], targets are seq[1:]
class LMDataset(Dataset):
    def __init__(self, sequences):
        self.seqs = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        # all rows already padded to same length by sentences_to_matrix

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, i):
        s = self.seqs[i]
        x = s[:-1]  # [T-1]
        y = s[1:]   # next-token targets [T-1]
        return x, y

train_ds = LMDataset(matrix)

def collate(batch):
    xs, ys = zip(*batch)
    return torch.stack(xs, 0), torch.stack(ys, 0)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate)

In [5]:
class SimpleRNNLM(nn.Module):
    def __init__(self, vocab, emb, hid, layers=1, dropout=0.0, padding_idx=PAD_ID):
        super().__init__()
        self.embed = nn.Embedding(vocab, emb, padding_idx=padding_idx)
        self.rnn = nn.RNN(
            input_size=emb,
            hidden_size=hid,
            num_layers=layers,
            batch_first=True,
            nonlinearity="tanh",
            dropout=dropout if layers > 1 else 0.0,
        )
        self.proj = nn.Linear(hid, vocab)

    def forward(self, x, h0: Optional[torch.Tensor] = None):
        # x: [B, T]
        emb = self.embed(x)                 # [B, T, E]
        out, h = self.rnn(emb, h0)          # out: [B, T, H]
        logits = self.proj(out)             # [B, T, V]
        return logits, h

vocab_size = max(idx2word.keys()) + 1  # idx2word is {id: token}
embedding_dim = 128
hidden_size = 256
num_layers = 1
dropout = 0.1
lr = 2e-3
epochs = 10

model = SimpleRNNLM(vocab_size, embedding_dim, hidden_size, num_layers, dropout).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)  # don't learn on PAD targets
optim = torch.optim.AdamW(model.parameters(), lr=lr)

def evaluate_perplexity(loader):
    model.eval()
    total_nll, total_tokens = 0.0, 0
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device); y = y.to(device)
            logits, _ = model(x)
            # flatten for CE: [(B*T), V] vs [(B*T)]
            loss = criterion(logits.reshape(-1, vocab_size), y.reshape(-1))
            # sum nll over non-PAD positions
            mask = (y.reshape(-1) != PAD_ID).float()
            total_nll += (loss.item() * mask.numel())  # CE returns mean over all positions, adjust below
            total_tokens += int(mask.sum().item())
    if total_tokens == 0:
        return float("inf")
    # Recompute using running loss over all items; simpler: do a fresh pass summing token-level nlls
    # For simplicity, we approximate perplexity from batch means:
    avg_nll = total_nll / max(total_tokens, 1)
    return math.exp(avg_nll)

# --- Train ---
for epoch in range(1, epochs + 1):
    model.train()
    running = 0.0
    steps = 0
    for x, y in train_loader:
        x = x.to(device); y = y.to(device)
        logits, _ = model(x)  # [B, T, V]
        loss = criterion(logits.reshape(-1, vocab_size), y.reshape(-1))
        optim.zero_grad(set_to_none=True)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        running += loss.item()
        steps += 1
    ppl = evaluate_perplexity(train_loader)
    print(f"epoch {epoch}/{epochs} - loss {running/steps:.4f} - approx ppl {ppl:.2f}")

# --- Autoregressive generation ---
@torch.no_grad()
def sample(
    prompt: str = "",
    max_new_tokens: int = 50,
    temperature: float = 1.0,
    top_k: Optional[int] = 50,
    stop_id: int = STOP_ID,
):
    """
    Generate tokens autoregressively.
    - If prompt is empty, we start with <start>.
    - Uses single-step RNN updates to carry hidden state.
    """
    model.eval()
    # Build initial context
    if prompt.strip():
        init_tokens = [START_ID] + encode(tokenize_words(prompt), word2idx)
    else:
        init_tokens = [START_ID]

    x = torch.tensor([init_tokens], dtype=torch.long, device=device)  # [1, T]
    # Prime the hidden state by running the context through the RNN
    logits, h = model(x)
    next_id = int(torch.argmax(logits[0, -1]).item())

    generated: List[int] = init_tokens.copy()

    for _ in range(max_new_tokens):
        # Feed only last token each step (true autoregressive)
        last = torch.tensor([[generated[-1]]], dtype=torch.long, device=device)  # [1,1]
        out, h = model.embed(last), h  # reuse hidden state
        out, h = model.rnn(out, h)     # [1,1,H], h updated
        logits = model.proj(out[:, -1, :])  # [1,V]
        logits = logits / max(temperature, 1e-6)

        # Optional top-k filtering
        if top_k is not None and top_k > 0:
            top_vals, top_idx = torch.topk(logits, k=min(top_k, logits.size(-1)), dim=-1)
            probs = torch.softmax(top_vals, dim=-1)
            next_token = top_idx.gather(-1, torch.multinomial(probs, num_samples=1))
        else:
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

        next_id = int(next_token.item())
        generated.append(next_id)
        if next_id == stop_id:
            break

    # Convert to text, skipping specials
    specials = {PAD_ID, UNK_ID, START_ID, STOP_ID}
    words = [idx2word.get(i, "<unk>") for i in generated if i not in specials]
    return " ".join(words)

# --- Quick demo ---
print("\n=== Samples ===")
print(sample("Once upon a time", max_new_tokens=30, temperature=0.8))
print(sample("", max_new_tokens=30, temperature=1.0))

epoch 1/10 - loss 6.3204 - approx ppl 42277614671946533830656.00
epoch 2/10 - loss 5.5697 - approx ppl 790742502726075940864.00
epoch 3/10 - loss 5.1748 - approx ppl 25883936047487299584.00
epoch 4/10 - loss 4.8449 - approx ppl 1296077791025340160.00
epoch 5/10 - loss 4.5472 - approx ppl 101794349844396592.00
epoch 6/10 - loss 4.2897 - approx ppl 9263774019950408.00
epoch 7/10 - loss 4.0405 - approx ppl 929125014946729.75
epoch 8/10 - loss 3.8044 - approx ppl 111862099005958.56
epoch 9/10 - loss 3.5809 - approx ppl 16579394498375.17
epoch 10/10 - loss 3.3714 - approx ppl 2065627008471.10

=== Samples ===
once upon a time to be very glad to look in the sea and the baby began to make out of the lobster quadrille the king went on with a pair of mushroom and
the king


In [6]:
def create_positional(max_L,emb):
        posit = torch.zeros(max_L,emb)
        for i in range(emb):
            if (i % 2) == 0:
                posit[0,i] = 0
            if (i % 2) == 1:
                posit[0,i] = 1
        for k in range(1,max_L):
            posit[k,0] = 0
            for i in range(emb):
                if (i % 2) == 0:
                    posit[k,i] = math.sin(k/(math.pow(k,(i/emb))))
                if (i % 2) == 1:
                    posit[k,i] = math.cos(k/(math.pow(k,((i-1)/emb))))
        return posit
    
class SimpleTransformer(nn.Module):
    def __init__(self, max_L, vocab, emb, hid, layers=1, dropout=0.0, padding_idx=PAD_ID):
        super().__init__()
        self.embed = nn.Embedding(vocab, emb, padding_idx=padding_idx)
        self.util_emb = math.pow(emb,1/2)
        self.positional = create_positional(max_L,emb)
        self.key = torch.nn.Linear(emb,emb)
        self.query = torch.nn.Linear(emb,emb)
        self.value = torch.nn.Linear(emb,emb)
        self.proj = torch.nn.Linear(emb,emb)

        self.out = nn.Linear(emb, vocab, bias=False)
        self.out.weight = self.embed.weight
        
    
        
    def forward(self, x, h0: Optional[torch.Tensor] = None):
        # x: [B, T]
        x = self.embed(x)                 # [B, T, E]
        
        pos = self.positional[:x.shape[1],:x.shape[2]]
        x += pos
        
        key = self.key(x)
        value = self.value(x)
        query = self.query(x)

        attention = (query @ key.transpose(-1, -2)) / self.util_emb
        mask = torch.tril(torch.ones(attention.shape[1], attention.shape[2]))

        attention = attention.masked_fill(mask == 0, float('-inf'))
        attention = torch.softmax(attention, dim=-1)

        x = x + self.proj(attention @ value)

        return self.out(x)


max_L = 512
model = SimpleTransformer(max_L, vocab_size, embedding_dim, hidden_size, num_layers, dropout).to(device)

x,y = next(iter(train_loader))
out = model(x)

In [9]:
vocab_size = max(idx2word.keys()) + 1  # idx2word is {id: token}
embedding_dim = 128
lr = 5e-3
epochs = 30
max_L = 512

model = SimpleTransformer(max_L, vocab_size, embedding_dim, hidden_size, num_layers, dropout).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)  # don't learn on PAD targets
optim = torch.optim.AdamW(model.parameters(), lr=lr)

def evaluate_perplexity(loader):
    model.eval()
    total_nll, total_tokens = 0.0, 0
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device); y = y.to(device)
            logits = model(x)
            # flatten for CE: [(B*T), V] vs [(B*T)]
            loss = criterion(logits.reshape(-1, vocab_size), y.reshape(-1))
            # sum nll over non-PAD positions
            mask = (y.reshape(-1) != PAD_ID).float()
            total_nll += (loss.item() * mask.numel())  # CE returns mean over all positions, adjust below
            total_tokens += int(mask.sum().item())
    if total_tokens == 0:
        return float("inf")
    # Recompute using running loss over all items; simpler: do a fresh pass summing token-level nlls
    # For simplicity, we approximate perplexity from batch means:
    avg_nll = total_nll / max(total_tokens, 1)
    return math.exp(avg_nll)

# --- Train ---
for epoch in range(1, epochs + 1):
    model.train()
    running = 0.0
    steps = 0
    for x, y in train_loader:
        x = x.to(device); y = y.to(device)
        logits = model(x)  # [B, T, V]
        loss = criterion(logits.reshape(-1, vocab_size), y.reshape(-1))
        optim.zero_grad(set_to_none=True)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        running += loss.item()
        steps += 1
    ppl = 0#evaluate_perplexity(train_loader)
    print(f"epoch {epoch}/{epochs} - loss {running/steps:.4f} - approx ppl {ppl:.2f}")


epoch 1/30 - loss 84.4682 - approx ppl 0.00
epoch 2/30 - loss 38.2515 - approx ppl 0.00
epoch 3/30 - loss 14.6490 - approx ppl 0.00
epoch 4/30 - loss 9.3704 - approx ppl 0.00
epoch 5/30 - loss 7.7072 - approx ppl 0.00
epoch 6/30 - loss 6.9600 - approx ppl 0.00
epoch 7/30 - loss 6.5870 - approx ppl 0.00
epoch 8/30 - loss 6.3233 - approx ppl 0.00
epoch 9/30 - loss 6.1111 - approx ppl 0.00
epoch 10/30 - loss 5.9842 - approx ppl 0.00
epoch 11/30 - loss 5.8237 - approx ppl 0.00
epoch 12/30 - loss 5.7195 - approx ppl 0.00
epoch 13/30 - loss 5.6078 - approx ppl 0.00
epoch 14/30 - loss 5.5048 - approx ppl 0.00
epoch 15/30 - loss 5.3920 - approx ppl 0.00
epoch 16/30 - loss 5.3000 - approx ppl 0.00
epoch 17/30 - loss 5.2054 - approx ppl 0.00
epoch 18/30 - loss 5.0696 - approx ppl 0.00
epoch 19/30 - loss 4.9782 - approx ppl 0.00
epoch 20/30 - loss 4.8858 - approx ppl 0.00
epoch 21/30 - loss 4.7973 - approx ppl 0.00
epoch 22/30 - loss 4.7053 - approx ppl 0.00
epoch 23/30 - loss 4.6363 - approx ppl

In [10]:
# --- Autoregressive generation ---
@torch.no_grad()
def sample(
    prompt: str = "",
    max_new_tokens: int = 50,
    temperature: float = 1.0,
    top_k: Optional[int] = 50,
    stop_id: int = STOP_ID,
):

    model.eval()
    # Build initial context
    if prompt.strip():
        init_tokens = [START_ID] + encode(tokenize_words(prompt), word2idx)
    else:
        init_tokens = [START_ID]
    seq = [init_tokens]
    
    print(seq)
    for _ in range(max_new_tokens):
        x = torch.tensor(seq, dtype=torch.long, device=device)  # [1, T]
        # Prime the hidden state by running the context through the RNN
        logits = model(x)
        next_id = int(torch.argmax(logits[0, -1]).item())
        seq = [seq[0] + [next_id]]  
        
    specials = {PAD_ID, UNK_ID, START_ID, STOP_ID}
    words = [idx2word.get(i, "<unk>") for i in seq[0] if i not in specials]
    
    return " ".join(words)

# --- Quick demo ---
print("\n=== Samples ===")
print(sample("Once upon a time", max_new_tokens=30, temperature=0.8))
print(sample("", max_new_tokens=30, temperature=1.0))


=== Samples ===
[[2, 1433, 2274, 4, 2157]]
once upon a time the queen pleasant temper of the mock turtle pleasant temper pleasant temper of the hatter and the queen pleasant temper of the door of bathing machines
[[2]]
and she went on the queen pleasant temper of the mock turtle pleasant temper pleasant temper of the hatter and the queen pleasant temper of the
