# 🌟 Master SLM Notebook: From Scratch Long-Context Model

This master notebook consolidates the entire pipeline for building a 300M-parameter Small Language Model (SLM) trained **exclusively on pre-1986 knowledge**.

### 🛠️ Pipeline Stages:
1. **📊 Data Exploration**: Analyze pre-1986 training streams.
2. **🔤 Tokenizer Training**: Build a custom BPE tokenizer.
3. **🏗️ Model Architecture**: Implement RoPE, Block-Local Attention, and Transformer blocks from scratch.
4. **🚀 Training Pipeline**: Execute the 3-phase curriculum learning (Pretrain → Context Extend → Fine-tune).
5. **🔬 Evaluation**: Perform zero-shot reasoning tests.

---
# 1. Setup & Configuration

Initialize environment, imports, and reproducibility seeds.

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import time
import json
import matplotlib.pyplot as plt
from pathlib import Path
from collections import Counter
from dataclasses import dataclass
from typing import Optional, Tuple, List
from torch.utils.data import Dataset, DataLoader

# Install dependencies if missing (for Colab)
# NOTE: the `!pip ...` line below is IPython/Jupyter shell syntax, so this cell
# only runs inside a notebook kernel, not as a plain Python script.
# NOTE(review): the two branches bind different names -- the success path
# imports the `models`, `trainers` and `pre_tokenizers` submodules, while the
# fallback imports `BPE`, `BpeTrainer` and `Whitespace` directly. Only
# `Tokenizer` is bound on both paths.
try:
    from tokenizers import Tokenizer, models, trainers, pre_tokenizers
except ImportError:
    !pip install tokenizers
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer
    from tokenizers.pre_tokenizers import Whitespace

# Device Setup
# Probe CUDA once and reuse the answer for device selection and seeding.
_cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ok else "cpu")
print(f"Using device: {device}")
if _cuda_ok:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Reproducibility
# Seed the default generator, then every CUDA device when one is present.
torch.manual_seed(42)
if _cuda_ok:
    torch.cuda.manual_seed_all(42)

# Path Configuration
# Adjust these if running in a different environment
DATA_DIR = Path("../data/pre1986_training_streams_v1_FINAL")
TOKENIZER_DIR = Path("tokenizer_out")
CHECKPOINT_DIR = Path("checkpoints_out")
CONFIG_DIR = Path("configs_out")

# Output folders are created next to the notebook; existing ones are reused.
for _out_dir in (TOKENIZER_DIR, CHECKPOINT_DIR, CONFIG_DIR):
    _out_dir.mkdir(exist_ok=True)

print("\u2713 Environment ready.")

---
# 2. Data Exploration

Analyze the training streams to understand data scale and content.

In [None]:
def get_file_stats(filepath):
    """Return size, word count and ``<EOS>`` count for a UTF-8 text file.

    The file is scanned line by line instead of being read in one piece, so
    memory use stays flat for large files. Counts are identical to those of
    a whole-file scan: a line break is whitespace (so no word spans two
    lines) and the ``<EOS>`` marker contains no line break.

    Args:
        filepath: ``pathlib.Path`` or path string of the file to inspect.

    Returns:
        ``None`` when the file does not exist, otherwise a dict with
        ``size_mb`` (on-disk size in MiB), ``words`` (whitespace-separated
        tokens) and ``eos_counts`` (occurrences of the literal ``<EOS>``).
    """
    filepath = Path(filepath)  # accept plain strings as well as Path objects
    if not filepath.exists():
        return None

    words = 0
    eos_counts = 0
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            words += len(line.split())
            eos_counts += line.count('<EOS>')

    return {
        'size_mb': filepath.stat().st_size / (1024 * 1024),
        'words': words,
        'eos_counts': eos_counts
    }

# Streams expected inside DATA_DIR: the base corpus plus three fine-tune sets.
files = ['base_stream.txt', 'finetune_control.txt',
         'finetune_nuclear.txt', 'finetune_reliability.txt']

print(f"Scanning {DATA_DIR}...")
stats = {}
for stream_name in files:
    stream_stats = get_file_stats(DATA_DIR / stream_name)
    if not stream_stats:
        # get_file_stats returns None for a missing file.
        print(f"Warning: {stream_name} not found in {DATA_DIR}")
        continue
    stats[stream_name] = stream_stats
    print(f"{stream_name}: {stream_stats['size_mb']:.2f} MB, {stream_stats['words']:,} words, {stream_stats['eos_counts']} docs")

# Simple visualization
# Draw one bar per stream that was actually found; skip the plot otherwise.
if stats:
    bar_labels = list(stats)
    bar_heights = [entry['size_mb'] for entry in stats.values()]
    plt.figure(figsize=(10, 4))
    plt.bar(bar_labels, bar_heights, color=['#2ecc71', '#3498db', '#e74c3c', '#9b59b6'])
    plt.title('Dataset Size Distribution')
    plt.ylabel('Size (MB)')
    plt.show()

---
# 3. Tokenizer Training

Train a BPE tokenizer on the base stream. We aim for a vocab size of 32,000.

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# 1. Configure Trainer
# Target vocabulary of 32,000 entries; merges seen fewer than twice are dropped.
trainer = BpeTrainer(
    special_tokens=["<PAD>", "<UNK>", "<EOS>", "<BOS>"],
    min_frequency=2,
    vocab_size=32000,
)

# 2. Initialize
# BPE model with an explicit unknown token and whitespace pre-tokenization.
tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
tokenizer.pre_tokenizer = Whitespace()

# 3. Train on base_stream
base_stream_path = DATA_DIR / "base_stream.txt"
if not base_stream_path.exists():
    print("\u274c Base stream not found, cannot train tokenizer.")
else:
    print("Training tokenizer on base_stream... (this may take a moment)")
    tokenizer.train([str(base_stream_path)], trainer)

    # Save
    tokenizer_path = TOKENIZER_DIR / "tokenizer.json"
    tokenizer.save(str(tokenizer_path))
    print(f"\u2713 Tokenizer saved to {tokenizer_path} (Vocab: {tokenizer.get_vocab_size()})")

---
# 4. Model Architecture

Implementation of the Transformer decoder with RoPE and Block-Local Attention.

In [None]:
@dataclass
class ModelConfig:
    """Hyper-parameters of the decoder-only transformer.

    Invalid head geometry is rejected at construction time. Without the
    check, a ``d_model`` that is not a multiple of ``n_heads`` (or an odd
    per-head width, which rotary embeddings cannot pair up) only fails later
    as a shape error inside the attention layers.

    Raises:
        ValueError: if ``n_heads`` is not positive, ``d_model`` is not
            divisible by ``n_heads``, the per-head width is odd, or
            ``block_size`` is not positive while ``use_block_local`` is set.
    """
    vocab_size: int = 32000        # rows of the token embedding / output projection
    d_model: int = 1024            # residual-stream width
    n_layers: int = 24             # number of transformer blocks
    n_heads: int = 16              # attention heads per block
    d_ff: int = 4096               # hidden width of the feed-forward sub-layer
    max_seq_len: int = 2048        # initial RoPE cache length and generation cap
    block_size: int = 512          # block length used by block-local attention
    use_block_local: bool = False  # True selects block-local over dense attention
    dropout: float = 0.1           # dropout probability

    def __post_init__(self):
        if self.n_heads <= 0:
            raise ValueError(f"n_heads must be positive, got {self.n_heads}")
        if self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
            )
        if self.head_dim % 2 != 0:
            # RoPE rotates channel pairs, so each head needs an even width.
            raise ValueError(f"head_dim ({self.head_dim}) must be even for rotary embeddings")
        if self.use_block_local and self.block_size <= 0:
            raise ValueError(f"block_size must be positive, got {self.block_size}")

    @property
    def head_dim(self) -> int:
        """Width of a single attention head (``d_model // n_heads``)."""
        return self.d_model // self.n_heads

# Save Config Helper
def save_config(config: ModelConfig, path: Path):
    """Write the fields of ``config`` to ``path`` as JSON indented by two spaces."""
    with open(path, 'w') as handle:
        handle.write(json.dumps(vars(config), indent=2))

In [None]:
class RotaryEmbedding(nn.Module):
    """Rotary Position Embeddings (RoPE).

    Keeps cos/sin lookup tables of shape ``(cached_len, dim)`` as buffers and
    grows them on demand when a longer sequence is requested.

    Args:
        dim: per-head channel count; frequencies are generated for
            ``dim / 2`` channel pairs and duplicated to span ``dim``.
        max_seq_len: number of positions cached up front.
        base: base of the geometric frequency progression.
    """
    def __init__(self, dim: int, max_seq_len: int = 2048, base: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.base = base
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)
        self._build_cache(max_seq_len)

    def _build_cache(self, seq_len: int):
        """(Re)build the cos/sin tables for positions ``0 .. seq_len - 1``."""
        # Create the positions on the device that holds `inv_freq`. Once the
        # module has been moved to an accelerator, a CPU-side arange would
        # make torch.outer fail with a device mismatch when the cache grows.
        positions = torch.arange(seq_len, device=self.inv_freq.device).float()
        freqs = torch.outer(positions, self.inv_freq)
        emb = torch.cat([freqs, freqs], dim=-1)
        # NOTE: these buffers are persistent, so their length becomes part of
        # the state_dict; that is kept unchanged for checkpoint compatibility.
        self.register_buffer('cos_cached', emb.cos())
        self.register_buffer('sin_cached', emb.sin())

    def forward(self, x: torch.Tensor, seq_len: int):
        """Return ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Args:
            x: any tensor; only its device is used.
            seq_len: number of positions required.

        Returns:
            Two tensors of shape ``(seq_len, dim)`` on ``x.device``.
        """
        if seq_len > self.max_seq_len:
            self._build_cache(seq_len)
            self.max_seq_len = seq_len
        return (
            self.cos_cached[:seq_len].to(x.device),
            self.sin_cached[:seq_len].to(x.device)
        )

def rotate_half(x: torch.Tensor):
    """Swap the two halves of the last dimension, negating the second half.

    For a last dimension ``[a, b]`` (split at ``size // 2``) the result is
    ``[-b, a]``.
    """
    split_at = x.shape[-1] // 2
    front = x[..., :split_at]
    back = x[..., split_at:]
    return torch.cat((back.neg(), front), dim=-1)

def apply_rotary_emb(q, k, cos, sin):
    """Apply the rotary transform to ``q`` and ``k`` and return both.

    ``cos`` and ``sin`` receive a new axis at position 0 and another at
    position 2 so they broadcast against the operands; each operand ``t``
    becomes ``t * cos + rotate_half(t) * sin``.
    """
    cos_b = cos[None, :, None]
    sin_b = sin[None, :, None]

    def _rotate(t):
        return (t * cos_b) + (rotate_half(t) * sin_b)

    return _rotate(q), _rotate(k)

In [None]:
class MultiHeadAttention(nn.Module):
    """Dense multi-head self-attention with rotary position embeddings.

    When no mask is supplied, a causal mask is used so each position attends
    only to itself and earlier positions.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        width = config.d_model
        self.n_heads = config.n_heads
        self.head_dim = config.head_dim
        self.d_model = width

        def _square_proj():
            # Bias-free d_model -> d_model projection.
            return nn.Linear(width, width, bias=False)

        self.q_proj = _square_proj()
        self.k_proj = _square_proj()
        self.v_proj = _square_proj()
        self.out_proj = _square_proj()
        self.rotary = RotaryEmbedding(self.head_dim, config.max_seq_len)
        self.dropout = nn.Dropout(config.dropout)
        self.scale = 1.0 / math.sqrt(self.head_dim)

    def forward(self, x, mask=None):
        """Attend over ``x`` of shape ``(batch, seq, d_model)``.

        Args:
            x: input activations.
            mask: optional boolean tensor broadcastable to the score matrix;
                ``True`` marks positions that must NOT be attended to.

        Returns:
            Tensor with the same shape as ``x``.
        """
        n_batch, n_tokens, _ = x.shape
        per_head = (n_batch, n_tokens, self.n_heads, self.head_dim)

        queries = self.q_proj(x).view(*per_head)
        keys = self.k_proj(x).view(*per_head)
        values = self.v_proj(x).view(*per_head)

        # Rotary embedding is applied while the layout is (batch, seq, heads, dim).
        cos, sin = self.rotary(queries, n_tokens)
        queries, keys = apply_rotary_emb(queries, keys, cos, sin)

        # Move heads ahead of the sequence axis: (batch, heads, seq, dim).
        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) * self.scale

        if mask is None:
            # Strict upper triangle == future positions.
            mask = torch.ones(n_tokens, n_tokens, device=x.device).triu(diagonal=1).bool()
        scores = scores.masked_fill(mask, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        context = torch.matmul(weights, values)
        context = context.transpose(1, 2).contiguous().view(n_batch, n_tokens, self.d_model)
        return self.out_proj(context)

In [None]:
class BlockLocalAttention(nn.Module):
    """Sparse attention: attends to current + previous block.

    The sequence is right-padded to a multiple of ``block_size`` and split
    into blocks. Each query sees every key of the preceding block plus the
    keys of its own block up to and including its own position.

    The first block has no predecessor; its "previous block" is zero
    padding. Those slots are masked out explicitly: an all-zero key yields a
    logit of 0 rather than ``-inf``, so it would otherwise take a share of
    the softmax mass for every query in the first block.
    """
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.n_heads = config.n_heads
        self.head_dim = config.head_dim
        self.d_model = config.d_model
        self.block_size = config.block_size
        self.q_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.k_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.v_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.out_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.rotary = RotaryEmbedding(self.head_dim, config.max_seq_len)
        self.dropout = nn.Dropout(config.dropout)
        self.scale = 1.0 / math.sqrt(self.head_dim)

    @staticmethod
    def _local_causal_mask(n_blocks, block_size, device):
        """Build the boolean mask of shape ``(n_blocks, 1, block_size, 2 * block_size)``.

        ``True`` marks a key slot that must not be attended to. Key columns
        ``[0, block_size)`` belong to the previous block and columns
        ``[block_size, 2 * block_size)`` to the current one.
        """
        query_pos = torch.arange(block_size, device=device).unsqueeze(1)
        key_pos = torch.arange(2 * block_size, device=device).unsqueeze(0)
        # Query i may see the whole previous block and current-block keys 0..i.
        causal = key_pos > (query_pos + block_size)
        mask = causal.expand(n_blocks, 1, block_size, 2 * block_size).clone()
        # Block 0 has no real predecessor; hide its zero-padded key slots.
        # The slice keeps this a no-op when there are no blocks at all.
        mask[:1, :, :, :block_size] = True
        return mask

    def forward(self, x):
        """Attend over ``x`` of shape ``(batch, seq, d_model)``; returns the same shape."""
        batch_size, seq_len, _ = x.shape
        block_size = self.block_size

        # Right-pad the sequence axis so it splits into whole blocks.
        pad_len = (block_size - seq_len % block_size) % block_size
        if pad_len > 0:
            x = F.pad(x, (0, 0, 0, pad_len))
        padded_len = x.shape[1]
        n_blocks = padded_len // block_size

        q = self.q_proj(x).view(batch_size, padded_len, self.n_heads, self.head_dim)
        k = self.k_proj(x).view(batch_size, padded_len, self.n_heads, self.head_dim)
        v = self.v_proj(x).view(batch_size, padded_len, self.n_heads, self.head_dim)

        # Rotary positions are applied before blocking, i.e. over the whole
        # padded sequence rather than per block.
        cos, sin = self.rotary(q, padded_len)
        q, k = apply_rotary_emb(q, k, cos, sin)

        q = q.view(batch_size, n_blocks, block_size, self.n_heads, self.head_dim)
        k = k.view(batch_size, n_blocks, block_size, self.n_heads, self.head_dim)
        v = v.view(batch_size, n_blocks, block_size, self.n_heads, self.head_dim)

        # Shift blocks right by one along the block axis (zeros enter at
        # block 0) so entry i holds the keys/values of block i - 1.
        k_prev = F.pad(k, (0, 0, 0, 0, 0, 0, 1, 0))[:, :-1]
        v_prev = F.pad(v, (0, 0, 0, 0, 0, 0, 1, 0))[:, :-1]
        k_local = torch.cat([k_prev, k], dim=2)
        v_local = torch.cat([v_prev, v], dim=2)

        # (batch, blocks, heads, positions, head_dim)
        q = q.transpose(2, 3)
        k_local = k_local.transpose(2, 3)
        v_local = v_local.transpose(2, 3)

        attn = torch.matmul(q, k_local.transpose(-2, -1)) * self.scale
        mask = self._local_causal_mask(n_blocks, block_size, x.device)

        # Every query keeps at least its own position, so no row is fully masked.
        attn = attn.masked_fill(mask, float('-inf'))
        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, v_local)
        out = out.transpose(2, 3).contiguous().view(batch_size, padded_len, self.d_model)
        if pad_len > 0:
            out = out[:, :seq_len]  # drop the padded tail again
        return self.out_proj(out)

In [None]:
class TransformerBlock(nn.Module):
    """Pre-norm decoder block: attention and feed-forward, each in a residual branch."""

    def __init__(self, config: ModelConfig):
        super().__init__()
        width = config.d_model
        self.attn_norm = nn.LayerNorm(width)
        self.ffn_norm = nn.LayerNorm(width)

        # The config flag picks the sparse or the dense attention variant.
        attention_cls = BlockLocalAttention if config.use_block_local else MultiHeadAttention
        self.attention = attention_cls(config)

        ffn_layers = [
            nn.Linear(width, config.d_ff, bias=False),
            nn.GELU(),
            nn.Dropout(config.dropout),
            nn.Linear(config.d_ff, width, bias=False),
        ]
        self.ffn = nn.Sequential(*ffn_layers)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Run both residual branches; each normalizes its input first."""
        attn_out = self.attention(self.attn_norm(x))
        x = x + self.dropout(attn_out)
        ffn_out = self.ffn(self.ffn_norm(x))
        return x + self.dropout(ffn_out)

class SLM(nn.Module):
    """Decoder-only language model: embedding, transformer blocks, tied output head."""

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config
        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
        self.blocks = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
        self.final_norm = nn.LayerNorm(config.d_model)
        self.output = nn.Linear(config.d_model, config.vocab_size, bias=False)
        # Weight tying: the output projection reuses the embedding matrix.
        self.output.weight = self.token_emb.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize Linear and Embedding weights from N(0, 0.02)."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, tokens):
        """Map token ids ``(batch, seq)`` to logits ``(batch, seq, vocab_size)``."""
        x = self.token_emb(tokens)
        for block in self.blocks:
            x = block(x)
        x = self.final_norm(x)
        return self.output(x)

    @torch.no_grad()
    def generate(self, tokens, max_new_tokens=100, temperature=1.0):
        """Autoregressively extend ``tokens`` by sampling.

        Args:
            tokens: ``(batch, seq)`` tensor of prompt token ids.
            max_new_tokens: upper bound on the number of appended tokens.
            temperature: softmax temperature; ``0`` selects greedy (argmax)
                decoding instead of dividing by zero.

        Returns:
            The prompt with the generated ids appended. Generation stops
            early once the sequence reaches ``config.max_seq_len``.
        """
        # Sample with dropout disabled, then put the module back in the mode
        # it was in, so a mid-training call does not silently leave the model
        # in eval mode.
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                if tokens.size(1) >= self.config.max_seq_len:
                    break
                logits = self(tokens)[:, -1, :]
                if temperature == 0:
                    next_token = logits.argmax(dim=-1, keepdim=True)
                else:
                    probs = F.softmax(logits / temperature, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                tokens = torch.cat([tokens, next_token], dim=1)
        finally:
            self.train(was_training)
        return tokens

---
# 5. Training Pipeline

Dataset streaming, curriculum learning setup, and training loop.

In [None]:
class TextDataset(Dataset):
    """Fixed-length next-token-prediction chunks over concatenated text files.

    Every existing file is read whole, tokenized and concatenated into one
    1-D tensor of ids; missing files are skipped silently. Item ``i`` is the
    pair ``(tokens[s : s + seq_len], tokens[s + 1 : s + seq_len + 1])`` with
    ``s = i * seq_len``. Only full-length chunks are exposed.

    Args:
        file_paths: files to load, in order.
        tokenizer: object whose ``encode(text).ids`` yields a list of ints.
        seq_len: number of tokens per input chunk.
    """
    def __init__(self, file_paths: List[Path], tokenizer: "Tokenizer", seq_len: int):
        self.seq_len = seq_len
        pieces = []
        for path in file_paths:
            if not path.exists():
                continue
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
            pieces.append(torch.tensor(tokenizer.encode(text).ids, dtype=torch.long))

        # Always keep a tensor so `tokens` has one type whether or not any
        # file was found.
        self.tokens = torch.cat(pieces) if pieces else torch.empty(0, dtype=torch.long)

        # A chunk needs seq_len inputs plus one extra token for the shifted
        # target. With fewer than two tokens there is nothing to predict;
        # the explicit branch also stops (0 - 1) // seq_len from producing a
        # negative length for an existing but empty file.
        n_tokens = self.tokens.numel()
        self.n_chunks = (n_tokens - 1) // seq_len if n_tokens > 1 else 0

    def __len__(self):
        return self.n_chunks

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` for chunk ``idx``; targets are inputs shifted by one.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self.n_chunks
        if not 0 <= idx < self.n_chunks:
            # Without this, plain `for item in dataset` would never stop.
            raise IndexError(f"chunk index out of range for dataset of length {self.n_chunks}")
        start = idx * self.seq_len
        end = start + self.seq_len + 1
        chunk = self.tokens[start:end]
        return chunk[:-1], chunk[1:]

@dataclass
class TrainingConfig:
    """Optimization settings consumed by the training loop and LR schedule."""
    # Samples per micro-batch.
    batch_size: int = 8
    # Micro-batches accumulated before each optimizer step.
    grad_accum: int = 4
    # Peak learning rate, reached at the end of warmup.
    lr: float = 3e-4
    # Total optimizer steps. Reduced for demo purposes, original 50000.
    max_steps: int = 1000
    # Steps over which the learning rate ramps up linearly.
    warmup_steps: int = 100
    # Tokens per training sequence.
    seq_len: int = 2048
    
def get_lr(step, config):
    """Learning rate for ``step``: linear warmup, then cosine decay to 10% of peak.

    Args:
        step: zero-based optimizer step.
        config: object exposing ``lr`` (peak rate), ``warmup_steps`` and
            ``max_steps``.

    Returns:
        ``lr * step / warmup_steps`` during warmup. Afterwards a cosine
        interpolation from ``lr`` down to ``0.1 * lr``, held at ``0.1 * lr``
        once ``max_steps`` is reached. If ``max_steps <= warmup_steps``
        there is no decay phase, so any post-warmup step gets ``0.1 * lr``
        instead of a division by zero or a negative decay ratio.
    """
    if step < config.warmup_steps:
        return config.lr * step / config.warmup_steps

    decay_span = config.max_steps - config.warmup_steps
    if decay_span <= 0:
        decay_ratio = 1.0
    else:
        decay_ratio = min((step - config.warmup_steps) / decay_span, 1.0)

    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return config.lr * 0.1 + coeff * config.lr * 0.9

In [None]:
def _cycle_batches(loader):
    """Yield batches forever, starting a fresh pass over ``loader`` after each one."""
    while True:
        yield from loader


def train_loop(model, dataset, config, phase_name):
    """Train ``model`` for ``config.max_steps`` optimizer steps and save its weights.

    Each step accumulates gradients over ``config.grad_accum`` micro-batches,
    clips the gradient norm to 1.0 and applies AdamW with the learning rate
    given by ``get_lr``. The final state dict is written to
    ``CHECKPOINT_DIR / f"{phase_name}_final.pt"``.

    Args:
        model: module mapping token ids to logits.
        dataset: map-style dataset of ``(inputs, targets)`` pairs.
        config: object exposing ``batch_size``, ``grad_accum``, ``lr``,
            ``max_steps`` and the fields ``get_lr`` reads.
        phase_name: label used in log lines and the checkpoint file name.

    Returns:
        The trained ``model`` (same object).

    Raises:
        ValueError: if ``dataset`` is empty. Previously this surfaced as a
            bare ``StopIteration`` from the data iterator.
    """
    if len(dataset) == 0:
        raise ValueError(f"Cannot run {phase_name}: dataset is empty")

    loader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=0.1)
    model.train()

    step = 0
    batches = _cycle_batches(loader)
    print(f"\n=== Starting {phase_name} ===")

    while step < config.max_steps:
        # The schedule is applied manually, so write it into every group.
        lr = get_lr(step, config)
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        optimizer.zero_grad()
        loss_accum = 0
        for _ in range(config.grad_accum):
            x, y = next(batches)
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
            # Scale so the accumulated gradient is the mean over micro-batches.
            loss = loss / config.grad_accum
            loss.backward()
            loss_accum += loss.item()

        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        step += 1

        if step % 50 == 0:  # Log every 50 steps
            print(f"Step {step} | Loss: {loss_accum:.4f} | LR: {lr:.2e}")

    # Save checkpoint
    torch.save(model.state_dict(), CHECKPOINT_DIR / f"{phase_name}_final.pt")
    return model

---
# 6. Execution & Evaluation

Run the full pipeline (Demonstration Mode).

In [None]:
# 1. Initialize Model for Phase A
config = ModelConfig()
model = SLM(config)
model = model.to(device)
n_params = sum(param.numel() for param in model.parameters())
print(f"Model Parameters: {n_params/1e6:.1f}M")

# 2. Train Phase A (Short Demo)
# Only 50 optimizer steps: enough to exercise the loop, not to converge.
train_cfg = TrainingConfig(max_steps=50)
ds_a = TextDataset([DATA_DIR / 'base_stream.txt'], tokenizer, 2048)

# Training is skipped when the base stream produced no chunks.
if len(ds_a) > 0:
    model = train_loop(model, ds_a, train_cfg, "PhaseA")

# 3. Evaluation
eval_prompts = [
    "The stability of a positive feedback loop depends on",
    "In a nuclear reactor, void coefficient refers to"
]

print("\n=== Evaluation ===")
model.eval()
for prompt in eval_prompts:
    # Wrap the ids in a list to form a batch of one.
    prompt_ids = tokenizer.encode(prompt).ids
    tokens = torch.tensor([prompt_ids], device=device)
    out = model.generate(tokens, max_new_tokens=50)
    print(f"Prompt: {prompt}")
    print(f"Generated: {tokenizer.decode(out[0].tolist())}\n")