# Summation Transformer Colab Runner

This notebook trains the **37-trainable-parameter** spectral variant in `train_strictish_h2_d6_singlecarry_from_scratch.py` and saves artifacts to `/content/outputs` (Colab-safe path).

In [None]:
import os

REPO_URL = "https://github.com/Deferf/Summation-Transformer.git"
REPO_DIR = "/content/Summation-Transformer"
OUT_DIR = "/content/outputs"

if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL}

%cd /content/Summation-Transformer
!pip -q install -r requirements.txt
!mkdir -p {OUT_DIR}

In [None]:
import os
import subprocess
import sys

import torch

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Artifacts written by the training script; `ckpt_path` is reused by the
# inference cell further down.
plot_path = "/content/outputs/strictish_h2_d6_37p_colab_metrics.png"
ckpt_path = "/content/outputs/strictish_h2_d6_37p_colab.pt"

# Make sure the artifact directories exist even if the setup cell's mkdir
# was skipped or the paths above were edited.
for artifact_path in (plot_path, ckpt_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

cmd = [
    # Use the kernel's own interpreter so the subprocess sees the packages
    # installed from requirements.txt; a bare "python" is resolved via PATH
    # and may be a different environment.
    sys.executable,
    "-u",
    "train_strictish_h2_d6_singlecarry_from_scratch.py",
    "--device", device,
    "--steps", "4000",
    "--batch-size", "256",
    "--lr", "0.02",
    "--mlp-hidden", "1",
    "--mlp-bias-inner",
    "--spectral-qkv",
    "--spectral-o",
    "--log-every", "100",
    "--eval-samples", "256",
    "--final-eval-samples", "2000",
    # NOTE(review): 1.1 is above any attainable accuracy, presumably to
    # disable early stopping so all steps run -- confirm against the script.
    "--target-acc", "1.1",
    "--checkpoint", ckpt_path,
    "--plot-path", plot_path,
]
print("Running:", " ".join(cmd))
env = dict(os.environ)
# Belt and braces with "-u": keep the child's output unbuffered so the log
# lines below appear as they are produced.
env["PYTHONUNBUFFERED"] = "1"

# Merge stderr into stdout and stream line by line (bufsize=1 with text=True
# is line-buffered) so training progress is visible live in the cell output.
with subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
    env=env,
) as proc:
    assert proc.stdout is not None
    for line in proc.stdout:
        print(line, end="", flush=True)
    return_code = proc.wait()

# Surface a failed run as an exception so "Run all" stops here.
if return_code != 0:
    raise RuntimeError(f"Training failed with exit code {return_code}")

In [None]:
import os

from IPython.display import Image, display

default_plot_path = "/content/outputs/strictish_h2_d6_37p_colab_metrics.png"
# Prefer the path set by the training cell (if it ran in this kernel) so an
# edited `plot_path` is honoured; otherwise fall back to the default location.
plot_path = globals().get("plot_path", default_plot_path)
if not os.path.exists(plot_path):
    raise FileNotFoundError(
        f"Metrics plot not found at {plot_path}. Run the training cell first or set plot_path."
    )

# Render the saved training-metrics figure inline.
display(Image(filename=plot_path))

## AdderBoard Verification

Runs the official verifier against `adderboard_submission.py`, streams output live, and saves the full verifier log to `/content/outputs/adderboard_verify_colab.log`.

In [None]:
import os
import subprocess

VERIFIER_REPO = "/content/AdderBoard"
VERIFY_LOG = "/content/outputs/adderboard_verify_colab.log"
SUBMISSION = "/content/Summation-Transformer/adderboard_submission.py"
NUM_TESTS = 10000  # lower for a faster smoke run
SEED = 2025

if not os.path.exists(VERIFIER_REPO):
    !git clone https://github.com/anadim/AdderBoard.git {VERIFIER_REPO}

cmd = [
    "python",
    f"{VERIFIER_REPO}/verify.py",
    SUBMISSION,
    "--seed", str(SEED),
    "--num-tests", str(NUM_TESTS),
]
print("Running:", " ".join(cmd))

env = dict(os.environ)
env["PYTHONUNBUFFERED"] = "1"

with open(VERIFY_LOG, "w", encoding="utf-8") as logf:
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        env=env,
    ) as proc:
        assert proc.stdout is not None
        for line in proc.stdout:
            print(line, end="", flush=True)
            logf.write(line)
        return_code = proc.wait()

if return_code != 0:
    raise RuntimeError(f"Verifier failed with exit code {return_code}")

print(f"\nSaved verifier log: {VERIFY_LOG}")
print("\n=== Verifier Tail ===")
with open(VERIFY_LOG, "r", encoding="utf-8") as f:
    tail = f.readlines()[-40:]
print("".join(tail))

## Inference Playground (with logits)

Enter any two non-negative integers of at most 10 digits each. The cell decodes the sum autoregressively and prints the top-k logits at each decoding step.

In [None]:
import os
import torch

from train_strictish_h2_d6_singlecarry_from_scratch import ScratchModel, Tokenizer

# Checkpoint location: reuse `ckpt_path` from the training cell when it is
# defined in this kernel, otherwise fall back to the default output file.
default_ckpt = "/content/outputs/strictish_h2_d6_37p_colab.pt"
ckpt_path = globals().get("ckpt_path", default_ckpt)
if not os.path.exists(ckpt_path):
    raise FileNotFoundError(
        f"Checkpoint not found at {ckpt_path}. Run the training cell first or set ckpt_path."
    )

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tok = Tokenizer()

# NOTE(review): torch.load may unpickle arbitrary objects depending on the
# torch version's defaults -- only load checkpoints you trust.
state = torch.load(ckpt_path, map_location=device)
# Hyperparameters stored alongside the weights; empty when the key is absent.
cfg = state.get("args", {})

# Values used for any hyperparameter the checkpoint does not record.
fallback_hparams = {
    "init_std": 0.08,
    "mlp_hidden": 1,
    "attn_rank": 0,
    "factorize_o": False,
    "mlp_bias_inner": True,
    "mlp_bias_outer": False,
    "spectral_qkv": True,
    "spectral_o": True,
}
hparams = {key: cfg.get(key, fallback) for key, fallback in fallback_hparams.items()}

# Rebuild the architecture, restore the trained weights, and switch to
# evaluation mode for inference.
model = ScratchModel(tok, **hparams).to(device)
model.load_state_dict(state["model_state_dict"])
model.eval()

def tok_to_text(t):
    """Render token id ``t`` as a short human-readable label.

    Special tokens map to ``PAD``/``BOS``/``C0``/``EOS``. Ids in the 100-wide
    range starting at ``tok.X_BASE`` become ``X[d1,d2]`` (tens and ones digit
    of the offset). Ids in the 10-wide ranges starting at ``tok.Y0_BASE`` and
    ``tok.Y1_BASE`` become ``Y[d,0]`` and ``Y[d,1]``. Anything else falls back
    to ``tok[<id>]``. Reads the notebook-level ``tok`` tokenizer.
    """
    # Special tokens are matched first, in this fixed order.
    specials = (
        (tok.PAD, "PAD"),
        (tok.BOS, "BOS"),
        (tok.C0, "C0"),
        (tok.EOS, "EOS"),
    )
    for special_id, label in specials:
        if t == special_id:
            return label

    # X tokens: the offset from X_BASE encodes two decimal digits.
    x_offset = t - tok.X_BASE
    if 0 <= x_offset < 100:
        tens, ones = divmod(x_offset, 10)
        return f"X[{tens},{ones}]"

    # Y tokens: one digit plus a 0/1 flag selected by which base range matches
    # (presumably a carry bit -- verify against the Tokenizer definition).
    for base, flag in ((tok.Y0_BASE, 0), (tok.Y1_BASE, 1)):
        digit = t - base
        if 0 <= digit < 10:
            return f"Y[{digit},{flag}]"

    return f"tok[{t}]"

def run_addition_with_logits(a, b, topk=8):
    """Greedily decode ``a + b`` with the loaded model, printing top-k logits per step.

    Uses the notebook-level ``tok``, ``model`` and ``device``.

    Args:
        a: First addend, an integer in ``[0, 10**10)``.
        b: Second addend, an integer in ``[0, 10**10)``.
        topk: How many of the highest-scoring candidate tokens to print at
            each decoding step (capped at the number of logits). Must be at
            least 1, because the top-ranked candidate is the token appended
            to the sequence.

    Returns:
        A tuple ``(pred, exp, seq)``: the value returned by
        ``tok.decode_sum`` for the generated tokens, the true sum formatted
        as an 11-digit zero-padded string, and the full token id list
        (prefix plus generated tokens).

    Raises:
        ValueError: If ``a`` or ``b`` is out of range, or ``topk`` is below 1.
    """
    if not (0 <= a < 10**10 and 0 <= b < 10**10):
        raise ValueError("Inputs must be in [0, 10^10).")
    # Reject topk <= 0 up front: otherwise torch.topk yields no candidates
    # (or errors) and the greedy pick below fails mid-decode.
    if topk < 1:
        raise ValueError("topk must be at least 1.")

    seq = tok.encode_problem(a, b)
    print("Initial prefix tokens:", [tok_to_text(t) for t in seq])

    step = 0
    # Generate one token per iteration until EOS or the model's length limit.
    while len(seq) < model.max_len:
        x = torch.tensor([seq], dtype=torch.long, device=device)
        with torch.no_grad():
            # Logits for the next token: first batch row, last position.
            # (assumes the model returns (batch, seq, vocab) -- TODO confirm)
            step_logits = model(x)[0, -1]

        k = min(topk, step_logits.shape[0])
        top_vals, top_ids = torch.topk(step_logits, k=k)
        print(f"\nStep {step} (predicting token {len(seq)}):")
        for rank, (tid, val) in enumerate(zip(top_ids.tolist(), top_vals.tolist()), start=1):
            print(f"  {rank:>2}. {tok_to_text(tid):>10}   logit={val:8.3f}")

        # Greedy decoding: append the highest-scoring candidate.
        next_tok = int(top_ids[0].item())
        seq.append(next_tok)
        step += 1
        if next_tok == tok.EOS:
            break

    # Decode only the generated part, i.e. everything after the problem prefix.
    pred = tok.decode_sum(seq[model.prefix_len:])
    # The comparison below assumes decode_sum returns the sum in this same
    # 11-digit zero-padded string form -- verify against the Tokenizer.
    exp = f"{a + b:011d}"
    print("\nPredicted:", pred)
    print("Expected :", exp)
    print("Correct  :", pred == exp)
    return pred, exp, seq

# Read the two addends interactively; each entry is stripped and parsed as an
# integer before the next prompt is shown.
first_text = input("Enter first number (0..9999999999): ")
a = int(first_text.strip())
second_text = input("Enter second number (0..9999999999): ")
b = int(second_text.strip())

# Decode and print the per-step logits; the return value is bound to `_` so
# the cell does not echo it.
_ = run_addition_with_logits(a=a, b=b, topk=8)


Optional: download artifacts from `/content/outputs/` using the Colab file browser, or programmatically with `google.colab.files.download(...)`.