# a-lm colab training

This notebook runs a full from-scratch pretraining run on Colab using the larger `nano` config and the Colab corpus preset.


## Optional drive mount
Use this if your repo or outputs live on Google Drive.


In [None]:
# Drive helper from the google.colab package.
from google.colab import drive

# Attach Google Drive at /content/drive; the repo-location cell checks
# /content/drive/MyDrive/a-lm for an existing checkout.
drive.mount("/content/drive")

## Locate or clone the repo
If you already uploaded the repo, this will use it. Otherwise it clones into `/content/a-lm`.


In [None]:
import os

repo_candidates = ["/content/a-lm", "/content/drive/MyDrive/a-lm"]
repo_path = None
for candidate in repo_candidates:
    if os.path.isdir(candidate):
        repo_path = candidate
        break

if repo_path is None:
    repo_path = "/content/a-lm"
    !git clone https://github.com/ammaar-alam/a-lm.git {repo_path}

%cd {repo_path}

In [None]:
# Reminder of the remaining setup order, printed as a single line.
next_steps = (
    "Next: run the 'Install pinned dependencies' cell below.",
    " Restart only if Colab warns about imports.",
    " Then continue to 'Hugging Face login' and 'Start pretraining'.",
)
print("".join(next_steps))

## Install pinned dependencies
These versions avoid Colab crashes and keep `transformers` compatibility.
This cell intentionally does **not** downgrade `numpy` (downgrades force restarts and conflict with Colab preinstalls).


In [None]:
# Upgrade the Hugging Face / data-loading packages to the version ranges below in a
# single pip invocation. numpy is deliberately not listed here.
# NOTE(review): confirm on a fresh runtime that pip can satisfy "datasets<3" together
# with "fsspec>=2025.3.0" in one resolve (datasets may cap fsspec) - TODO verify.
%pip install -U "huggingface_hub<1.0" "datasets>=2.19,<3" "pyarrow>=15.0.2,<19" \
  "fsspec>=2025.3.0" "gcsfs>=2025.3.0" "tokenizers>=0.22.0,<=0.23.0"
# Install this repo in editable mode; --no-deps tells pip not to install the
# project's own declared dependencies.
%pip install -e . --no-deps

## Optional: enable verbose training logs
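By default the progress bar updates live. If you want per-step log lines, run this cell. It writes `configs/train_colab_verbose.yaml`, which the "Start pretraining" cell uses automatically whenever that file exists; delete the file to go back to the default `configs/train_colab.yaml`.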


In [None]:
from pathlib import Path

# Destination of the optional verbose training config; the pretraining cell
# switches to this file whenever it exists.
verbose_cfg_path = Path("configs/train_colab_verbose.yaml")

# Full YAML payload. It sets log_interval to 1 and turns rich_progress off,
# alongside the optimizer, scheduler and training settings.
verbose_cfg_text = """
optim:
  name: adamw
  lr: 3e-4
  betas: [0.9, 0.95]
  weight_decay: 0.1
  eps: 1e-8

scheduler:
  name: cosine
  warmup_steps: 1000
  max_steps: 20000

training:
  micro_batch_size: 4
  gradient_accumulation: 8
  max_steps: 20000
  checkpoint_interval: 500
  gradient_clip_norm: 0.5
  mixed_precision: fp16
  grad_checkpointing: false
  seed: 1337
  dataloader_workers: 2

logging:
  log_interval: 1
  rich_progress: false
"""

verbose_cfg_path.write_text(verbose_cfg_text)
print("Wrote configs/train_colab_verbose.yaml")

## Hugging Face login
Paste your token when prompted.


In [None]:
from huggingface_hub import login

# Interactive Hugging Face login: paste your access token when prompted.
login()

## GPU check
Make sure CUDA is available before training.


In [None]:
import torch

!nvidia-smi
print(torch.__version__, torch.cuda.is_available(), torch.version.cuda)

## Start pretraining
Stores the run id in `LAST_RUN.txt` so later cells can resume or chat.


In [None]:
import subprocess
import time
from pathlib import Path

# Timestamp-based run id, saved to LAST_RUN.txt before training starts so the
# chat / RLVR cells can find this run later.
run_id = time.strftime("%Y%m%d-%H%M%S")
Path("LAST_RUN.txt").write_text(run_id)
print("run:", run_id)

# Use the verbose config when the optional cell has written it; otherwise the default.
use_verbose = Path("configs/train_colab_verbose.yaml").exists()
train_cfg = "configs/train_colab_verbose.yaml" if use_verbose else "configs/train_colab.yaml"
if use_verbose:
    print("using verbose logging config")

# Run the make target as an argument list (no shell) and fail the cell on a
# non-zero exit code.
cmd = ["make", "colab-pretrain", "RUN=" + run_id, "TRAIN_CFG=" + train_cfg]
print("command:", " ".join(cmd))
result = subprocess.run(cmd)
exit_code = result.returncode
if exit_code != 0:
    raise RuntimeError(f"make failed with exit code {exit_code}")

## Chat with the latest checkpoint


In [None]:
from pathlib import Path

run_id = Path("LAST_RUN.txt").read_text().strip()
print("using run", run_id)
!make chat RUN={run_id}

## RLVR post-training


In [None]:
from pathlib import Path

run_id = Path("LAST_RUN.txt").read_text().strip()
print("using run", run_id)

!make rlvr-data
!make rlvr-train RUN={run_id}
!make chat RUN={run_id} CHECKPOINT=runs/{run_id}/rlvr/ckpt-last.pt