# Lab 5 – Hyperparameter Tuning and Optimization
Run a small sweep over learning rate (LR), LoRA rank, and gradient-accumulation steps, then pick the best configuration by validation perplexity.

## Step 0. Stable installs

In [1]:
%pip install -q --force-reinstall numpy==2.0.2 pandas==2.2.2 pyarrow==17.0.0
%pip install -q datasets>=3.0.0 transformers>=4.41.0 peft>=0.11.0 accelerate>=0.29.0 sentencepiece>=0.1.99 tqdm>=4.66.0 bitsandbytes
print('If imports fail, use Runtime → Restart runtime and re-run this cell.')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.9/229.9 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.2/509.2 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m347.8/347.8 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hIf imports fail, use Runtime → Restart runtime and re-run this cell.


## Step 1. Mount Drive and verify the Lab 3 dataset

In [None]:
# Step 1: Mount Google Drive and open the dataset that Lab 3 saved
from google.colab import drive
from pathlib import Path
from datasets import load_from_disk

drive.mount('/content/drive')

# Shared root folder used by every lab; created on first use
BASE_DIR = Path("/content/drive/MyDrive/slm-labs")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Folder Lab 3 is expected to have written
DATA_DIR = BASE_DIR / "lab3_tokenized"

# Stop right away with an actionable message if that folder is missing
assert DATA_DIR.exists(), f"Dataset not found at {DATA_DIR}. Run Lab 3 first."

dataset = load_from_disk(DATA_DIR)
print("Loaded dataset from:", DATA_DIR)
print("Splits:", list(dataset.keys()))

# One summary line per split: row count and column names
for split in dataset:
    dset = dataset[split]
    print(f"{split}: {len(dset)} rows, columns = {dset.column_names}")


## Step 2. Load dataset

In [None]:
# Step 2: Load dataset (works only if Step 1 verified a proper folder)
from datasets import load_from_disk

ds = load_from_disk(str(DATA_DIR))
print(ds)

# Choose the evaluation split with explicit checks. The previous
# `ds.get("validation") or ds.get("test")` relied on object truthiness, so an
# empty split was skipped silently and an empty "test" split could end up as `val`.
val = None
for split_name in ("validation", "test"):
    candidate = ds.get(split_name)
    if candidate is not None and len(candidate) > 0:
        val = candidate
        print("Using split for validation:", split_name)
        break

if val is None:
    # Fall back to a small slice of train for quick checks.
    val = ds["train"].select(range(min(200, len(ds["train"]))))
    # These rows are also trained on, so the reported perplexity will be optimistic.
    print("WARNING: no usable validation/test split; evaluating on a slice of train that overlaps the training data.")
print("Validation samples:", len(val))

## Step 3. Load base model (4-bit if possible)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

BASE_MODEL='HuggingFaceH4/zephyr-7b-beta'

Tok=AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Make sure a pad token exists; reuse EOS when the tokenizer ships without one.
if Tok.pad_token is None: Tok.pad_token=Tok.eos_token

kw={}
if torch.cuda.is_available():
    try:
        # The try must wrap the load itself: building the kwargs dict rarely fails,
        # whereas a missing or broken quantization backend only surfaces inside
        # from_pretrained. Wrapping just the dict left the fallback unreachable.
        kw=dict(
            device_map='auto',
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            ),
            torch_dtype=torch.float16,
        )
        Model=AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kw)
    except Exception as err:
        # Best-effort fallback, but say why so the user knows quantization is off.
        print(f'4-bit load failed ({type(err).__name__}: {err}); falling back to float16 without quantization.')
        kw=dict(torch_dtype=torch.float16)
        Model=AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kw)
else:
    # No GPU: plain float32 weights.
    kw=dict(torch_dtype=torch.float32)
    Model=AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kw)

## Step 4. Attach LoRA

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# Linear projection layers that receive LoRA adapters.
TARGETS=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

def attach_lora(m, r=16, alpha=32, drop=0.05):
    """Wrap model `m` with LoRA adapters and return the resulting PEFT model.

    Args:
        m: causal-LM model to adapt.
        r: LoRA rank.
        alpha: LoRA scaling factor (`lora_alpha`).
        drop: dropout applied inside the LoRA layers.

    Returns:
        The model returned by `get_peft_model`; its trainable-parameter summary
        is printed as a side effect.
    """
    # Run the k-bit preparation only on models that were actually loaded
    # quantized. The helper is intended for quantized weights; on the unquantized
    # float16/float32 fallbacks it would upcast the remaining half-precision
    # parameters (see the PEFT docs), inflating memory for no benefit.
    if getattr(m, 'is_loaded_in_4bit', False) or getattr(m, 'is_loaded_in_8bit', False):
        m = prepare_model_for_kbit_training(m)
    cfg = LoraConfig(
        r=r,
        lora_alpha=alpha,
        lora_dropout=drop,
        target_modules=TARGETS,
        bias='none',
        task_type='CAUSAL_LM',
    )
    pm = get_peft_model(m, cfg)
    pm.print_trainable_parameters()
    return pm

## Step 5. Train briefly and evaluate

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import math, time, pandas as pd

def run(h):
    """Train one hyperparameter configuration for a few steps and evaluate it.

    Args:
        h: dict with the LoRA settings 'r', 'alpha', 'drop' and the training
           settings 'bs' (per-device batch size), 'ga' (gradient accumulation
           steps), 'lr' (learning rate) and 'steps' (max optimizer steps).

    Returns:
        A tuple `(metrics, model)`. `metrics` is `{'loss': ..., 'ppl': ...}`;
        both values are None when no evaluation was run. `model` is the
        LoRA-wrapped model that was trained.
    """
    m = attach_lora(Model, r=h['r'], alpha=h['alpha'], drop=h['drop'])
    coll = DataCollatorForLanguageModeling(tokenizer=Tok, mlm=False)
    args = TrainingArguments(
        output_dir=f"./out_{int(time.time())}",
        per_device_train_batch_size=h['bs'],
        gradient_accumulation_steps=h['ga'],
        learning_rate=h['lr'],
        warmup_steps=10,
        max_steps=h['steps'],
        logging_steps=10,
        save_strategy='no',
        fp16=torch.cuda.is_available(),
        report_to=[],
    )
    trainer = Trainer(model=m, args=args, train_dataset=ds['train'], eval_dataset=val, data_collator=coll)
    trainer.train()

    # Evaluate only when there is something to evaluate on.
    has_val = val is not None and len(val) > 0
    ev = trainer.evaluate() if has_val else {}
    loss = ev.get('eval_loss')

    # `is not None` rather than truthiness: a loss of exactly 0.0 is a valid
    # result and must not be reported as "missing".
    if loss is None:
        ppl = None
    else:
        try:
            ppl = math.exp(loss)
        except OverflowError:
            # A diverged run can produce a loss too large for exp(); report
            # infinite perplexity instead of aborting the sweep.
            ppl = float('inf')
    return {'loss': loss, 'ppl': ppl}, m

import tempfile
from peft import PeftModel

# Configurations to try; add more dicts here to widen the sweep.
search=[{'lr':2e-4,'r':16,'alpha':32,'drop':0.05,'bs':2,'ga':4,'steps':100}]
recs=[]; BEST=None; BEST_MODEL=None

# All trials share the single base `Model`, and attaching LoRA modifies it in
# place. The best adapter is therefore snapshotted to disk and the LoRA layers
# are stripped after every trial, so trials do not stack on top of each other.
BEST_ADAPTER_DIR = tempfile.mkdtemp(prefix='lab5_best_')

def _score(row):
    """Comparable perplexity for a trial row; missing or NaN counts as +inf."""
    p = row.get('ppl')
    return float('inf') if p is None or math.isnan(p) else p

for h in search:
    print('Trial',h)
    mtr, m=run(h)
    row={**h, **mtr}; recs.append(row)
    # The first trial always becomes the incumbent. Later trials must beat it;
    # going through _score avoids comparing a number against None.
    if BEST is None or _score(row) < _score(BEST):
        BEST=row
        m.save_pretrained(BEST_ADAPTER_DIR)  # adapter weights only
    # Remove this trial's LoRA layers so the next trial starts from the clean base.
    m.unload()
    del m

# Rebuild the winning model from its snapshot so later cells can save it.
if BEST is not None:
    BEST_MODEL = PeftModel.from_pretrained(Model, BEST_ADAPTER_DIR)

DF=pd.DataFrame(recs)
display(DF)
print('Best:',BEST)

## Step 6. Save results to Drive

In [None]:
from pathlib import Path

# Build the results folder from the shared lab root (BASE_DIR from Step 1)
# instead of repeating the Drive path literal, so all labs stay in one place.
RES = BASE_DIR / 'lab5_results'
RES.mkdir(parents=True, exist_ok=True)

# Full trial table, one row per configuration.
DF.to_csv(RES / 'trials.csv', index=False)

if BEST_MODEL is not None:
    # Folder name encodes the winning rank, learning rate and accumulation steps.
    tag = f"r{BEST['r']}_lr{BEST['lr']}_ga{BEST['ga']}"
    sd = RES / f"best_{tag}"
    sd.mkdir(parents=True, exist_ok=True)
    BEST_MODEL.save_pretrained(sd)
    Tok.save_pretrained(sd)
    print('Saved best adapters to', sd)
else:
    print('No best model to save')