# Lab 6 – Evaluation and Comparison
Compare the base model with the adapter-tuned model using a quick perplexity check and side-by-side generations, then save a brief report to Google Drive.

## Step 0. Stable installs

In [None]:
%pip install -q --force-reinstall numpy==2.0.2 pandas==2.2.2 pyarrow==17.0.0
%pip install -q datasets>=3.0.0 transformers>=4.41.0 peft>=0.11.0 accelerate>=0.29.0 sentencepiece>=0.1.99 tqdm>=4.66.0 bitsandbytes
print('If imports fail, use Runtime → Restart runtime and re-run this cell.')

## Step 1. Auto-detect dataset and adapters

In [None]:
from google.colab import drive
from pathlib import Path
import os
# Mount Google Drive and locate the lab folder that earlier labs wrote to.
drive.mount('/content/drive')
BASE = Path('/content/drive/MyDrive/slm-labs')
assert BASE.exists(), f'Missing {BASE}. Create it or change BASE.'


def _dirs_containing(root, marker):
    """Return the sorted directories under `root` that directly hold a file named `marker`.

    Sorting matters: `os.walk` yields entries in arbitrary filesystem order, so
    without it the "first" match could differ from one run to the next.
    """
    return sorted(Path(folder) for folder, _subdirs, files in os.walk(root) if marker in files)


def _show(title, paths):
    """Print `title` followed by a 1-based numbered list of `paths`."""
    print(title)
    for i, p in enumerate(paths, 1):
        print(i, p)


# A DatasetDict written by `save_to_disk` is expected to have `dataset_dict.json`
# at its root, with `dataset_info.json` inside each split sub-folder. Prefer the
# roots so DATA_DIR names the whole dataset rather than one arbitrary split.
# Folders that hold a single saved Dataset are still listed after them.
_dict_roots = _dirs_containing(BASE, 'dataset_dict.json')
_single_sets = [d for d in _dirs_containing(BASE, 'dataset_info.json') if d.parent not in _dict_roots]
DSETS = _dict_roots + _single_sets
_show('Datasets found:', DSETS)
DATA_DIR = DSETS[0] if DSETS else None
print('Using DATA_DIR:', DATA_DIR)

# Adapter folders are identified by the `adapter_config.json` file.
# NOTE(review): if several adapters/checkpoints exist, the alphabetically first
# one is used; set BEST_DIR by hand to evaluate a specific one.
ADAPS = _dirs_containing(BASE, 'adapter_config.json')
_show('Adapters found:', ADAPS)
BEST_DIR = ADAPS[0] if ADAPS else None
print('Using BEST_DIR:', BEST_DIR)

assert DATA_DIR and DATA_DIR.exists(), f'No saved dataset found under {BASE}.'
assert BEST_DIR and BEST_DIR.exists(), f'No adapter_config.json found under {BASE}.'

## Step 2. Load eval split

In [None]:
from datasets import DatasetDict, load_from_disk

# `load_from_disk` returns a DatasetDict for a multi-split folder and a plain
# Dataset for a single-split folder; only the former supports split lookup.
DS = load_from_disk(str(DATA_DIR))
if isinstance(DS, DatasetDict):
    # Take the first held-out split that exists and is non-empty.
    val = next(
        (DS[name] for name in ('validation', 'test') if name in DS and len(DS[name]) > 0),
        None,
    )
    if val is None:
        # No usable held-out split: fall back to (at most) the first 200 training rows.
        val = DS['train'].select(range(min(200, len(DS['train']))))
else:
    # DATA_DIR points at a single saved Dataset, so there are no named splits
    # to choose from; evaluate on the rows it contains.
    print('Loaded a single Dataset (no named splits); evaluating on it directly.')
    val = DS
print('Eval samples:', len(val))

## Step 3. Load base and tuned models

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

BASE_MODEL='HuggingFaceH4/zephyr-7b-beta'

# Loading options: 4-bit NF4 quantization on GPU, plain float32 on CPU.
kw={}
if torch.cuda.is_available():
    try:
        kw=dict(device_map='auto', quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True), torch_dtype=torch.float16)
    except Exception as err:
        # Best-effort fallback kept from the original, but no longer silent.
        print('4-bit config unavailable, falling back to float16:', err)
        kw=dict(torch_dtype=torch.float16)
else:
    kw=dict(torch_dtype=torch.float32)

Tok=AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if Tok.pad_token is None: Tok.pad_token=Tok.eos_token

# Reference model: never handed to PEFT, so it stays un-adapted.
Base=AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kw)
Base.eval()

# The adapter is attached to a SEPARATE copy of the base weights. For LoRA-style
# adapters PEFT injects the adapter layers into the model object it is given, so
# wrapping `Base` itself would make `Base` and `Tuned` the same adapted network
# and the comparison below would report identical numbers.
# NOTE(review): this holds two copies of the weights in memory. If that does not
# fit, the alternative is a single PeftModel evaluated inside
# `with Tuned.disable_adapter():` for the base run.
Tuned=PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kw), str(BEST_DIR))
Tuned.eval()

## Step 4. Quick perplexity

In [None]:
import math
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Keep the perplexity pass quick: at most the first 512 evaluation rows.
sub = val.select(range(0, min(len(val), 512)))
def ppl(model, tok, ds, batch_size=2):
    """Estimate the token-level perplexity of `model` over the rows of `ds`.

    Args:
        model: callable causal LM exposing `.eval()`, `.device`, and returning an
            object with a scalar `.loss` when called with `input_ids`,
            `attention_mask` and `labels`.
        tok: tokenizer; only its `pad_token_id` (if any) is read, to fill padding.
        ds: indexable collection of rows, each a mapping with an `input_ids`
            sequence and optionally an `attention_mask` of the same length.
            Other columns (e.g. `labels`, raw text) are ignored.
        batch_size: rows per forward pass (default 2, as before).

    Returns:
        exp(mean negative log-likelihood per predicted token) as a float, or
        NaN when there is nothing to score (empty `ds`, or no row longer than
        one token).

    The mean is weighted by the number of positions that actually contribute
    to each batch loss. This assumes the model averages its loss over shifted
    label positions not equal to -100, as Hugging Face causal LMs do.
    """
    pad_id = getattr(tok, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0  # the value is irrelevant: padded positions are masked out below

    def _collate(rows):
        """Right-pad a list of rows into `input_ids` / `attention_mask` tensors."""
        seqs = [torch.as_tensor(r['input_ids'], dtype=torch.long) for r in rows]
        width = max(s.numel() for s in seqs)
        ids = torch.full((len(seqs), width), pad_id, dtype=torch.long)
        mask = torch.zeros((len(seqs), width), dtype=torch.long)
        for i, (row, seq) in enumerate(zip(rows, seqs)):
            n = seq.numel()
            ids[i, :n] = seq
            if 'attention_mask' in row:
                # Respect padding the dataset already recorded.
                mask[i, :n] = torch.as_tensor(row['attention_mask'], dtype=torch.long)
            else:
                mask[i, :n] = 1
        return {'input_ids': ids, 'attention_mask': mask}

    model.eval()
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=_collate)
    nll_sum = 0.0
    n_tokens = 0
    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(model.device)
            mask = batch['attention_mask'].to(model.device)
            # Padding must not be scored: -100 is the ignore index for the loss.
            labels = ids.masked_fill(mask == 0, -100)
            # Position 0 is never predicted, so count the shifted labels only.
            n = int((labels[:, 1:] != -100).sum())
            if n == 0:
                continue  # nothing to predict; the mean loss would be NaN
            # Pass inputs by name so a stray `labels` column cannot collide.
            out = model(input_ids=ids, attention_mask=mask, labels=labels)
            nll_sum += out.loss.item() * n
            n_tokens += n
    if n_tokens == 0:
        return float('nan')
    return math.exp(nll_sum / n_tokens)

# Score both models on the same subset, base first, then report the pair.
base_ppl = ppl(Base, Tok, sub)
tuned_ppl = ppl(Tuned, Tok, sub)
print('Base perplexity:', base_ppl, 'Tuned perplexity:', tuned_ppl)

## Step 5. Side-by-side generations

In [None]:
# Prompts used for the qualitative side-by-side comparison.
prompts = [
    'Draft a concise cardiology discharge summary for a patient treated for acute coronary syndrome.',
    'Explain the difference between type 1 and type 2 diabetes in plain language for a patient handout.',
    'Summarize key risk factors for stroke in three bullet points.',
]

# Keyword arguments forwarded to `generate`: sampled decoding, up to 200 new
# tokens, with the tokenizer's EOS id supplied as the pad id.
CFG = {
    'max_new_tokens': 200,
    'do_sample': True,
    'temperature': 0.7,
    'top_p': 0.9,
    'pad_token_id': Tok.eos_token_id,
}
def gen(m,t,p):
    """Generate text for prompt `p` with model `m` and tokenizer `t`.

    The prompt is tokenized to PyTorch tensors, moved to `m.device`, and passed
    to `m.generate` together with the module-level `CFG` options, with gradient
    tracking disabled. Returns the first returned sequence decoded with special
    tokens skipped.
    NOTE(review): for decoder-only models the returned sequence presumably
    starts with the prompt tokens, so the text likely echoes the prompt.
    """
    encoded = t(p, return_tensors='pt')
    encoded = encoded.to(m.device)
    with torch.no_grad():
        sequences = m.generate(**encoded, **CFG)
    first_sequence = sequences[0]
    return t.decode(first_sequence, skip_special_tokens=True)

# CFG samples (do_sample=True), so reseed before every generation. This makes a
# re-run reproduce the same text and gives both models the same random stream,
# so differences come from the weights rather than from sampling luck.
SEED = 42
for p in prompts:
    print('\nPrompt:',p)
    torch.manual_seed(SEED)
    print('\nBase:\n',gen(Base,Tok,p))
    torch.manual_seed(SEED)
    print('\nTuned:\n',gen(Tuned,Tok,p))

## Step 6. Save a brief report to Drive

In [None]:
from pathlib import Path
# Destination folder for the report (created if missing).
R = Path('/content/drive/MyDrive/slm-labs/lab6_report')
R.mkdir(parents=True, exist_ok=True)
report_path = R / 'summary.txt'

# Generations are sampled afresh here (CFG uses do_sample=True). Reseed before
# each call so the saved report is reproducible and both models draw from the
# same random stream.
REPORT_SEED = 42

# Build the whole report in memory first, so a failure during generation does
# not leave a half-written file behind.
parts = [
    'Lab 6 – Evaluation and Comparison\n',
    f'Base perplexity: {base_ppl:.3f}\nTuned perplexity: {tuned_ppl:.3f}\n\n',
]
for i, p in enumerate(prompts, 1):
    parts.append(f'Prompt {i}: {p}\n')
    for label, model in (('Base', Base), ('Tuned', Tuned)):
        torch.manual_seed(REPORT_SEED)
        parts.append(f'{label}\n-----\n')
        parts.append(gen(model, Tok, p) + '\n\n')

# Explicit UTF-8: the title contains a non-ASCII en dash and generated text may
# contain arbitrary characters.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(''.join(parts))
print(f'Saved report to {report_path}')