# Lab 6 – Evaluation and Comparison
**Part 6 of the 7-Lab Hands-On SLM Training Series**

In this lab, you will evaluate your tuned LoRA model against the original base model. We will compute a quick perplexity estimate on a validation split and run side-by-side qualitative prompts to compare outputs. Results and a brief report will be saved to Google Drive.


## Step 0. Stable installs for Colab

In [None]:
%pip install -q --force-reinstall "numpy==2.0.2" "pandas==2.2.2" "pyarrow==17.0.0"
%pip install -q "datasets>=3.0.0" "transformers>=4.41.0" "peft>=0.11.0" "accelerate>=0.29.0" "sentencepiece>=0.1.99" "tqdm>=4.66.0" bitsandbytes
import importlib
for m in ["numpy","pandas","pyarrow","datasets","transformers","peft","accelerate","sentencepiece","tqdm"]:
    mod = importlib.import_module(m)
    print(m, getattr(mod, '__version__', 'unknown'))
print('If imports fail, go to Runtime → Restart runtime, then re-run this cell.')

## Step 1. Mount Google Drive and set paths

In [None]:
from google.colab import drive
from pathlib import Path
# Make Google Drive visible to this runtime; every lab artifact lives under it.
drive.mount('/content/drive')

# Tokenized dataset written by Lab 3.
DATA_DIR = Path("/content/drive/MyDrive/slm-labs/lab3_tokenized")

# Lab 5 output folder, expected to hold one or more 'best_*' adapter directories.
L5_DIR = Path("/content/drive/MyDrive/slm-labs/lab5_results")

# Candidate adapter directories ordered newest-first by modification time.
# The first entry is used automatically; assign BEST_DIR by hand to override.
best_dirs = sorted(
    filter(Path.is_dir, L5_DIR.glob('best_*')),
    key=lambda candidate: candidate.stat().st_mtime,
    reverse=True,
)
BEST_DIR = next(iter(best_dirs), None)
print("Selected BEST_DIR:", BEST_DIR)

# Fail early, with an actionable message, if either input is missing.
assert DATA_DIR.exists(), f"Dataset folder not found: {DATA_DIR}"
assert BEST_DIR is not None and BEST_DIR.exists(), "No best_* adapters found in lab5_results. Run Lab 5 first or set BEST_DIR manually."

## Step 2. Load the validation dataset

In [None]:
from datasets import load_from_disk
from datasets import DatasetDict

# load_from_disk() returns whatever was saved in Lab 3: either a DatasetDict
# (named splits) or a single Dataset with no splits at all.
ds = load_from_disk(str(DATA_DIR))
print(ds)

# Size of the fallback slice used when no held-out split is available.
FALLBACK_EVAL_SIZE = 200

if isinstance(ds, DatasetDict):
    # Prefer a held-out split; fall back to a small slice of train.
    if 'validation' in ds:
        eval_ds = ds['validation']
    elif 'test' in ds:
        eval_ds = ds['test']
    elif 'train' in ds:
        eval_ds = ds['train'].select(range(min(FALLBACK_EVAL_SIZE, len(ds['train']))))
    else:
        raise ValueError(
            f"No 'validation', 'test' or 'train' split in {DATA_DIR}; found splits: {list(ds.keys())}"
        )
else:
    # A plain Dataset has no splits to look up by name, so evaluate on a small
    # slice of it directly instead of failing on ds['train'].
    eval_ds = ds.select(range(min(FALLBACK_EVAL_SIZE, len(ds))))
print('Eval samples:', len(eval_ds))

## Step 3. Load base model and apply best LoRA adapters

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

BASE_MODEL = "HuggingFaceH4/zephyr-7b-beta"  # change if desired


def load_base(name):
    """Load the tokenizer and causal-LM weights for the checkpoint ``name``.

    On a CUDA runtime the model is requested with 4-bit NF4 quantization and
    float16 compute. If the quantization config cannot be built, the model is
    loaded in plain float16 instead. On a CPU-only runtime it is loaded in
    float32.

    Args:
        name: Hub id or local path of the pretrained checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is guaranteed to have a
        pad token (the EOS token is reused when none is defined).
    """
    if torch.cuda.is_available():
        # device_map='auto' is set for BOTH GPU paths. Without it the float16
        # fallback would leave the weights on the CPU even though a GPU is
        # available.
        kwargs = dict(device_map='auto', torch_dtype=torch.float16)
        try:
            kwargs['quantization_config'] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            )
        except Exception as exc:
            # Best-effort fallback, but no longer silent: the user should know
            # the model is about to be loaded unquantized.
            print(f"4-bit quantization unavailable ({exc!r}); loading in float16 instead.")
    else:
        kwargs = dict(torch_dtype=torch.float32)

    tok = AutoTokenizer.from_pretrained(name, use_fast=True)
    mdl = AutoModelForCausalLM.from_pretrained(name, **kwargs)
    if tok.pad_token is None:
        # Batching and generate() need a pad token; reuse EOS when none exists.
        tok.pad_token = tok.eos_token
    return tok, mdl

# Both models share one tokenizer: the adapters sit on top of the same checkpoint.
tokenizer_tuned, model_base = load_base(BASE_MODEL)
tokenizer_base = tokenizer_tuned

# Attach the best LoRA adapters from Lab 5 and switch to inference mode.
# NOTE(review): PEFT's LoRA wrapper is understood to modify the module passed
# to it rather than copy it, so `model_base` may no longer behave as an
# adapter-free model after this call — confirm before treating it as an
# independent baseline.
model_tuned = PeftModel.from_pretrained(model_base, f"{BEST_DIR}")
model_tuned.eval()
print('Loaded base and tuned models')

## Step 4. Quantitative evaluation with perplexity

In [None]:
import math
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Cap the number of evaluated examples so the estimate stays quick.
MAX_EXAMPLES = 512
subset = eval_ds.select(range(min(MAX_EXAMPLES, len(eval_ds))))


def perplexity(model, tokenizer, dataset):
    """Return the token-level perplexity of ``model`` on ``dataset``.

    Each batch's mean loss is weighted by the number of target tokens that
    actually contributed to it, so the result is exp(total NLL / total scored
    tokens) rather than an average skewed by padding.

    Args:
        model: Causal LM whose forward pass returns an object with ``.loss``
            when ``labels`` are supplied.
        tokenizer: Unused; kept so existing call sites keep working.
        dataset: Tokenized examples providing ``input_ids`` (and optionally
            ``attention_mask``) that ``default_data_collator`` can batch.

    Returns:
        The perplexity as a float, or ``nan`` when no token could be scored
        (for example, an empty dataset).
    """
    model.eval()
    loader = DataLoader(dataset, batch_size=2, shuffle=False, collate_fn=default_data_collator)
    total_nll = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(model.device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
            # The dataset may already carry a 'labels' column; forwarding it
            # via **batch together with labels= would raise a TypeError.
            batch.pop('labels', None)

            labels = batch['input_ids'].clone()
            attention_mask = batch.get('attention_mask')
            if attention_mask is not None:
                # -100 is the ignore index of the LM loss: padding must not be
                # scored as if it were real text.
                labels[attention_mask == 0] = -100

            # The causal-LM loss predicts token t+1 from token t, so the first
            # position of every sequence is never a target.
            n_targets = int((labels[:, 1:] != -100).sum().item())
            if n_targets == 0:
                continue

            loss = model(**batch, labels=labels).loss
            total_nll += loss.item() * n_targets
            total_tokens += n_targets

    if total_tokens == 0:
        # Nothing was scored; a perplexity of 1.0 would be misleading.
        return float('nan')
    return math.exp(total_nll / total_tokens)

# PeftModel.from_pretrained() injects the LoRA layers into `model_base` itself,
# so calling `model_base` directly would also run the adapters and both numbers
# would come out identical. Switch the adapters off while measuring the base.
with model_tuned.disable_adapter():
    ppl_base = perplexity(model_base, tokenizer_base, subset)
ppl_tuned = perplexity(model_tuned, tokenizer_tuned, subset)
print(f"Base perplexity:  {ppl_base:.3f}")
print(f"Tuned perplexity: {ppl_tuned:.3f}")

## Step 5. Qualitative prompts side by side

In [None]:
# Prompts shown to both models for a side-by-side qualitative read.
prompts = [
    "Draft a concise cardiology discharge summary for a patient treated for acute coronary syndrome.",
    "Explain the difference between type 1 and type 2 diabetes in plain language for a patient handout.",
    "Summarize key risk factors for stroke in three bullet points.",
]

# Keyword arguments forwarded to every generate() call in this lab.
gen_cfg = {
    'max_new_tokens': 200,
    'do_sample': True,
    'temperature': 0.7,
    'top_p': 0.9,
    'pad_token_id': tokenizer_base.eos_token_id,
}

def generate_text(model, tokenizer, prompt):
    """Generate text for ``prompt`` with ``model`` using the shared ``gen_cfg``.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        prompt: Input text.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed. (For causal LMs this presumably includes the prompt itself —
        confirm if only the continuation is wanted.)
    """
    encoded = tokenizer(prompt, return_tensors='pt')
    encoded = encoded.to(model.device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        sequences = model.generate(**encoded, **gen_cfg)
    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

for i, p in enumerate(prompts, 1):
    print(f"\nPrompt {i}: {p}")
    # `model_base` is the very module the LoRA layers were injected into, so
    # the adapters must be switched off to get a genuine base-model answer;
    # otherwise both outputs come from the tuned model.
    with model_tuned.disable_adapter():
        base_out = generate_text(model_base, tokenizer_base, p)
    tuned_out = generate_text(model_tuned, tokenizer_tuned, p)
    print("\nBase model\n")
    print(base_out)
    print("\nTuned model\n")
    print(tuned_out)


## Step 6. Save results and a brief report to Google Drive

In [None]:
report_dir = Path("/content/drive/MyDrive/slm-labs/lab6_report")
report_dir.mkdir(parents=True, exist_ok=True)
report_path = report_dir / "summary.txt"

# Build the whole report in memory first so the file is only opened once all
# (slow, possibly failing) generation has finished.
report_parts = [
    "Lab 6 – Evaluation and Comparison\n",
    f"Base model: {BASE_MODEL}\n",
    f"Best adapters: {BEST_DIR}\n\n",
    f"Base perplexity:  {ppl_base:.3f}\n",
    f"Tuned perplexity: {ppl_tuned:.3f}\n\n",
]
for i, p in enumerate(prompts, 1):
    # The LoRA layers live inside `model_base` itself, so they have to be
    # disabled to obtain a true base-model sample for the report.
    with model_tuned.disable_adapter():
        base_text = generate_text(model_base, tokenizer_base, p)
    # Note: sampling is enabled in gen_cfg, so these texts are fresh samples.
    tuned_text = generate_text(model_tuned, tokenizer_tuned, p)
    report_parts += [
        f"Prompt {i}: {p}\n",
        "Base model\n",
        "-----\n",
        base_text + "\n\n",
        "Tuned model\n",
        "-----\n",
        tuned_text + "\n\n",
    ]

# Explicit UTF-8: the header contains a non-ASCII dash and model output may
# contain arbitrary Unicode.
with open(report_path, "w", encoding="utf-8") as f:
    f.write("".join(report_parts))

print("Saved report to", report_path)

## Step 7. Next steps
Extend evaluation with task-specific metrics if you have labeled data, such as accuracy or F1 for classification tasks, or BLEU and ROUGE for summarization. Consider building a small human evaluation rubric for clarity, correctness, and tone, and collect a few ratings to validate gains seen in perplexity.