# Lab 6 – Evaluation and Comparison
Compare base vs tuned with quick perplexity and side-by-side generations, then save a brief report.

## Step 0. Stable installs

In [1]:
%pip install -q --force-reinstall numpy==2.0.2 pandas==2.2.2 pyarrow==17.0.0
%pip install -q datasets>=3.0.0 transformers>=4.41.0 peft>=0.11.0 accelerate>=0.29.0 sentencepiece>=0.1.99 tqdm>=4.66.0 bitsandbytes
print('If imports fail, use Runtime → Restart runtime and re-run this cell.')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m117.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.9/229.9 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.2/509.2 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m347.8/347.8 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hIf imports fail, use Runtime → Restart runtime and re-run this cell.


## Step 1. Auto-detect dataset and adapters

In [2]:
from google.colab import drive
from pathlib import Path
import os
drive.mount('/content/drive')
BASE = Path('/content/drive/MyDrive/slm-labs')
assert BASE.exists(), f'Missing {BASE}. Create it or change BASE.'

DSETS=[]
for r,ds,fs in os.walk(BASE):
    if 'dataset_info.json' in fs:
        DSETS.append(Path(r))
print('Datasets found:')
for i,p in enumerate(DSETS,1):
    print(i,p)
DATA_DIR = DSETS[0] if DSETS else None
print('Using DATA_DIR:', DATA_DIR)

ADAPS=[]
for r,ds,fs in os.walk(BASE):
    if 'adapter_config.json' in fs:
        ADAPS.append(Path(r))
print('Adapters found:')
for i,p in enumerate(ADAPS,1):
    print(i,p)
BEST_DIR = ADAPS[0] if ADAPS else None
print('Using BEST_DIR:', BEST_DIR)

assert DATA_DIR and DATA_DIR.exists()
assert BEST_DIR and BEST_DIR.exists()

Mounted at /content/drive
Datasets found:
1 /content/drive/MyDrive/slm-labs/lab3_tokenized/train
Using DATA_DIR: /content/drive/MyDrive/slm-labs/lab3_tokenized/train
Adapters found:
1 /content/drive/MyDrive/slm-labs/lab4_lora_adapters
2 /content/drive/MyDrive/slm-labs/lab5_results/best_r16_lr0.0002_ga4
Using BEST_DIR: /content/drive/MyDrive/slm-labs/lab4_lora_adapters


## Step 2. Load eval split

In [3]:
from datasets import load_from_disk

# Check if DATA_DIR is defined
if 'DATA_DIR' not in locals():
    print("Error: DATA_DIR is not defined. Please run the previous cell to define it.")
else:
    DS = load_from_disk(str(DATA_DIR))

    # Try to pick validation or test if available
    if isinstance(DS, dict) or hasattr(DS, "keys"):
        if "validation" in DS:
            val = DS["validation"]
        elif "test" in DS:
            val = DS["test"]
        elif "train" in DS:
            val = DS["train"].select(range(min(200, len(DS["train"]))))
        else:
            first_split = list(DS.keys())[0]
            val = DS[first_split].select(range(min(200, len(DS[first_split]))))
    else:
        # DS is just a flat Dataset
        val = DS.select(range(min(200, len(DS))))

    print("Eval samples:", len(val))

Eval samples: 200


## Step 3. Load base and tuned models

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

BASE_MODEL='HuggingFaceH4/zephyr-7b-beta'
kw={}
if torch.cuda.is_available():
    try:
        kw=dict(device_map='auto',
                quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                                       bnb_4bit_quant_type='nf4',
                                                       bnb_4bit_compute_dtype=torch.float16,
                                                       bnb_4bit_use_double_quant=True),
                torch_dtype=torch.float16)
    except Exception:
        kw=dict(torch_dtype=torch.float16)
else:
    kw=dict(torch_dtype=torch.float32)
Tok=AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
Base=AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kw)
if Tok.pad_token is None: Tok.pad_token=Tok.eos_token
Tuned=PeftModel.from_pretrained(Base, str(BEST_DIR))
Tuned.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=2)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
     

## Step 4. Quick perplexity

In [None]:
import math
from torch.utils.data import DataLoader
from transformers import default_data_collator

sub = val.select(range(min(512, len(val))))
def ppl(model, tok, ds):
    from torch.utils.data import DataLoader
    from transformers import default_data_collator
    import math, torch

    model.eval()
    L = DataLoader(ds, batch_size=2, shuffle=False, collate_fn=default_data_collator)

    tot = 0
    toks = 0
    with torch.no_grad():
        for b in L:
            # Move tensors to device
            b = {k: v.to(model.device) for k, v in b.items() if hasattr(v, "to")}
            # Pass batch directly, it already includes labels
            out = model(**b)
            loss = out.loss
            tot += loss.item() * b["input_ids"].numel()
            toks += b["input_ids"].numel()

    return math.exp(tot / max(1, toks))

base_ppl=ppl(Base,Tok,sub); tuned_ppl=ppl(Tuned,Tok,sub)
print('Base perplexity:',base_ppl,'Tuned perplexity:',tuned_ppl)

## Step 5. Side-by-side generations

In [None]:
prompts=[
 'Draft a concise cardiology discharge summary for a patient treated for acute coronary syndrome.',
 'Explain the difference between type 1 and type 2 diabetes in plain language for a patient handout.',
 'Summarize key risk factors for stroke in three bullet points.'
]
CFG=dict(max_new_tokens=200, do_sample=True, temperature=0.7, top_p=0.9, pad_token_id=Tok.eos_token_id)
def gen(m,t,p):
    x=t(p, return_tensors='pt').to(m.device)
    with torch.no_grad():
        y=m.generate(**x,**CFG)
    return t.decode(y[0], skip_special_tokens=True)

for p in prompts:
    print('\nPrompt:',p)
    print('\nBase:\n',gen(Base,Tok,p))
    print('\nTuned:\n',gen(Tuned,Tok,p))

## Step 6. Save a brief report to Drive

In [None]:
from pathlib import Path
R=Path('/content/drive/MyDrive/slm-labs/lab6_report'); R.mkdir(parents=True, exist_ok=True)
with open(R/'summary.txt','w') as f:
    f.write('Lab 6 – Evaluation and Comparison\n')
    f.write(f'Base perplexity: {base_ppl:.3f}\nTuned perplexity: {tuned_ppl:.3f}\n\n')
    for i,p in enumerate(prompts,1):
        f.write(f'Prompt {i}: {p}\n')
        f.write('Base\n-----\n'); f.write(gen(Base, Tok, p)+'\n\n')
        f.write('Tuned\n-----\n'); f.write(gen(Tuned, Tok, p)+'\n\n')
print('Saved report to /content/drive/MyDrive/slm-labs/lab6_report/summary.txt')