In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# force_remount=True asks for a fresh mount even when one is already present.
# NOTE(review): every path configured later in this notebook lives directly
# under /content, not /content/drive — confirm this mount is still required.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
# Step 1: remove the preinstalled copies of these packages so the pinned
# versions installed below replace them instead of mixing with them.
# NOTE(review): the notebook tips recommend `%pip` over `!pip` so installs are
# guaranteed to target the running kernel's environment — consider switching.
!pip uninstall -y bitsandbytes triton transformers peft accelerate xformers

# Step 2: install pinned bitsandbytes and triton, then torch 2.3.1 from the
# CUDA 12.1 wheel index.
# NOTE(review): torchvision and torchaudio are not version-pinned, and torch is
# installed after bitsandbytes/triton — confirm the resolver ends up with a
# mutually compatible set (the pinned triton may be replaced by torch's own pin).
!pip install -q bitsandbytes==0.43.3
!pip install -q triton==2.3.0
!pip install -q torch==2.3.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Step 3: install the pinned Hugging Face stack.
# NOTE(review): scipy and numpy are left unpinned, so a later re-run may pick
# up different versions.
!pip install -q transformers==4.44.2 peft==0.10.0 accelerate==0.34.2 datasets==2.21.0 scipy numpy

# Step 4 (optional): xformers for faster attention.
!pip install -q xformers==0.0.27

# Step 5: allocator setting intended to reduce CUDA memory fragmentation.
# It is set here, before torch is imported below.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Step 6: sanity checks — import the freshly installed packages and print the
# torch version, CUDA availability and the bitsandbytes install location.
# NOTE(review): if torch was already imported in this kernel before the
# reinstall, a runtime restart is presumably needed for the new version to load.
import torch, bitsandbytes as bnb, triton, transformers, peft, accelerate
print("✅ Torch:", torch.__version__)
print("✅ CUDA available:", torch.cuda.is_available())
print("✅ bitsandbytes path:", bnb.__file__)
!nvidia-smi

[0mFound existing installation: triton 3.4.0
Uninstalling triton-3.4.0:
  Successfully uninstalled triton-3.4.0
Found existing installation: transformers 4.57.0
Uninstalling transformers-4.57.0:
  Successfully uninstalled transformers-4.57.0
Found existing installation: peft 0.17.1
Uninstalling peft-0.17.1:
  Successfully uninstalled peft-0.17.1
Found existing installation: accelerate 1.10.1
Uninstalling accelerate-1.10.1:
  Successfully uninstalled accelerate-1.10.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.6/155.6 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.1 requires transformers<5.0.0,>=4.41.0, which is not installed.[0m[31m
[

In [1]:
# Locations used by the judge evaluation.
# NOTE(review): the same five constants are assigned again, with identical
# values, at the top of the evaluation cell that follows — keep the two copies
# in sync or drop one of them.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # model id of the base checkpoint
ADAPTER_PATH = "/content/qlora-judge-ckpt"  # LoRA adapter directory
DATA_PATH = "/content/val.jsonl"  # evaluation set, one JSON record per line

OUT_CSV = "/content/judge_eval.csv"  # per-example score table
OUT_REPORT = "/content/judge_metrics.json"  # aggregated per-metric report

In [None]:
import json, re, csv, os, torch, numpy as np
from typing import Dict, Any, List, Optional
from scipy.stats import pearsonr, spearmanr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# =========================
# ✅ CONFIG
# =========================
# Model id passed to from_pretrained for the tokenizer and for both models.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER_PATH = "/content/qlora-judge-ckpt"   # LoRA adapter loaded on top of the base model
DATA_PATH = "/content/val.jsonl"        # evaluation dataset, read line by line as JSON
# Per-example gold/base/tuned scores are written here as CSV.
OUT_CSV = "/content/judge_eval.csv"
# Per-metric MSE / MAE / correlation report is written here as JSON.
OUT_REPORT = "/content/judge_metrics.json"
# Token budget: used as the tokenizer truncation length and as the threshold
# above which a prompt is split into chunks.
MAX_LEN = 2048
# Upper bound on generated tokens per prompt chunk.
MAX_NEW_TOKENS = 64
# Score keys looked up in both the gold and the predicted JSON.
METRICS = ["answer_relevancy", "hallucination", "summarization", "toxicity", "bias"]

# =========================
# ✅ Helper functions
# =========================
def load_jsonl(path):
    """Read a JSON Lines file and return its decoded records as a list.

    Loading is best-effort: blank lines are ignored and any line that cannot
    be decoded is skipped without notice, so the result may contain fewer
    entries than the file has lines.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one decoded JSON value per valid line, in file order.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # filter(None, ...) discards the lines that are empty after stripping.
        for text in filter(None, stripped_lines):
            try:
                record = json.loads(text)
            except Exception:
                # Deliberately tolerant: one malformed line must not abort the load.
                continue
            records.append(record)
    return records

def safe_json(txt: str):
    """Extract and parse the first JSON object embedded in raw model text.

    The text is scanned left to right; at every ``{`` a JSON value is decoded
    in place, and the first one that decodes to an object is returned. Unlike
    a greedy ``{...}`` match, this still succeeds when the object is followed
    by extra text containing braces or by further JSON objects.

    Note that when an outer object is malformed (for example truncated), a
    well-formed object nested inside it can be returned instead.

    Args:
        txt: Raw text that may contain a JSON object somewhere inside it.

    Returns:
        The decoded ``dict``, or ``None`` when ``txt`` is not a string or no
        JSON object can be decoded from it.
    """
    if not isinstance(txt, str):
        return None

    decoder = json.JSONDecoder()
    start = txt.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete value, so any
            # trailing text after the object is simply ignored.
            candidate, _end = decoder.raw_decode(txt, start)
        except (ValueError, RecursionError):
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = txt.find("{", start + 1)
    return None

def extract_overall(js: Dict[str, Any]) -> Optional[Dict[str, float]]:
    """Extract the metric scores listed in ``METRICS`` from a parsed JSON object.

    Scores are read from the first of ``js["overall"]``, ``js["scores"]`` that
    is a non-empty dict, falling back to ``js`` itself. A non-dict value under
    those keys (for example ``"overall": 0.8``) is ignored rather than chosen,
    so top-level scores remain reachable in that case.

    Args:
        js: Parsed JSON value; anything other than a dict yields ``None``.

    Returns:
        A dict mapping metric name to a finite ``float`` for every metric that
        is present and convertible, or ``None`` when no metric could be read.
    """
    if not isinstance(js, dict):
        return None

    container = js
    for key in ("overall", "scores"):
        nested = js.get(key)
        if isinstance(nested, dict) and nested:
            container = nested
            break

    scores = {}
    for metric in METRICS:
        raw = container.get(metric)
        if raw is None:
            continue
        try:
            value = float(raw)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric score (e.g. "high", a list): leave this metric out.
            continue
        # NaN / infinity would poison the averaged metrics and the JSON report.
        if np.isfinite(value):
            scores[metric] = value
    return scores or None

def build_prompt(instr: str, inp: str) -> str:
    """Combine an instruction and its input into a single prompt string.

    Both parts are stripped of surrounding whitespace, separated by one blank
    line, and the result is terminated with a single newline.
    """
    instruction_part = instr.strip()
    input_part = inp.strip()
    return "\n\n".join((instruction_part, input_part)) + "\n"

def chunk_text(tokenizer, text, max_len=2048):
    """Split ``text`` into consecutive pieces of at most ``max_len`` tokens.

    The text is tokenized once with ``tokenizer.encode``, the token list is
    cut into windows of ``max_len`` tokens, and each window is decoded back to
    a string with special tokens skipped. No tokens are dropped between
    windows.

    NOTE(review): ``encode`` is called with its defaults, so any special
    tokens it adds presumably count toward the window size — confirm against
    the tokenizer in use.

    Args:
        tokenizer: Object exposing ``encode(text)`` and
            ``decode(tokens, skip_special_tokens=True)``.
        text: The text to split.
        max_len: Maximum number of tokens per chunk; must be positive.

    Yields:
        The decoded text of each chunk, in order.

    Raises:
        ValueError: If ``max_len`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if max_len <= 0:
        # A negative step would make range() empty and silently drop all text.
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")

    token_ids = tokenizer.encode(text)
    for start in range(0, len(token_ids), max_len):
        yield tokenizer.decode(token_ids[start:start + max_len], skip_special_tokens=True)

@torch.inference_mode()
def generate_json(model, tokenizer, prompt: str, max_new_tokens=64):
    """Greedily generate a completion for one prompt chunk and parse it as JSON.

    The prompt is tokenized (truncated to ``MAX_LEN`` tokens), moved to the
    model's device and completed with greedy decoding. Only the newly
    generated tokens are decoded and handed to ``safe_json``.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Prompt text for this chunk.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        Whatever ``safe_json`` returns for the completion: the parsed object,
        or ``None`` when no JSON object could be extracted.
    """
    # NOTE(review): the prompt may use all MAX_LEN tokens, which leaves no
    # headroom for max_new_tokens inside a MAX_LEN-sized context window —
    # confirm the model's context length.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                       max_length=MAX_LEN, padding=False).to(model.device)
    # Greedy decoding: no `temperature` is passed because it only applies to
    # sampling and merely triggers a warning when do_sample=False.
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Slice by token count, not by len(prompt) characters: the decoded prompt
    # is not guaranteed to match the original string (truncation, whitespace
    # normalisation), which would shift or cut the completion.
    prompt_token_count = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(output[0][prompt_token_count:], skip_special_tokens=True)
    return safe_json(completion.strip())

def average_json_scores(json_list):
    """Average per-metric scores across several parsed JSON predictions.

    Each element is passed through ``extract_overall``; elements that yield no
    scores are ignored. For every metric in ``METRICS`` that was seen at least
    once, the mean of the collected values is returned.

    Args:
        json_list: Iterable of parsed JSON predictions (one per prompt chunk).

    Returns:
        A dict mapping metric name to its mean score as a ``float``; metrics
        never observed are absent, so the dict may be empty.
    """
    collected = {}
    for parsed in json_list:
        scores = extract_overall(parsed)
        if not scores:
            continue
        for metric in METRICS:
            if metric in scores:
                collected.setdefault(metric, []).append(scores[metric])
    # Iterate METRICS (not `collected`) so keys come out in the configured order.
    return {
        metric: float(np.mean(collected[metric]))
        for metric in METRICS
        if metric in collected
    }

def correlations(true, pred):
    """Compute Pearson and Spearman correlation between two score sequences.

    Args:
        true: Sequence of reference values.
        pred: Sequence of predicted values, same length as ``true``.

    Returns:
        A ``(pearson, spearman)`` tuple of plain Python floats. A component is
        ``None`` when it is undefined (NaN, e.g. for constant input). Both are
        ``None`` when fewer than three pairs are given, the lengths differ, or
        the computation fails.
    """
    if len(true) < 3 or len(true) != len(pred):
        return None, None
    try:
        pearson = float(pearsonr(true, pred)[0])
        spearman = float(spearmanr(true, pred)[0])
    except Exception:
        # Best-effort: a failed correlation must not abort the whole report.
        return None, None
    # NaN cannot be written as strict JSON, so report it as "not available".
    return (
        pearson if np.isfinite(pearson) else None,
        spearman if np.isfinite(spearman) else None,
    )

# =========================
# ✅ Model Loading
# =========================
# Pick the compute dtype: on GPU prefer bfloat16 where supported, else float16.
# Without a GPU use float32 — half precision on CPU is slow and not every
# operator is guaranteed to have a half-precision CPU kernel.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    dtype = torch.float32

print("🔹 Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# generate() is later called with pad_token_id=eos_token_id; mirror that on the
# tokenizer when it ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=dtype, device_map="auto")
base_model.eval()

# A second, independent copy of the base weights is loaded so the adapter can
# be attached without touching `base_model`.
# NOTE(review): this doubles the memory used for base weights.
print("🔹 Loading tuned model (base + adapter)...")
tuned = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=dtype, device_map="auto")
tuned = PeftModel.from_pretrained(tuned, ADAPTER_PATH)
tuned.eval()

# =========================
# ✅ Evaluation loop
# =========================
# Score every example with both models and collect aligned gold/base/tuned
# values per metric. Examples without usable gold scores are skipped.
rows = load_jsonl(DATA_PATH)
csv_rows = []
gold_vec, base_vec, tuned_vec = {m: [] for m in METRICS}, {m: [] for m in METRICS}, {m: [] for m in METRICS}

for i, ex in enumerate(rows, 1):
    # load_jsonl keeps any valid JSON value, so a record may not be an object.
    if not isinstance(ex, dict):
        print(f"⚠️ Example {i} is not a JSON object, skipping...")
        continue

    # `or ""` also covers keys that are present but null, which would
    # otherwise break the .strip() calls inside build_prompt.
    instr, inp, gold_raw = ex.get("instruction") or "", ex.get("input") or "", ex.get("output", "")

    # Gold output may be a JSON string or an already-decoded object. A missing
    # or malformed string must skip this example, not abort the whole run.
    if isinstance(gold_raw, str):
        try:
            gold = json.loads(gold_raw)
        except ValueError:
            print(f"⚠️ Example {i} has unparseable gold output, skipping...")
            continue
    else:
        gold = gold_raw
    gold_sc = extract_overall(gold)
    if not gold_sc:
        continue

    prompt = build_prompt(instr, inp)
    # detect if prompt exceeds context limit
    tok_len = len(tokenizer.encode(prompt))
    if tok_len > MAX_LEN:
        print(f"⚠️ Prompt {i} too long ({tok_len}), splitting...")
        # NOTE(review): the whole prompt (instruction + input) is split, so
        # only the first chunk contains the instruction; later chunks are
        # scored without it. Confirm this is intended.
        chunks = list(chunk_text(tokenizer, prompt, MAX_LEN))
    else:
        chunks = [prompt]

    # generate for each chunk & average
    base_preds, tuned_preds = [], []
    for chunk in chunks:
        base_pred = generate_json(base_model, tokenizer, chunk, MAX_NEW_TOKENS)
        tuned_pred = generate_json(tuned, tokenizer, chunk, MAX_NEW_TOKENS)
        if base_pred: base_preds.append(base_pred)
        if tuned_pred: tuned_preds.append(tuned_pred)

    # Both results are dicts (possibly empty), so .get() below is always safe.
    base_sc = average_json_scores(base_preds)
    tuned_sc = average_json_scores(tuned_preds)

    row = {"id": i}
    for m in METRICS:
        g = gold_sc.get(m)
        b = base_sc.get(m)
        t = tuned_sc.get(m)
        row[f"gold.{m}"] = g
        row[f"base.{m}"] = b
        row[f"tuned.{m}"] = t
        # Only fully observed triples enter the metric vectors, so base and
        # tuned are compared on exactly the same examples.
        if g is not None and b is not None and t is not None:
            gold_vec[m].append(g)
            base_vec[m].append(b)
            tuned_vec[m].append(t)
    csv_rows.append(row)
    if i % 5 == 0:
        print(f"Processed {i}/{len(rows)} examples...")

# =========================
# ✅ Metrics aggregation
# =========================
def _json_safe(value):
    """Return ``value`` as a plain float, or None when it is missing or not finite."""
    if value is None or not np.isfinite(value):
        return None
    return float(value)


# Write the per-example table. When no example was scorable, csv_rows is empty
# and csv_rows[0] would raise, so derive the same sorted header from METRICS
# and still emit a header-only file.
if csv_rows:
    fieldnames = sorted(csv_rows[0].keys())
else:
    fieldnames = sorted(["id"] + [f"{src}.{m}" for m in METRICS for src in ("gold", "base", "tuned")])

with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(csv_rows)

# Per-metric error and correlation of each model against the gold scores.
# Metrics with no fully observed examples are left out of the report.
report = {}
for m in METRICS:
    g, b, t = np.array(gold_vec[m]), np.array(base_vec[m]), np.array(tuned_vec[m])
    if len(g) == 0:
        continue
    mse_b, mae_b = float(np.mean((g - b)**2)), float(np.mean(np.abs(g - b)))
    mse_t, mae_t = float(np.mean((g - t)**2)), float(np.mean(np.abs(g - t)))
    pear_b, spear_b = correlations(g.tolist(), b.tolist())
    pear_t, spear_t = correlations(g.tolist(), t.tolist())
    # _json_safe maps NaN/inf to None so the report stays strictly valid JSON.
    report[m] = {
        "base": {"mse": _json_safe(mse_b), "mae": _json_safe(mae_b),
                 "pearson": _json_safe(pear_b), "spearman": _json_safe(spear_b)},
        "tuned": {"mse": _json_safe(mse_t), "mae": _json_safe(mae_t),
                  "pearson": _json_safe(pear_t), "spearman": _json_safe(spear_t)},
        "count": len(g)
    }

with open(OUT_REPORT, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

print("\n✅ Evaluation complete.")
print(json.dumps(report, indent=2))

🔹 Loading base model...
🔹 Loading tuned model (base + adapter)...


Token indices sequence length is longer than the specified maximum sequence length for this model (6560 > 2048). Running this sequence through the model will result in indexing errors


⚠️ Prompt 1 too long (6560), splitting...
⚠️ Prompt 2 too long (5453), splitting...
⚠️ Prompt 3 too long (6738), splitting...
⚠️ Prompt 4 too long (6561), splitting...
⚠️ Prompt 5 too long (6641), splitting...
Processed 5/156 examples...
⚠️ Prompt 6 too long (6467), splitting...
⚠️ Prompt 7 too long (5569), splitting...
⚠️ Prompt 8 too long (6623), splitting...
⚠️ Prompt 9 too long (5477), splitting...
⚠️ Prompt 10 too long (6587), splitting...
Processed 10/156 examples...
⚠️ Prompt 11 too long (6639), splitting...
⚠️ Prompt 12 too long (6607), splitting...
⚠️ Prompt 13 too long (6539), splitting...
⚠️ Prompt 14 too long (6498), splitting...
⚠️ Prompt 15 too long (6582), splitting...
Processed 15/156 examples...
⚠️ Prompt 16 too long (6574), splitting...
⚠️ Prompt 17 too long (6616), splitting...
⚠️ Prompt 18 too long (5224), splitting...
⚠️ Prompt 19 too long (6553), splitting...
⚠️ Prompt 20 too long (6570), splitting...
Processed 20/156 examples...
⚠️ Prompt 21 too long (6567), spli