# Environment (GPU T4)

In [1]:
!nvidia-smi
!pip -q uninstall -y llama-cpp-python llama-cpp-python-cu121 llama-cpp-python-cu122 llama-cpp-python-cu124 || true


Sun Oct  5 03:52:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Sanity
!python -V
!pip -V

# Clean any old bits (safe if not installed)
!pip -q uninstall -y llama-cpp-python || true

# Install the CUDA 12.4 (cu124) prebuilt wheel
!pip install --upgrade --force-reinstall --no-cache-dir \
  llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124


Python 3.12.11
pip 24.1.2 from /usr/local/lib/python3.12/dist-packages/pip (python 3.12)
[0mLooking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu124
Collecting llama-cpp-python
  Downloading https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl (551.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m551.3/551.3 MB[0m [31m109.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Downloading numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-no

# Download model

In [1]:
!pip -q install -U "huggingface_hub>=0.24.0"

from huggingface_hub import list_repo_files, hf_hub_download, login
import os, pathlib

def download_gguf_model(
    repo_id: str,
    quant_preference: str = "Q4_K_M",
    filename: str | None = None,
    dest_dir: str = "/content/models",
    hf_token: str | None = None,
) -> str:
    """
    Download a GGUF model from Hugging Face.
    - If `filename` is set, fetch that file.
    - Else pick the first *.gguf containing `quant_preference` (case-insensitive),
      falling back to the first *.gguf if none match.
    Returns the local path.
    """
    # Only login if a token is provided (avoid the Colab secrets warning)
    if hf_token:
        login(token=hf_token)

    dest = pathlib.Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    files = list_repo_files(repo_id, token=hf_token)
    ggufs = [f for f in files if f.lower().endswith(".gguf")]
    if not ggufs:
        raise FileNotFoundError(f"No .gguf files found in {repo_id}")

    if filename:
        if filename not in files:
            raise FileNotFoundError(f"{filename} not found in {repo_id}")
        target = filename
    else:
        cand = [f for f in ggufs if quant_preference.lower() in f.lower()]
        target = sorted(cand)[0] if cand else sorted(ggufs)[0]

    # Be compatible with different huggingface_hub versions
    try:
        path = hf_hub_download(
            repo_id=repo_id,
            filename=target,
            local_dir=dest,
            token=hf_token,
            local_dir_use_symlinks=False,  # preferred kwarg (plural)
        )
    except TypeError:
        # Older versions don’t support local_dir_use_symlinks; retry without it
        path = hf_hub_download(
            repo_id=repo_id,
            filename=target,
            local_dir=dest,
            token=hf_token,
        )

    print(f"Downloaded: {path}")
    return path

# Example: Phi-3.5 Mini Instruct GGUF (public community conversions)
PHI35_GGUF_REPO = "bartowski/Phi-3.5-mini-instruct-GGUF"  # fallback below if needed

try:
    MODEL_PATH = download_gguf_model(
        repo_id=PHI35_GGUF_REPO,
        quant_preference="Q4_K_M",
        dest_dir="/content/models"
    )
except Exception as e:
    print("Primary repo failed, trying TheBloke mirror…", e)
    MODEL_PATH = download_gguf_model(
        repo_id="TheBloke/phi-3.5-mini-instruct-GGUF",
        quant_preference="Q4_K_M",
        dest_dir="/content/models"
    )

print("MODEL_PATH =", MODEL_PATH)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Phi-3.5-mini-instruct-Q4_K_M.gguf:   0%|          | 0.00/2.39G [00:00<?, ?B/s]

Downloaded: /content/models/Phi-3.5-mini-instruct-Q4_K_M.gguf
MODEL_PATH = /content/models/Phi-3.5-mini-instruct-Q4_K_M.gguf


# Create model and prompts skeleton

In [2]:
import time, re, json, math, uuid, pathlib, statistics as stats
from dataclasses import dataclass, asdict
import numpy as np, pandas as pd
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from llama_cpp import Llama

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_gpu_layers=-1,   # offload as many layers as fit on the T4
    n_batch=512,       # good throughput on T4 for a 7B Q4 model
    logits_all=True,
    seed=1234,
    verbose=True       # banner should say "CUDA = 1" and "offloading ... layers to GPU"
)

def run_llm(prompt, max_tokens=16, temperature=0.0, stop=None):
    t0 = time.time()
    out = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=1.0,
        stop=stop or [],
        echo=False,
        logprobs=5  # enables optional confidence metrics
    )
    dt = time.time() - t0
    txt = out["choices"][0]["text"]
    usage = out.get("usage", {})
    comp_toks = usage.get("completion_tokens", None)
    toks_per_s = (comp_toks/dt) if (comp_toks and dt>0) else None
    # first-token logprobs (for calibration/confidence)
    first_lp = None
    try:
        first_lp = out["choices"][0]["logprobs"]["content"][0]["top_logprobs"]
    except Exception:
        pass
    return txt, dt, toks_per_s, first_lp

# ---------- Prompt templates ----------
def mcq_prompt(q, A, B, C, D):
    return (
    "You are a careful reasoner. Choose the single best answer and reply with ONLY the letter.\n"
    f"Question: {q}\n\nOptions:\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nFinal answer (A/B/C/D) only:"
    )

def gsm_prompt(q):
    return (
    "Solve step by step. At the end, reply with ONLY the final number.\n"
    f"Problem: {q}\n\nFinal answer (number only):"
    )

def parse_letter(s):
    m = re.search(r'\b([ABCD])\b', s.strip())
    return m.group(1) if m else None

def parse_number(s):
    # last number in the string
    m = re.findall(r'-?\d+(?:\.\d+)?', s.replace(',', ''))
    return m[-1] if m else None


ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    yes
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5, VMM: yes
llama_model_load_from_file_impl: using device CUDA0 (Tesla T4) - 14992 MiB free
llama_model_loader: loaded meta data with 40 key-value pairs and 197 tensors from /content/models/Phi-3.5-mini-instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Phi 3.5 Mini Instruct
llama_model_loader: - kv   3:                           general.finetune str              = instruct
llama_model_loader: - kv   4:                           general.basename 

# Sanity check

In [3]:
out = llm("Say hello.", max_tokens=8, temperature=0)
print(out["choices"][0]["text"])

llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =      58.30 ms /     3 tokens (   19.43 ms per token,    51.45 tokens per second)
llama_perf_context_print:        eval time =     141.81 ms /     7 runs   (   20.26 ms per token,    49.36 tokens per second)
llama_perf_context_print:       total time =     204.99 ms /    10 tokens
llama_perf_context_print:    graphs reused =          6




Chatbot: Hello!


In [4]:
import time
t0 = time.time()
out = llm("Write a haiku about code.", max_tokens=32, temperature=0)
dt = time.time() - t0
toks = out.get("usage", {}).get("completion_tokens", 0)
print(f"Latency: {dt:.2f}s  |  toks/sec: {toks/dt if dt>0 else None:.1f}")

llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =     328.41 ms /     7 tokens (   46.92 ms per token,    21.31 tokens per second)
llama_perf_context_print:        eval time =     529.82 ms /    31 runs   (   17.09 ms per token,    58.51 tokens per second)
llama_perf_context_print:       total time =     872.92 ms /    38 tokens
llama_perf_context_print:    graphs reused =         29


Latency: 0.88s  |  toks/sec: 36.4


# Datasets loaders (MMLU + GSM8K)

In [5]:
# MMLU (validation for DEV, test for FINAL)
from datasets import load_dataset
from itertools import chain

# Full subject list in case we need the per-config fallback
MMLU_SUBJECTS = [
 'abstract_algebra','anatomy','astronomy','business_ethics','clinical_knowledge',
 'college_biology','college_chemistry','college_computer_science','college_mathematics',
 'college_medicine','college_physics','computer_security','conceptual_physics','econometrics',
 'electrical_engineering','elementary_mathematics','formal_logic','global_facts',
 'high_school_biology','high_school_chemistry','high_school_computer_science',
 'high_school_european_history','high_school_geography','high_school_government_and_politics',
 'high_school_macroeconomics','high_school_mathematics','high_school_microeconomics',
 'high_school_physics','high_school_psychology','high_school_statistics','high_school_us_history',
 'high_school_world_history','human_aging','human_sexuality','international_law','jurisprudence',
 'logical_fallacies','machine_learning','management','marketing','medical_genetics','miscellaneous',
 'moral_disputes','moral_scenarios','nutrition','philosophy','prehistory','professional_accounting',
 'professional_law','professional_medicine','professional_psychology','public_relations',
 'security_studies','sociology','us_foreign_policy','virology','world_religions'
]

def mmlu_iter(split="validation", limit=None, prefer_all=True):
    """
    Yields dicts with fields:
      id, q, A, B, C, D, gold (A/B/C/D), subject
    """
    count = 0
    try:
        if prefer_all:
            ds = load_dataset("cais/mmlu", "all", split=split)
            for i, ex in enumerate(ds):
                if limit and count >= limit: break
                A, B, C, D = ex["choices"]
                yield {
                    "id": f"mmlu-{i}",
                    "q": ex["question"],
                    "A": A, "B": B, "C": C, "D": D,
                    "gold": ex["answer"],
                    "subject": ex["subject"],
                }
                count += 1
            return
    except Exception as e:
        print("Falling back to per-subject loading:", e)

    # Fallback: iterate each subject config and chain
    for subj in MMLU_SUBJECTS:
        ds = load_dataset("cais/mmlu", subj, split=split)
        for j, ex in enumerate(ds):
            if limit and count >= limit: return
            A, B, C, D = ex["choices"]
            yield {
                "id": f"mmlu-{subj}-{j}",
                "q": ex["question"],
                "A": A, "B": B, "C": C, "D": D,
                "gold": ex["answer"],
                "subject": subj,
            }
            count += 1


# GSM8K (test for FINAL)
def gsm8k_iter(split="test", limit=None):
    ds = load_dataset("gsm8k", "main", split=split)
    for i, ex in enumerate(ds):
        if limit and i >= limit: break
        q = ex["question"]
        # gold after '####'
        m = re.search(r"####\s*(-?\d+(?:\.\d+)?)", ex["answer"].replace(',', ''))
        gold = m.group(1) if m else None
        yield {"id": f"gsm-{i}", "q": q, "gold": gold}


# Runners + metrics

In [8]:
# --- runners + metrics (drop-in cell) ----------------------------------------
# Requires that you already defined elsewhere in the notebook:
#   - mmlu_iter(split, limit), gsm8k_iter(split, limit)
#   - run_llm(prompt, max_tokens, temperature, stop=None) -> (text, latency_s, toks_per_s, top_logprobs)
#   - mcq_prompt(q, A, B, C, D), gsm_prompt(q)
#   - parse_letter(text) -> raw letter-ish prediction; parse_number(text) -> numeric/str for GSM8K
# ---------------------------------------------------------------------------

from dataclasses import dataclass
import re
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

ABCD = ["A", "B", "C", "D"]

@dataclass
class RunCfg:
    model_name: str
    quant: str
    temperature: float = 0.0
    seed: int = 1234


def norm_choice(x):
    """Normalize any 'choice' signal to a single letter 'A'..'D'. Return None if not parseable."""
    if x is None:
        return None

    # Handle NaN (from pandas) quietly
    try:
        if isinstance(x, float) and np.isnan(x):
            return None
    except Exception:
        pass

    s = str(x).strip().upper()
    if not s:
        return None

    # direct single letter
    if s in ABCD:
        return s

    # patterns like 'A.', 'B)', '(C)', 'Answer: D', 'option c'
    m = re.search(r'([ABCD])(?=[\)\].,:;\s]|$)', s)
    if m:
        return m.group(1)

    # numbers 1..4 or 0..3
    m = re.search(r'\b([1-4])\b', s)
    if m:
        return ABCD[int(m.group(1)) - 1]
    m = re.search(r'\b([0-3])\b', s)
    if m:
        return ABCD[int(m.group(1))]

    # last resort: first char if A-D
    if s and s[0] in ABCD:
        return s[0]

    return None


def evaluate_mmlu(cfg: RunCfg, split: str = "validation", limit=None):
    rows = []
    for ex in mmlu_iter(split, limit):
        prompt = mcq_prompt(ex["q"], ex["A"], ex["B"], ex["C"], ex["D"])
        out, dt, tps, top = run_llm(
            prompt,
            max_tokens=8,
            temperature=cfg.temperature,
            stop=["\n"],
        )
        pred = parse_letter(out)  # keep your parser; we'll normalize below
        rows.append({
            "id": ex["id"],
            "subject": ex["subject"],
            "gold": ex["gold"],
            "pred": pred,
            "latency_s": dt,
            "toks_per_s": tps,
            "raw": out,
            "top_logprobs": top,
        })

    df = pd.DataFrame(rows)

    # Normalize AFTER collection to avoid mixing types
    df["gold_norm"] = df["gold"].apply(norm_choice)
    df["pred_norm"] = df["pred"].apply(norm_choice)

    # Mark correctness on normalized labels (invalid rows become False)
    df["correct"] = (
        df["pred_norm"].isin(ABCD)
        & df["gold_norm"].isin(ABCD)
        & (df["pred_norm"] == df["gold_norm"])
    )

    # Evaluate only rows where both sides are valid letters
    eval_mask = df["gold_norm"].isin(ABCD) & df["pred_norm"].isin(ABCD)
    eval_df = df[eval_mask].copy()

    labels = ABCD
    if len(eval_df):
        y_true = eval_df["gold_norm"].tolist()
        y_pred = eval_df["pred_norm"].tolist()
        acc = float(eval_df["correct"].mean())
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, labels=labels, average="macro", zero_division=0
        )
        cm = confusion_matrix(y_true, y_pred, labels=labels).tolist()
    else:
        acc, prec, rec, f1 = 0.0, 0.0, 0.0, 0.0
        cm = [[0] * 4 for _ in range(4)]

    # per-subject accuracy (only on evaluated rows so coverage gaps don’t penalize)
    by_subj = (
        eval_df.groupby("subject")["correct"].mean().sort_values(ascending=False)
        if len(eval_df) else pd.Series(dtype=float)
    )
    by_subj = {k: float(v) for k, v in by_subj.to_dict().items()}

    # latency stats across all processed rows (independent of label validity)
    lat = {
        "p50": float(df["latency_s"].median()) if len(df) else 0.0,
        "p90": float(df["latency_s"].quantile(0.9)) if len(df) else 0.0,
        "mean": float(df["latency_s"].mean()) if len(df) else 0.0,
    }
    # throughput
    tps_series = df["toks_per_s"].dropna()
    perf = {"tps_mean": float(tps_series.mean()) if not tps_series.empty else None}

    metrics = {
        "n_total": int(len(df)),
        "n_eval": int(len(eval_df)),
        "coverage": float(len(eval_df) / len(df) if len(df) else 0.0),
        "accuracy": float(acc),
        "precision_macro": float(prec),
        "recall_macro": float(rec),
        "f1_macro": float(f1),
        "confusion_matrix": cm,
        "latency": lat,
        "throughput": perf,
        "per_subject_accuracy": by_subj,
    }

    return df, metrics


def evaluate_gsm8k(cfg: RunCfg, split: str = "test", limit=None):
    rows = []
    for ex in gsm8k_iter(split, limit):
        out, dt, tps, top = run_llm(
            gsm_prompt(ex["q"]),
            max_tokens=128,
            temperature=cfg.temperature,
        )
        pred = parse_number(out)
        rows.append({
            "id": ex["id"],
            "gold": ex["gold"],
            "pred": pred,
            "latency_s": dt,
            "toks_per_s": tps,
            "raw": out,
        })

    df = pd.DataFrame(rows)

    # strict EM accuracy
    df["correct"] = (df["pred"].astype(str) == df["gold"].astype(str))
    acc = float(df["correct"].mean())

    # format-error rate (non-numeric outputs)
    fmt_err = float(df["pred"].isna().mean())

    # latency + throughput (across all rows)
    lat = {
        "p50": float(df["latency_s"].median()) if len(df) else 0.0,
        "p90": float(df["latency_s"].quantile(0.9)) if len(df) else 0.0,
        "mean": float(df["latency_s"].mean()) if len(df) else 0.0,
    }
    tps = df["toks_per_s"].dropna()
    perf = {"tps_mean": float(tps.mean()) if not tps.empty else None}

    metrics = {
        "n_total": int(len(df)),
        "accuracy_em": acc,
        "format_error_rate": fmt_err,
        "latency": lat,
        "throughput": perf,
    }

    return df, metrics
# ---------------------------------------------------------------------------


# Run a smoke test (fast)

In [9]:
cfg = RunCfg(model_name="phi-3.5-mini-instruct", quant="Q4_K_M")

mmlu_df, mmlu_metrics = evaluate_mmlu(cfg, split="validation", limit=200)   # quick
gsm_df,  gsm_metrics  = evaluate_gsm8k(cfg, split="test", limit=100)        # quick

print("MMLU:", json.dumps(mmlu_metrics, indent=2)[:800], "…")
print("GSM8K:", json.dumps(gsm_metrics, indent=2)[:800], "…")


Llama.generate: 24 prefix-match hit, remaining 56 prompt tokens to eval
llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =      75.65 ms /    56 tokens (    1.35 ms per token,   740.27 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =      78.73 ms /    57 tokens
llama_perf_context_print:    graphs reused =          0
Llama.generate: 24 prefix-match hit, remaining 54 prompt tokens to eval
llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =      68.40 ms /    54 tokens (    1.27 ms per token,   789.49 tokens per second)
llama_perf_context_print:        eval time =      18.07 ms /     1 runs   (   18.07 ms per token,    55.33 tokens per second)
llama_perf_context_print:       total time =      89.67 ms /    55 tokens
llama_perf_context_print:    g

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =     113.66 ms /   102 tokens (    1.11 ms per token,   897.44 tokens per second)
llama_perf_context_print:        eval time =    2213.06 ms /   127 runs   (   17.43 ms per token,    57.39 tokens per second)
llama_perf_context_print:       total time =    2416.24 ms /   229 tokens
llama_perf_context_print:    graphs reused =        122
Llama.generate: 21 prefix-match hit, remaining 39 prompt tokens to eval
llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =      60.94 ms /    39 tokens (    1.56 ms per token,   639.93 tokens per second)
llama_perf_context_print:        eval time =    1831.31 ms /   108 runs   (   16.96 ms per token,    58.97 tokens per second)
llama_perf_context_print:       total time =    1956.86 ms /   147 tokens
llama_perf_context_print:    graphs reused =        103
Llama.generate: 21 prefix-match hit, remaining

MMLU: {
  "n_total": 200,
  "n_eval": 168,
  "coverage": 0.84,
  "accuracy": 0.27380952380952384,
  "precision_macro": 0.26916866028708136,
  "recall_macro": 0.17577519379844958,
  "f1_macro": 0.21137566137566138,
  "confusion_matrix": [
    [
      34,
      31,
      3,
      7
    ],
    [
      7,
      3,
      28,
      5
    ],
    [
      3,
      4,
      9,
      34
    ],
    [
      0,
      0,
      0,
      0
    ]
  ],
  "latency": {
    "p50": 0.1750345230102539,
    "p90": 0.24130010604858398,
    "mean": 0.18871362686157225
  },
  "throughput": {
    "tps_mean": 11.145415350611902
  },
  "per_subject_accuracy": {
    "college_computer_science": 0.5555555555555556,
    "abstract_algebra": 0.5,
    "astronomy": 0.4666666666666667,
    "business_ethics": 0.4444444444444444,
    "c …
GSM8K: {
  "n_total": 100,
  "accuracy_em": 0.16,
  "format_error_rate": 0.0,
  "latency": {
    "p50": 7.14889931678772,
    "p90": 7.721417427062988,
    "mean": 7.196120216846466
  },
  "t

# Persist results (for compare across models later)

In [10]:
# --- persist run outputs -----------------------------------------------------
import uuid, pathlib, time, json
from dataclasses import asdict

# prefer /content on Colab; otherwise fall back to a local folder
run_id = uuid.uuid4().hex[:8]
out_dir = pathlib.Path("/content/llm-bench-runs")
if not out_dir.exists():
    out_dir = pathlib.Path("./llm-bench-runs")
out_dir.mkdir(parents=True, exist_ok=True)

summary = {
    "run_id": run_id,
    "model": cfg.model_name,
    "quant": cfg.quant,
    "hardware": {"gpu": "T4", "gpu_ram_gb": 15, "sys_ram_gb": 12.7},  # tweak if needed
    "settings": asdict(cfg),
    "mmlu": mmlu_metrics,
    "gsm8k": gsm_metrics,
    "timestamp": time.time(),
}

(out_dir / f"{run_id}_summary.json").write_text(json.dumps(summary, indent=2))
mmlu_df.to_json(out_dir / f"{run_id}_mmlu_preds.jsonl", orient="records", lines=True)
gsm_df.to_json(out_dir / f"{run_id}_gsm_preds.jsonl", orient="records", lines=True)

print("Saved under:", out_dir.resolve())
for p in sorted(out_dir.glob(f"{run_id}_*")):
    print(" -", p.name)
# -----------------------------------------------------------------------------


Saved under: /content/llm-bench-runs
 - dfc8d221_gsm_preds.jsonl
 - dfc8d221_mmlu_preds.jsonl
 - dfc8d221_summary.json


# ARC-C + HellaSwag evaluators

In [15]:
# ARC-Challenge & HellaSwag evaluators (complete)

from dataclasses import dataclass, asdict
from typing import List, Tuple, Optional, Dict, Any
from datasets import load_dataset
import pandas as pd
import numpy as np
import time, re, json, math
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# ---------- Shared helpers ----------

LETTER_RE = re.compile(r"\b([A-Z])\b")

def parse_letter_from_set(text: str, allowed: List[str]) -> Optional[str]:
    """
    Robustly parse a single-letter answer from {allowed}.
    Tries strict regex first, then a looser scan.
    """
    if not text:
        return None
    m = LETTER_RE.search(text.upper())
    if m:
        c = m.group(1)
        if c in allowed:
            return c
    up = text.upper()
    for c in allowed:
        if re.search(rf"\b{c}\b", up):
            return c
    return None

def mcq_prompt_generic(question: str, choices: List[str], letters: List[str]) -> str:
    lines = [question.strip(), ""]
    for L, opt in zip(letters, choices):
        lines.append(f"{L}. {opt.strip()}")
    lines.append("")
    letters_slash = "/".join(letters)
    lines.append(f"Choose the single best answer. Reply with one letter only ({letters_slash}).")
    return "\n".join(lines)

# ---------- ARC-Challenge ----------

def arc_iter(split: str = "validation", limit: Optional[int] = None):
    """
    Iterates ARC-Challenge (ai2_arc) with normalized A..E letters and gold = answerKey.
    """
    ds = load_dataset("ai2_arc", "ARC-Challenge", split=split)
    for i, ex in enumerate(ds):
        if limit is not None and i >= limit: break
        q = ex["question"]
        labels = ex["choices"]["label"]   # e.g. ["A","B","C","D"] (sometimes "E")
        texts  = ex["choices"]["text"]
        # Canonicalize A.. order
        pairs = sorted(zip(labels, texts), key=lambda p: p[0])
        letters = [L for L,_ in pairs]
        opts    = [t for _,t in pairs]
        gold = ex.get("answerKey")
        gold = gold.strip().upper() if isinstance(gold, str) else None
        yield {"id": f"arc-{i}", "q": q, "letters": letters, "choices": opts, "gold": gold}

def evaluate_arc(cfg: RunCfg, split: str = "validation", limit: Optional[int] = None):
    rows = []
    for ex in arc_iter(split, limit):
        prompt = mcq_prompt_generic(ex["q"], ex["choices"], ex["letters"])
        out, dt, tps, top = run_llm(prompt, max_tokens=2, temperature=cfg.temperature, stop=["\n"])
        pred = parse_letter_from_set(out, ex["letters"])
        rows.append({
            "id": ex["id"], "gold": ex["gold"], "pred": pred,
            "letters": "".join(ex["letters"]),
            "latency_s": dt, "toks_per_s": tps, "raw": out, "top_logprobs": top
        })
    df = pd.DataFrame(rows)

    df["correct"] = (df["pred"] == df["gold"])
    acc = df["correct"].mean()

    # dynamic label set (handles occasional 'E')
    in_true = set(df["gold"].dropna().unique().tolist())
    in_pred = set(df["pred"].dropna().unique().tolist())
    labels = sorted(list((in_true | in_pred) or set(["A","B","C","D"])))

    y_true = df["gold"].fillna("Z")
    y_pred = df["pred"].fillna("Z")

    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )
    cm = confusion_matrix(y_true, y_pred, labels=labels).tolist()
    fmt_err = df["pred"].isna().mean()

    lat = {"p50": df["latency_s"].median(), "p90": df["latency_s"].quantile(0.9), "mean": df["latency_s"].mean()}
    tps = df["toks_per_s"].dropna()
    perf = {"tps_mean": float(tps.mean()) if not tps.empty else None}

    metrics = {
        "accuracy": float(acc) if not math.isnan(acc) else None,
        "precision_macro": float(prec),
        "recall_macro": float(rec),
        "f1_macro": float(f1),
        "confusion_matrix_labels": labels,
        "confusion_matrix": cm,
        "format_error_rate": float(fmt_err),
        "latency": lat,
        "throughput": perf,
    }
    return df, metrics

# ---------- HellaSwag ----------

def hellaswag_iter(split: str = "validation", limit: Optional[int] = None):
    """
    Iterates HellaSwag with A..D choices. Use 'validation' for labeled eval.
    """
    ds = load_dataset("hellaswag", split=split)  # 'validation' is labeled; 'test' is unlabeled
    for i, ex in enumerate(ds):
        if limit is not None and i >= limit: break
        ctx = ex["ctx"]
        endings = ex["endings"]  # list of 4
        letters = ["A","B","C","D"]
        gold_idx = ex.get("label", None)
        gold = letters[gold_idx] if gold_idx is not None else None
        q = ctx.strip() + "\n\nWhich ending is most plausible?"
        yield {"id": f"hs-{i}", "q": q, "choices": endings, "letters": letters, "gold": gold}

def evaluate_hellaswag(cfg: RunCfg, split: str = "validation", limit: Optional[int] = None):
    rows = []
    for ex in hellaswag_iter(split, limit):
        prompt = mcq_prompt_generic(ex["q"], ex["choices"], ex["letters"])
        out, dt, tps, top = run_llm(prompt, max_tokens=2, temperature=cfg.temperature, stop=["\n"])
        pred = parse_letter_from_set(out, ex["letters"])
        rows.append({
            "id": ex["id"], "gold": ex["gold"], "pred": pred,
            "latency_s": dt, "toks_per_s": tps, "raw": out, "top_logprobs": top
        })
    df = pd.DataFrame(rows)
    df["correct"] = (df["pred"] == df["gold"])
    acc = df["correct"].mean()

    y_true = df["gold"].fillna("Z")
    y_pred = df["pred"].fillna("Z")
    labels = ["A","B","C","D"]
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )
    cm = confusion_matrix(y_true, y_pred, labels=labels).tolist()
    fmt_err = df["pred"].isna().mean()

    lat = {"p50": df["latency_s"].median(), "p90": df["latency_s"].quantile(0.9), "mean": df["latency_s"].mean()}
    tps = df["toks_per_s"].dropna()
    perf = {"tps_mean": float(tps.mean()) if not tps.empty else None}

    metrics = {
        "accuracy": float(acc) if not math.isnan(acc) else None,
        "precision_macro": float(prec),
        "recall_macro": float(rec),
        "f1_macro": float(f1),
        "confusion_matrix": cm,
        "format_error_rate": float(fmt_err),
        "latency": lat,
        "throughput": perf,
    }
    return df, metrics


# Smoke test

In [18]:
try:
    _ = cfg  # already set by earlier MMLU/GSM8K smoke test
except NameError:
    cfg = RunCfg(model_name="phi-3.5-mini-instruct", quant="Q4_K_M", temperature=0.0, seed=1234)

print("ARC-Challenge (validation, 100 ex)")
arc_df, arc_metrics = evaluate_arc(cfg, split="validation", limit=100)
print(json.dumps(arc_metrics, indent=2)[:600], "...\n")
display(arc_df.head())

print("\nHellaSwag (validation, 100 ex)")
hs_df, hs_metrics = evaluate_hellaswag(cfg, split="validation", limit=100)
print(json.dumps(hs_metrics, indent=2)[:600], "...\n")
display(hs_df.head())

ARC-Challenge (validation, 100 ex)


llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =     114.20 ms /   104 tokens (    1.10 ms per token,   910.69 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =     119.40 ms /   105 tokens
llama_perf_context_print:    graphs reused =          0
llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =      79.10 ms /    90 tokens (    0.88 ms per token,  1137.81 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =      82.27 ms /    91 tokens
llama_perf_context_print:    graphs reused =          0
llama_perf_context_print:        load time =      58.75 ms
llama_perf_context_print: prompt eval time =     135.81 ms 

{
  "accuracy": 0.0,
  "precision_macro": 0.0,
  "recall_macro": 0.0,
  "f1_macro": 0.0,
  "confusion_matrix_labels": [
    "1",
    "2",
    "A",
    "B",
    "C",
    "D"
  ],
  "confusion_matrix": [
    [
      0,
      0,
      0,
      0,
      0,
      0
    ],
    [
      0,
      0,
      0,
      0,
      0,
      0
    ],
    [
      0,
      0,
      0,
      0,
      0,
      0
    ],
    [
      0,
      0,
      0,
      0,
      0,
      0
    ],
    [
      0,
      0,
      0,
      0,
      0,
      0
    ],
    [
      0,
      0,
      0,
      0,
      0,
      0
    ]
  ] ...



Unnamed: 0,id,gold,pred,letters,latency_s,toks_per_s,raw,top_logprobs,correct
0,arc-0,D,,ABCD,0.165362,6.047334,,,False
1,arc-1,C,,ABCD,0.125493,7.968554,,,False
2,arc-2,D,,ABCD,0.20235,4.941929,,,False
3,arc-3,A,,ABCD,0.110395,9.058365,,,False
4,arc-4,B,,ABCD,0.114341,8.745731,,,False



HellaSwag (validation, 100 ex)


TypeError: list indices must be integers or slices, not str

# Persist results (for compare across models later)

In [13]:
summary.update({"arc_c": arc_metrics, "hellaswag": hs_metrics})
( out_dir / f"{run_id}_arc_preds.jsonl").write_text(arc_df.to_json(orient="records", lines=True))
( out_dir / f"{run_id}_hs_preds.jsonl").write_text(hs_df.to_json(orient="records", lines=True))
( out_dir / f"{run_id}_summary.json").write_text(json.dumps(summary, indent=2))
print("Appended ARC-C + HellaSwag to:", out_dir / f"{run_id}_summary.json")


Appended ARC-C + HellaSwag to: llm-bench-runs/dfc8d221_summary.json
