## Display results

In [1]:

import json
import os
import logging
from pathlib import Path

# Logging configuration: INFO threshold, records shaped "time - LEVEL - message".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Logger used by this notebook, named after the executing module.
logger = logging.getLogger(__name__)

In [16]:
from __future__ import annotations

import json, math, re
from pathlib import Path
from typing import Any, Dict, Mapping, Optional, Sequence, Tuple, List

import pandas as pd


# ----------------------------
# Config
# ----------------------------
# Headline metrics; they come first in the table's column order.
PRIMARY_METRICS = ["acc", "acc_norm", "exact_match", "word_perplexity"]
# Secondary metrics, placed after the primary ones.
EXTRA_METRICS = ["perplexity", "byte_perplexity", "bits_per_byte"]
# Whitelist of metric base names; its order is the base column order.
ALL_METRICS = [*PRIMARY_METRICS, *EXTRA_METRICS]


# ----------------------------
# Loading / discovery
# ----------------------------
def load_json(path: str | Path) -> Dict[str, Any]:
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object.

    Args:
        path: Location of the JSON file, as a string or ``Path``.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    source = Path(path)
    with source.open(mode="r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload

def find_model_pairs(input_dir: str | Path) -> Dict[str, Dict[str, Path]]:
    """Discover models that have both result files, ``<model>_1.json`` and ``<model>_2.json``.

    Only regular files directly inside ``input_dir`` are considered; a directory
    whose name happens to match the pattern is ignored so it cannot be handed
    to a JSON loader later. Entries are visited in sorted order, which makes
    the key order of the returned mapping deterministic.

    Args:
        input_dir: Directory to scan (not searched recursively).

    Returns:
        ``{model: {"1": path_to_run_1, "2": path_to_run_2}}`` containing only
        models for which both runs were found.
    """
    input_dir = Path(input_dir)
    # Non-greedy model group: the run digit is always the last "_<1|2>" before ".json".
    pat = re.compile(r"^(?P<model>.+?)_(?P<run>[12])\.json$", flags=re.IGNORECASE)
    pairs: Dict[str, Dict[str, Path]] = {}
    for p in sorted(input_dir.glob("*.json")):
        if not p.is_file():
            continue
        m = pat.match(p.name)
        if m is None:
            continue
        pairs.setdefault(m.group("model"), {})[m.group("run")] = p
    # Drop models that are missing either run.
    return {model: runs for model, runs in pairs.items() if "1" in runs and "2" in runs}


# ----------------------------
# Parse & consolidate (two runs)
# ----------------------------
def _canonicalize_metric(raw_metric: str) -> str:
    if "," in raw_metric:
        base, *quals = raw_metric.split(",")
        qual = "__".join(q.strip() for q in quals if q.strip())
        return f"{base.strip()}__{qual}" if qual else base.strip()
    return raw_metric.strip()

def _keep_metric(metric_name: str) -> bool:
    """Tell whether a canonical metric name belongs in the report.

    The part before the first ``"__"`` is the base name. A metric is kept when
    that base is listed in ``ALL_METRICS``, or when it is the ``_stderr``
    companion of a listed metric.
    """
    base, _, _ = metric_name.partition("__")
    if base in ALL_METRICS:
        return True
    stderr_suffix = "_stderr"
    return base.endswith(stderr_suffix) and base[: -len(stderr_suffix)] in ALL_METRICS

def parse_wandb_summary(summary: Mapping[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Extract whitelisted numeric metrics from a flat ``"<task>/<metric>"`` summary.

    Entries are skipped when:
      * the key contains no ``"/"``;
      * the metric part is ``"alias"`` or ends with ``"_eval_results"``;
      * the canonical metric name is rejected by ``_keep_metric``;
      * the value is ``None``, cannot be converted to a number, or is a
        non-finite float (NaN / +-Infinity).

    The finiteness filter applies to native floats as well as to values
    converted from other types (e.g. the string ``"NaN"``), so both spellings
    of a missing measurement are treated the same way.

    Args:
        summary: Flat mapping of ``"<task>/<raw metric>"`` keys to values.

    Returns:
        ``{task: {canonical_metric: number}}``. Integers are stored unchanged;
        everything else is stored as ``float``.
    """
    out: Dict[str, Dict[str, Any]] = {}
    for key, value in summary.items():
        if "/" not in key:
            continue
        task, raw_metric = key.split("/", 1)
        if raw_metric == "alias" or raw_metric.endswith("_eval_results"):
            continue
        metric = _canonicalize_metric(raw_metric)
        if not _keep_metric(metric):
            continue
        if value is None:
            continue

        if isinstance(value, int):
            # Integers are always finite; keep them untouched.
            keep: Any = value
        else:
            if isinstance(value, float):
                keep = value
            else:
                try:
                    keep = float(value)
                except (TypeError, ValueError, OverflowError):
                    # Not a number (e.g. nested dict, free text): best-effort skip.
                    continue
            if not math.isfinite(keep):
                continue

        out.setdefault(task, {})[metric] = keep
    return out

def consolidate_two_summaries(
    summary1: Mapping[str, Any],
    summary2: Mapping[str, Any],
    labels: Tuple[str, str] = ("run_1", "run_2"),
) -> Dict[str, Any]:
    """Merge two parsed summaries into a single task -> metric -> run structure.

    Args:
        summary1: Raw summary of the first run.
        summary2: Raw summary of the second run.
        labels: Names under which the first and second run's values are stored.

    Returns:
        ``{"metadata": {"runs": [{"label": ...}, {"label": ...}]},
        "tasks": {task: {metric: {label: value}}}}``.
    """
    parsed_first = parse_wandb_summary(summary1)
    parsed_second = parse_wandb_summary(summary2)

    merged: Dict[str, Dict[str, Dict[str, Any]]] = {}
    # The first run is folded in before the second, so its tasks come first.
    for position, per_task in enumerate((parsed_first, parsed_second)):
        for task, metrics in per_task.items():
            by_metric = merged.setdefault(task, {})
            for name, score in metrics.items():
                by_metric.setdefault(name, {})[labels[position]] = score

    runs_meta = [{"label": labels[0]}, {"label": labels[1]}]
    return {"metadata": {"runs": runs_meta}, "tasks": merged}


# ----------------------------
# Build “metrics-first” table
#   Rows: tasks
#   Columns: metric per run (e.g., acc[run_1], acc[run_2], acc_norm[run_1], …)
#   Missing -> "–"
# ----------------------------
def build_metrics_first_table(consolidated: Mapping[str, Any]) -> pd.DataFrame:
    """Lay the consolidated results out as one row per task, one column per metric and run.

    Column headers have the form ``"<metric> [<run>]"``. Metrics listed in
    ``ALL_METRICS`` come first, in that order, each expanded to its qualified
    variants (sorted); a listed metric with no data still gets a placeholder
    column. Any other metrics that are present follow, sorted by name. Missing
    cells hold the en dash ``"–"``.

    Args:
        consolidated: Mapping with ``"metadata" -> "runs"`` (dicts carrying a
            ``"label"``) and ``"tasks"`` (``{task: {metric: {run: value}}}``).

    Returns:
        DataFrame indexed by task, with the index named ``"task"``.
    """
    run_labels = [entry["label"] for entry in consolidated.get("metadata", {}).get("runs", [])]
    task_map = consolidated.get("tasks", {})

    # Every metric name that occurs for at least one task.
    seen = {name for per_metric in task_map.values() for name in per_metric.keys()}

    # Whitelisted metrics first; each one expands to its variants or a placeholder.
    column_metrics = []
    for base in ALL_METRICS:
        qualified_prefix = base + "__"
        matches = sorted(
            name for name in seen if name == base or name.startswith(qualified_prefix)
        )
        column_metrics.extend(matches or [base])
    # Then whatever else was kept (e.g. stderr companions), alphabetically.
    column_metrics.extend(
        sorted(name for name in seen if name.split("__", 1)[0] not in ALL_METRICS)
    )

    table = {
        task: {
            f"{metric} [{run}]": per_metric.get(metric, {}).get(run, None)
            for metric in column_metrics
            for run in run_labels
        }
        for task, per_metric in task_map.items()
    }

    frame = pd.DataFrame.from_dict(table, orient="index")
    # Show missing values as an en dash.
    frame = frame.where(pd.notna(frame), "–")
    if table:
        # Pin the column order to that of the first row.
        first_row = next(iter(table.values()))
        frame = frame[list(first_row.keys())]
    frame.index.name = "task"
    return frame


# ----------------------------
# Render/save: HTML + Excel
# ----------------------------
def _fmt_cell(val: Any, metric_name: str) -> str:
    base = metric_name.split("__", 1)[0]
    if val == "–":
        return "–"
    try:
        v = float(val)
        if base in {"acc", "acc_norm", "exact_match"} and 0 <= v <= 1:
            return f"{v:.1%}"
        if base in {"word_perplexity","perplexity","byte_perplexity","bits_per_byte"}:
            return f"{v:.3f}"
        return f"{v:.4f}"
    except Exception:
        return str(val)

def render_html_table(df: pd.DataFrame, out_html: str | Path, title: str = "LM Harness Metrics") -> Path:
    """Write ``df`` as a styled, standalone HTML document and return its path.

    The output is a complete HTML page that declares its character encoding,
    so the non-ASCII dash characters used in cells and captions are decoded
    correctly when the file is opened directly in a browser.

    Args:
        df: Table whose column headers look like ``"<metric> [<run>]"``; the
            metric part selects the number format of the column.
        out_html: Destination file; missing parent directories are created.
        title: Caption shown above the table.

    Returns:
        ``out_html`` as a ``Path``.
    """
    out_html = Path(out_html)
    out_html.parent.mkdir(parents=True, exist_ok=True)

    def _formatter_for(metric_name: str):
        # Factory so each column's formatter captures its own metric name.
        return lambda v: _fmt_cell(v, metric_name)

    # One formatter per column, keyed on the metric part of the header.
    formatters = {col: _formatter_for(col.split(" [", 1)[0]) for col in df.columns}

    styler = (
        df.style
          .format(formatters)
          .set_caption(title)
          .set_table_styles([
              {"selector": "caption", "props": [("caption-side","top"), ("font-size","16px"), ("font-weight","600"), ("margin-bottom","8px")]},
              {"selector": "th", "props": [("position","sticky"), ("top","0"), ("background","#fafafa"), ("z-index","2")]},
              {"selector": "th.row_heading", "props": [("position","sticky"), ("left","0"), ("background","#fafafa"), ("z-index","3")]},
              {"selector": "td, th", "props": [("border","1px solid #ddd"), ("padding","6px 8px"), ("font-size","12px"), ("white-space","nowrap")]},
              # left-align the task index
              {"selector": "tbody th", "props": [("text-align","left")]},
          ])
          .set_properties(**{"text-align": "right"})
    )

    # doctype_html=True wraps the table in a full document with a charset <meta>
    # tag; the same encoding is used for the bytes written below.
    html = styler.to_html(doctype_html=True, encoding="utf-8")
    out_html.write_text(html, encoding="utf-8")
    return out_html

# def save_excel_table(df: pd.DataFrame, out_xlsx: str | Path, sheet_name: str = "metrics") -> Path:
#     out_xlsx = Path(out_xlsx)
#     out_xlsx.parent.mkdir(parents=True, exist_ok=True)

#     with pd.ExcelWriter(out_xlsx, engine="xlsxwriter") as writer:
#         df.to_excel(writer, sheet_name=sheet_name)
#         wb  = writer.book
#         ws  = writer.sheets[sheet_name]

#         # Formats
#         header_fmt = wb.add_format({"bold": True, "bg_color": "#EFEFEF", "border":1, "text_wrap": False})
#         text_fmt   = wb.add_format({"border":1})
#         num_fmt    = wb.add_format({"border":1})
#         ws.freeze_panes(1, 1)   # freeze header row and task column
#         ws.autofilter(0, 0, df.shape[0], df.shape[1])  # enable filters

#         # Column widths heuristic
#         ws.set_column(0, 0, 24, text_fmt)  # task index
#         for j, col in enumerate(df.columns, start=1):
#             width = max(10, min(28, int(max( len(str(col)), df[col].astype(str).str.len().quantile(0.9) ) + 2 )))
#             ws.set_column(j, j, width, num_fmt)

#         # Header styling
#         for j in range(df.shape[1] + 1):
#             ws.write(0, j, ws.table[0][j] if hasattr(ws, "table") else ws.cell(0, j), header_fmt)

#     return out_xlsx


# ----------------------------
# End-to-end per-model driver
# ----------------------------
def render_per_model_tables(
    input_dir: str | Path,
    output_root: str | Path,
    *,
    table_stem: str = "task_metrics",
    run_labels: Tuple[str, str] = ("run_1", "run_2"),
) -> Dict[str, Dict[str, Path]]:
    """
    For each <model> with <model>_1.json and <model>_2.json in ``input_dir``:
      - load and consolidate the two summaries
      - build the metrics-first table (rows=tasks; columns=every metric per run)
      - save it as HTML to <output_root>/<model>/<table_stem>.html

    Only HTML is written; no Excel file is produced.

    Args:
        input_dir: Directory containing the paired JSON result files.
        output_root: Root directory for outputs; one sub-folder per model.
        table_stem: File name (without extension) of the rendered table.
        run_labels: Column labels for run 1 and run 2.

    Returns: { model: {"html": Path} }; empty when no complete pair exists.
    """
    models = find_model_pairs(input_dir)
    output_root = Path(output_root)
    output_root.mkdir(parents=True, exist_ok=True)

    results: Dict[str, Dict[str, Path]] = {}
    if not models:
        print(f"[INFO] No complete model pairs found in {input_dir}.")
        return results

    print(f"[INFO] Found {len(models)} model(s).")
    # Sorted so models are processed and reported in a stable order.
    for model, pair in sorted(models.items()):
        print(f"\n[MODEL] {model}")
        s1, s2 = load_json(pair["1"]), load_json(pair["2"])
        consolidated = consolidate_two_summaries(s1, s2, labels=run_labels)
        df = build_metrics_first_table(consolidated)

        out_dir = output_root / model
        out_dir.mkdir(parents=True, exist_ok=True)
        html_path = out_dir / f"{table_stem}.html"

        render_html_table(df, html_path, title=f"{model} — LM Harness Metrics")

        print(f"  - HTML: {html_path}")
        results[model] = {"html": html_path}

    return results


In [17]:
# Render one HTML table per model that has both run files under ./results.
results = render_per_model_tables(
    "./results",                     # where <model>_1.json & <model>_2.json live
    "./outputs",                     # will create per-model folders
    run_labels=("run_1", "run_2"),
    table_stem="lmh_metrics",
)
results

[INFO] Found 1 model(s).

[MODEL] mha
  - HTML: outputs/mha/lmh_metrics.html


{'mha': {'html': PosixPath('outputs/mha/lmh_metrics.html')}}

In [12]:
!pip install pandas xlsxwriter

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.9
