In [2]:

import json, re, gzip, glob
from pathlib import Path
from typing import Iterable, Tuple, Union


# Matches "key=value" or "key: value"
_KV_SEP = re.compile(r'^\s*([^:=\s]+)\s*[:=]\s*(.+?)\s*$')
# Matches "key<no-separator>number", e.g., "eval/accuracy0.5605", "step200"
_KV_CAT = re.compile(r'^\s*([^\d\-\.+]+)\s*([-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)\s*$')

def _open(path: Union[str, Path]):
    p = str(path)
    return gzip.open(p, "rt", encoding="utf-8") if p.endswith(".gz") else open(p, "r", encoding="utf-8")

def _try_float(x):
    try:
        return float(x)
    except Exception:
        return x

def _parse_line(line: str):
    """Return a dict for a single line (either a full JSON object or a single key/value),
    or None if not parseable by known patterns.
    """
    s = line.strip()
    if not s:
        return None

    # Try JSON per-line
    try:
        obj = json.loads(s)
        if isinstance(obj, dict):
            return obj  # entire record
    except Exception:
        pass

    m = _KV_SEP.match(s)
    if m:
        k, v = m.group(1), _try_float(m.group(2))
        return {k: v}

    m = _KV_CAT.match(s)
    if m:
        k, v = m.group(1), _try_float(m.group(2))
        return {k: v}

    return None

def load_records(path: Union[str, Path]) -> Iterable[dict]:
    """Yield dict records. Handles JSONL and flat key/value lines grouped by step/blank lines."""
    rec = {}
    with _open(path) as f:
        for raw in f:
            line = raw.strip()
            if not line:
                if rec:
                    yield rec
                    rec = {}
                continue

            parsed = _parse_line(line)
            if parsed is None:
                continue

            # Full JSON object line: flush current, yield full, continue
            if len(parsed) > 1:
                if rec:
                    yield rec; rec = {}
                yield parsed
                continue

            # Single k/v
            k, v = next(iter(parsed.items()))
            if k == "step" and rec:
                # new record begins
                yield rec
                rec = {}
            rec[k] = v

    if rec:
        yield rec

def best_eval_accuracy(path: Union[str, Path], field: str = "eval/accuracy"):
    """Return (accuracy, step, record) for the best metric in a single file, or None if not found."""
    best = None
    for rec in load_records(path):
        acc = rec.get(field, None)
        if isinstance(acc, (int, float)):
            step = rec.get("step")
            if (best is None) or (acc > best[0]):
                best = (acc, step, rec)
    return best

def avg_eval_accuracy(path: Union[str, Path], field: str = "eval/accuracy"):
    """Return (average_accuracy, count, records) for all numeric entries in a single file, or None if none found."""
    values = []
    records = []

    for rec in load_records(path):
        acc = rec.get(field, None)
        if isinstance(acc, (int, float)):
            values.append(acc)
            records.append(rec)

    if not values:
        return None

    avg = sum(values) / len(values)
    return avg, len(values), records

def leaderboard(paths, field: str = "eval/accuracy", min_total: int = 0):
    """For multiple files/globs, return a sorted list of (acc, step, file) by descending acc."""
    files = []
    for p in paths:
        matches = glob.glob(p)
        if matches:
            files.extend(matches)
    files = sorted(set(files))
    rows = []
    for fp in files:
        best = best_eval_accuracy(fp, field=field)
        if best is not None:
            acc, step, rec = best
            rows.append((acc, step, fp))
    rows.sort(key=lambda r: r[0], reverse=True)
    return rows

metrics_file = "mmlu/eval_metrics.jsonl"   # <-- change this
metric_field = "eval/accuracy"

best = best_eval_accuracy(metrics_file, field=metric_field)
if best is None:
    print(f"No numeric '{metric_field}' found in {metrics_file}")
else:
    acc, step, rec = best
    step_str = f"step={int(step)}" if isinstance(step, (int, float)) else f"step={step}" if step is not None else "step=?"
    print(f"Best {metric_field}: {acc:.6f} ({step_str})\nFrom: {metrics_file}")


Best eval/accuracy: 0.646500 (step=1750)
From: mmlu/eval_metrics.jsonl


In [4]:
import json
from pathlib import Path
from typing import List, Dict, Any, Tuple

EXPECTED_STEPS = set(range(50, 2500 + 1, 50))  # 50,100,...,2000

def load_records(jsonl_path: str) -> List[Dict[str, Any]]:
    """Load JSONL file into a list of dicts; ignore malformed lines."""
    recs = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
                if isinstance(rec, dict) and "step" in rec:
                    rec["_row"] = i  # keep original order index for debugging
                    recs.append(rec)
            except json.JSONDecodeError:
                # Skip malformed lines silently; or log if you prefer
                pass
    return recs

def split_into_runs(records: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    """
    Split a chronologically ordered list of eval records into runs.
    A new run starts when:
      - step decreases vs previous step, or
      - step == 50 and current run already has items (common restart signature).
    """
    runs: List[List[Dict[str, Any]]] = []
    current: List[Dict[str, Any]] = []
    prev_step = None

    for rec in records:
        step = rec.get("step")
        if prev_step is None:
            current = [rec]
        else:
            # start of a new run?
            if (isinstance(step, (int, float)) and isinstance(prev_step, (int, float)) and step < prev_step) \
               or (step == 50 and current):
                # close old
                if current:
                    runs.append(current)
                current = [rec]
            else:
                current.append(rec)
        prev_step = step

    if current:
        runs.append(current)
    return runs

def avg_eval_accuracy(run: List[Dict[str, Any]], field: str = "eval/accuracy"):
    """Return (average_accuracy, count, records) for all numeric entries in a single file, or None if none found."""
    values = []
    

    for rec in run:
        acc = rec.get(field)
        if isinstance(acc, (int, float)):
            values.append(acc)
    
    if not values:
        return None

    avg = sum(values) / len(values)
    return avg

def best_eval_accuracy(run: List[Dict[str, Any]], field: str = "eval/accuracy") -> Tuple[float, Any, Dict[str, Any]]:
    """Return (best_acc, step_of_best, record_of_best) for a run."""
    best = (-float("inf"), None, None)
    for rec in run:
        acc = rec.get(field)
        if isinstance(acc, (int, float)) and acc > best[0]:
            best = (acc, rec.get("step"), rec)
    return best  # (acc, step, full_record)

def summarize_runs(runs: List[List[Dict[str, Any]]]) -> None:
    print(f"Detected {len(runs)} runs.\n")
    for i, run in enumerate(runs, 1):
        steps = [r.get("step") for r in run if isinstance(r.get("step"), (int, float))]
        step_min = min(steps) if steps else None
        step_max = max(steps) if steps else None
        # how many expected step slots present
        present = sum(1 for s in steps if s in EXPECTED_STEPS)
        best_acc, best_step, _ = best_eval_accuracy(run)
        avg = avg_eval_accuracy(run)
        print(f"Run {i:02d}: {len(run)} records | steps {step_min}→{step_max} "
              f"| expected-steps-present {present}/41 | best acc={best_acc:.6f} @ step={best_step}"
              f"| avg acc {avg:.4f}")

def save_runs(runs: List[List[Dict[str, Any]]], out_dir: str) -> None:
    """
    Save each run to its own JSONL file: run_01.jsonl, run_02.jsonl, ...
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    for i, run in enumerate(runs, 1):
        p = out / f"run_{i:02d}.jsonl"
        with p.open("w", encoding="utf-8") as f:
            for rec in run:
                # drop helper keys
                rec_to_write = {k: v for k, v in rec.items() if not k.startswith("_")}
                f.write(json.dumps(rec_to_write, ensure_ascii=False) + "\n")
    print(f"Saved {len(runs)} run files to {out.resolve()}")

# ---------- usage ----------
# If your data is in a file, set this:
jsonl_path = "llama_mmlu_in/eval_metrics.jsonl"
# If it's in-memory text, write it to a temp file first.

records = load_records(jsonl_path)
runs = split_into_runs(records)
summarize_runs(runs)
save_runs(runs, out_dir="isolated_runs")


Detected 6 runs.

Run 01: 4 records | steps 500→2000 | expected-steps-present 4/41 | best acc=0.690071 @ step=2000| avg acc 0.6783
Run 02: 4 records | steps 500→2000 | expected-steps-present 4/41 | best acc=0.696302 @ step=2000| avg acc 0.6837
Run 03: 4 records | steps 500→2000 | expected-steps-present 4/41 | best acc=0.682592 @ step=1500| avg acc 0.6763
Run 04: 4 records | steps 500→2000 | expected-steps-present 4/41 | best acc=0.659327 @ step=1500| avg acc 0.6515
Run 05: 4 records | steps 500→2000 | expected-steps-present 4/41 | best acc=0.631076 @ step=2000| avg acc 0.6272
Run 06: 4 records | steps 500→2000 | expected-steps-present 4/41 | best acc=0.579560 @ step=1500| avg acc 0.5735
Saved 6 run files to /workspace/AUX_DPO/isolated_runs
