# Convert multiple PBT logs → machine-readable CSV + Excel workbook

This notebook handles **multiple PBT log files** and **adds a `seed` column** inferred from the filename
(e.g., `bs+wd_seed_38042.log` → seed `38042`).

For each seed/log, we parse:
- per-epoch/per-member metrics
- PBT update events (hyperparameter changes + copy events)
- optional post-update hyperparameter lines
- population-level epoch summaries

Then we reconstruct **per-epoch PBT hyperparameters** (since they change over time), concatenate all seeds,
and export:
- `../Structured Outputs/PBT/pbt_bs_wd_parsed.csv`
- `../Structured Outputs/PBT/pbt_bs_wd_parsed.xlsx` (with leaderboards, plus per-seed top-200 sheets)


In [1]:
from pathlib import Path

# =============================================================================
# CONFIGURATION
# =============================================================================

# Relative directory that receives every generated artifact (CSV and XLSX).
CSV_REL_DIR = "../Structured Outputs/PBT/"

# Raw PBT logs to parse. The seed is recovered from each filename,
# e.g. bs+wd_seed_38042.log or bs+wd_seed_217401.log.
COMMON_PATH = Path("../Raw Outputs/PBT/Full Logs/")
INPUT_LOG_PATHS = [
    COMMON_PATH / log_name
    for log_name in ("bs+wd_seed_38042.log", "bs+wd_seed_217401.log")
]

# File names of the two exported artifacts (created inside CSV_REL_DIR).
OUTPUT_CSV_NAME = "pbt_bs_wd_parsed.csv"
OUTPUT_XLSX_NAME = "pbt_bs_wd_parsed.xlsx"

# Resolve the full output locations, then make sure the directory exists
# before any later cell tries to write into it.
OUTPUT_DIR = Path(CSV_REL_DIR)
OUTPUT_CSV_PATH = OUTPUT_DIR.joinpath(OUTPUT_CSV_NAME)
OUTPUT_XLSX_PATH = OUTPUT_DIR.joinpath(OUTPUT_XLSX_NAME)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Will read: {INPUT_LOG_PATHS}")
print(f"Will write CSV: {OUTPUT_CSV_PATH}")
print(f"Will write XLSX: {OUTPUT_XLSX_PATH}")


Will read: [PosixPath('../Raw Outputs/PBT/Full Logs/bs+wd_seed_38042.log'), PosixPath('../Raw Outputs/PBT/Full Logs/bs+wd_seed_217401.log')]
Will write CSV: ../Structured Outputs/PBT/pbt_bs_wd_parsed.csv
Will write XLSX: ../Structured Outputs/PBT/pbt_bs_wd_parsed.xlsx


## Parser implementation

We implement a function `parse_pbt_log(path)` which:
1. Infers `seed` from the filename (`seed_XXXXX`).
2. Parses:
   - `(epoch, member)` metrics
   - PBT hyperparameter change events (`old -> new`) per member and per update epoch
   - copy events
   - population summaries
3. Reconstructs per-epoch PBT hyperparameters (`pbt_lr`, `pbt_weight_decay`, `pbt_drop_path`, `pbt_batch_size`)
   because PBT mutates hyperparameters over time.


In [2]:
import re
import pandas as pd

# ----------------------------
# Regex patterns
# ----------------------------
# Shared numeric fragments. They are spliced into the patterns below with raw
# f-strings, so every compiled pattern is character-for-character the same as
# if the fragment had been typed out inline.
_SCI_NUM = r"[-+0-9.eE]+"  # digits, sign, dot and exponent marker (e.g. 3.8e-05)
_DEC_NUM = r"[-+0-9.]+"    # digits, sign and dot only (percentages, wall-clock seconds)

# --- Epoch / member headers ---
EPOCH_HEADER_RE = re.compile(r"Epoch\s+(?P<epoch>\d+)\s*/\s*(?P<epoch_total>\d+)")
TRAIN_MEMBER_RE = re.compile(r"---\s*Training Member\s+(?P<member>\d+)\s+\(Batch size:\s+(?P<bs>\d+)\)\s+---")

# --- Per-member training metrics ---
LR_SCHED_RE = re.compile(rf"LR changed during epoch:\s+(?P<start>{_SCI_NUM})\s+->\s+(?P<end>{_SCI_NUM})")
LOSS_RE = re.compile(rf"Loss:\s+(?P<loss>{_SCI_NUM})")
TRAIN_ACC_RE = re.compile(rf"Train Accuracy:\s+(?P<acc>{_DEC_NUM})%")
TEST_ACC_RE = re.compile(rf"Test Accuracy:\s+(?P<acc>{_DEC_NUM})%")

# --- Per-member timings ---
BUILT_RE = re.compile(rf"built data in\s+(?P<t>{_SCI_NUM})\s+seconds")
TRAIN_TIME_RE = re.compile(rf"total runtime to train this model was\s+(?P<t>{_SCI_NUM})\s+seconds")
EVAL_TIME_RE = re.compile(rf"evaluation in\s+(?P<t>{_SCI_NUM})\s+seconds")

# --- Population update events ---
POP_UPDATE_RE = re.compile(r"--- Population Update \(Epoch\s+(?P<epoch>\d+)\)\s+---")
MEMBER_CHANGE_RE = re.compile(
    rf"Member\s+(?P<member>\d+):\s+(?P<param>lr|weight_decay|drop_path|batch_size)"
    rf"\s+changed from\s+(?P<old>{_SCI_NUM})\s+to\s+(?P<new>{_SCI_NUM})"
)
MEMBER_COPIED_RE = re.compile(r"Member\s+(?P<member>\d+)\s+copied from\s+(?P<src>\d+)")

# --- Post-update hyperparameter snapshot line ---
POST_LINE_RE = re.compile(
    rf"LR=(?P<lr>{_SCI_NUM}),\s*WD=(?P<wd>{_SCI_NUM}),\s*DropPath=(?P<dp>{_SCI_NUM}).*Batch=(?P<bs>\d+)"
)

# --- Population-level epoch summary ---
SUMMARY_START_RE = re.compile(r"Epoch\s+(?P<epoch>\d+)\s+Summary:")
SUMMARY_TIME_RE = re.compile(rf"Time:\s+(?P<time>{_DEC_NUM})s.*Avg member:\s+(?P<avg>{_DEC_NUM})s")
POP_MEAN_ACC_RE = re.compile(rf"Population Mean Accuracy:\s+(?P<acc>{_DEC_NUM})%")
BEST_MEMBER_ACC_RE = re.compile(rf"Best Member Accuracy:\s+(?P<acc>{_DEC_NUM})%")
MEAN_BS_RE = re.compile(r"Mean Batch Size:\s+(?P<bs>\d+)")
MEAN_LR_RE = re.compile(rf"Mean Learning Rate:\s+(?P<lr>{_SCI_NUM})")
MEAN_WD_RE = re.compile(rf"Mean Weight Decay:\s+(?P<wd>{_SCI_NUM})")

# --- Seed embedded in the log filename (case-insensitive) ---
SEED_FROM_NAME_RE = re.compile(r"seed[_=](?P<seed>\d+)", re.IGNORECASE)

def parse_seed_from_filename(p: Path) -> int:
    """Return the integer seed embedded in a log file's name.

    Only the final path component (``p.name``) is searched, using
    ``SEED_FROM_NAME_RE``; the first match wins.

    Parameters
    ----------
    p : Path
        Path to a log file whose name contains ``seed_<digits>`` or
        ``seed=<digits>`` (matched case-insensitively).

    Returns
    -------
    int
        The digits captured after the ``seed`` marker.

    Raises
    ------
    ValueError
        If the filename contains no seed marker.
    """
    found = SEED_FROM_NAME_RE.search(p.name)
    if found is None:
        raise ValueError(f"Could not infer seed from filename: {p.name}")
    return int(found["seed"])

def _reconstruct_pbt_hparams(df_changes: pd.DataFrame, members: list, total_epochs, params: list) -> pd.DataFrame:
    """Rebuild a piecewise-constant per-epoch value for every (member, param).

    For each change event at update epoch ``u`` the ``old`` value back-fills
    epochs ``1..u`` that are still unknown and the ``new`` value overwrites
    epochs ``u+1..total_epochs``. Events are applied in ascending update-epoch
    order, keeping log order for ties, so the last logged change wins.

    Parameters
    ----------
    df_changes : DataFrame with columns ``update_epoch``, ``member``, ``param``,
        ``old``, ``new`` (may be empty).
    members : member ids to reconstruct.
    total_epochs : last epoch to reconstruct, or ``None`` when unknown.
    params : hyperparameter names; each yields a ``pbt_<param>`` column.

    Returns
    -------
    DataFrame with one row per (epoch, member) and one float ``pbt_<param>``
    column per entry of ``params`` (NaN where no event determines the value).
    Empty, but with the same columns, when nothing can be reconstructed.
    """
    pbt_cols = [f"pbt_{param}" for param in params]
    out_cols = ["epoch", "member"] + pbt_cols
    if total_epochs is None or total_epochs < 1 or not members or not len(df_changes):
        return pd.DataFrame(columns=out_cols)

    # A change line seen before any "Population Update" header carries no
    # update epoch; it cannot be placed on the timeline, so it is ignored here
    # (it is still reported in the raw "changes" table).
    events = df_changes.dropna(subset=["update_epoch"])

    rows = []
    for member in members:
        member_events = events[events["member"] == member]
        timelines = {}
        for param in params:
            values = [None] * (total_epochs + 1)  # index 0 unused; epochs are 1-based
            # Stable sort: several changes to one param within the same update
            # must be replayed in log order so the final "new" value wins.
            evts = member_events[member_events["param"] == param].sort_values("update_epoch", kind="mergesort")
            for _, e in evts.iterrows():
                # Clamp so an update epoch beyond the horizon cannot index past the list.
                u = min(int(e["update_epoch"]), total_epochs)
                old = float(e["old"])
                new = float(e["new"])
                for ep in range(1, u + 1):
                    if values[ep] is None:
                        values[ep] = old
                for ep in range(u + 1, total_epochs + 1):
                    values[ep] = new
            timelines[param] = values
        for ep in range(1, total_epochs + 1):
            row = {"epoch": ep, "member": member}
            for param in params:
                row[f"pbt_{param}"] = timelines[param][ep]
            rows.append(row)

    df_pbt = pd.DataFrame(rows, columns=out_cols)
    # Unknown entries are None; coerce so every pbt_* column is float with NaN gaps.
    df_pbt[pbt_cols] = df_pbt[pbt_cols].astype("float64")
    return df_pbt


def parse_pbt_log(path: Path) -> dict:
    """Parse one PBT training log into tidy DataFrames.

    The seed is inferred from the filename. The log is scanned line by line for
    per-member training blocks, population-update events (hyperparameter changes
    and copies), post-update hyperparameter lines and per-epoch summaries. The
    per-epoch PBT hyperparameters are then reconstructed from the change events
    and joined onto the per-member metrics.

    Parameters
    ----------
    path : Path
        Log file to read (decoded as UTF-8, undecodable bytes replaced).

    Returns
    -------
    dict
        ``seed``    - int seed parsed from the filename
        ``main``    - one row per completed (epoch, member) block, sorted by
                      epoch then member; always contains the preferred columns
                      (NaN where the log did not provide a value)
        ``summary`` - one row per "Epoch N Summary:" section
        ``changes`` - one row per hyperparameter change line
        ``copies``  - one row per "copied from" line
        ``post``    - one row per post-update ``LR=..., WD=...`` line

    Raises
    ------
    ValueError
        If the seed cannot be inferred from the filename.
    """
    seed = parse_seed_from_filename(path)
    text = path.read_text(encoding="utf-8", errors="replace")
    lines = text.splitlines()

    m = re.search(r"Total epochs:\s*(\d+)", text)
    total_epochs = int(m.group(1)) if m else None

    epoch_member_records = []
    epoch_summary_records = []
    hyperparam_change_records = []
    copy_records = []
    post_update_records = []

    current_epoch = None
    current_epoch_total = None
    current_update_epoch = None
    # NOTE(review): post-update lines are attributed to the member named by the
    # most recent change/copy line; confirm this matches the log layout.
    last_changed_member = None
    current_block = {}

    for line in lines:
        # "Epoch i/N" header; summary headers are handled further down.
        m = EPOCH_HEADER_RE.search(line)
        if m and "Summary" not in line:
            current_epoch = int(m.group("epoch"))
            current_epoch_total = int(m.group("epoch_total"))
            continue

        # Start of a member's training block.
        m = TRAIN_MEMBER_RE.search(line)
        if m:
            current_block = {
                "seed": seed,
                "epoch": current_epoch,
                "epoch_total": current_epoch_total,
                "member": int(m.group("member")),
                "train_batch_size": int(m.group("bs")),
            }
            continue

        # While a block is open every line belongs to it; the block is emitted
        # and closed when its "Test Accuracy" line is seen.
        if current_block:
            m = BUILT_RE.search(line)
            if m:
                current_block["data_build_s"] = float(m.group("t"))
            m = LR_SCHED_RE.search(line)
            if m:
                current_block["lr_sched_start"] = float(m.group("start"))
                current_block["lr_sched_end"] = float(m.group("end"))
            m = TRAIN_TIME_RE.search(line)
            if m:
                current_block["train_time_s"] = float(m.group("t"))
            m = EVAL_TIME_RE.search(line)
            if m:
                current_block["eval_time_s"] = float(m.group("t"))
            m = LOSS_RE.search(line)
            if m:
                current_block["loss"] = float(m.group("loss"))
            m = TRAIN_ACC_RE.search(line)
            if m:
                current_block["train_acc_pct"] = float(m.group("acc"))
            m = TEST_ACC_RE.search(line)
            if m:
                current_block["test_acc_pct"] = float(m.group("acc"))
                epoch_member_records.append(current_block.copy())
                current_block = {}
            continue

        m = POP_UPDATE_RE.search(line)
        if m:
            current_update_epoch = int(m.group("epoch"))
            continue

        m = MEMBER_CHANGE_RE.search(line)
        if m:
            hyperparam_change_records.append({
                "seed": seed, "update_epoch": current_update_epoch, "member": int(m.group("member")),
                "param": m.group("param"), "old": float(m.group("old")), "new": float(m.group("new")),
            })
            last_changed_member = int(m.group("member"))
            continue

        m = MEMBER_COPIED_RE.search(line)
        if m:
            copy_records.append({
                "seed": seed, "update_epoch": current_update_epoch,
                "member": int(m.group("member")), "copied_from": int(m.group("src")),
            })
            last_changed_member = int(m.group("member"))
            continue

        m = POST_LINE_RE.search(line)
        if m:
            post_update_records.append({
                "seed": seed, "update_epoch": current_update_epoch, "member": last_changed_member,
                "lr": float(m.group("lr")), "weight_decay": float(m.group("wd")),
                "drop_path": float(m.group("dp")), "batch_size": int(m.group("bs")),
            })
            continue

        m = SUMMARY_START_RE.search(line)
        if m:
            epoch_summary_records.append({"seed": seed, "epoch": int(m.group("epoch"))})
            continue

        # Any other line may carry a field of the most recently opened summary.
        if epoch_summary_records:
            cur = epoch_summary_records[-1]
            m = SUMMARY_TIME_RE.search(line)
            if m:
                cur["epoch_time_s"] = float(m.group("time"))
                cur["avg_member_time_s"] = float(m.group("avg"))
            m = POP_MEAN_ACC_RE.search(line)
            if m:
                cur["pop_mean_acc_pct"] = float(m.group("acc"))
            m = BEST_MEMBER_ACC_RE.search(line)
            if m:
                cur["best_member_acc_pct"] = float(m.group("acc"))
            m = MEAN_BS_RE.search(line)
            if m:
                cur["mean_batch_size"] = int(m.group("bs"))
            m = MEAN_LR_RE.search(line)
            if m:
                cur["mean_lr"] = float(m.group("lr"))
            m = MEAN_WD_RE.search(line)
            if m:
                cur["mean_weight_decay"] = float(m.group("wd"))

    df_epochs = pd.DataFrame(epoch_member_records)
    df_changes = pd.DataFrame(hyperparam_change_records)
    df_copies = pd.DataFrame(copy_records)
    df_post = pd.DataFrame(post_update_records)
    df_summary = pd.DataFrame(epoch_summary_records)

    # Fall back to the largest observed epoch when the log has no "Total epochs:" line.
    if total_epochs is None and len(df_epochs):
        max_epoch = pd.to_numeric(df_epochs["epoch"], errors="coerce").max()
        if pd.notna(max_epoch):
            total_epochs = int(max_epoch)

    # Reconstruct per-epoch hyperparameters (piecewise-constant, using change events)
    members = sorted(df_epochs["member"].unique().tolist()) if len(df_epochs) else []
    params = ["lr", "weight_decay", "drop_path", "batch_size"]

    df_pbt = _reconstruct_pbt_hparams(df_changes, members, total_epochs, params)
    if len(df_pbt):
        df_main = df_epochs.merge(df_pbt, on=["epoch", "member"], how="left")
    else:
        df_main = df_epochs.copy()

    # Prefer batch size used during training if mismatch
    if "pbt_batch_size" in df_main.columns and "train_batch_size" in df_main.columns:
        mismatch = df_main["pbt_batch_size"].notna() & (df_main["train_batch_size"] != df_main["pbt_batch_size"])
        if mismatch.any():
            df_main.loc[mismatch, "pbt_batch_size"] = df_main.loc[mismatch, "train_batch_size"]

    preferred_cols = [
        "seed", "epoch", "epoch_total", "member",
        "train_batch_size",
        "pbt_lr", "pbt_weight_decay", "pbt_drop_path", "pbt_batch_size",
        "lr_sched_start", "lr_sched_end",
        "loss", "train_acc_pct", "test_acc_pct",
        "data_build_s", "train_time_s", "eval_time_s",
    ]
    extra_cols = [c for c in df_main.columns if c not in preferred_cols]
    # reindex (not plain selection) so a column the log never produced shows up
    # as NaN instead of raising KeyError; keeps the schema identical across seeds.
    df_main = (
        df_main.reindex(columns=preferred_cols + extra_cols)
        .sort_values(["epoch", "member"])
        .reset_index(drop=True)
    )

    return {"seed": seed, "main": df_main, "summary": df_summary, "changes": df_changes, "copies": df_copies, "post": df_post}


## Run parser for all logs, concatenate, and export

Outputs:
- CSV: `../Structured Outputs/PBT/pbt_bs_wd_parsed.csv`
- XLSX: `../Structured Outputs/PBT/pbt_bs_wd_parsed.xlsx` with:
  - `epoch_member_metrics` (all seeds)
  - `epoch_summary`, `hyperparam_changes`, `copy_events`, `post_update_hparams` (all seeds, with seed column)
  - `leaderboard_top200` (combined)
  - `leaderboard_per_member` (combined)
  - `top200_<seed>` (per-seed top 200 snapshots)


In [3]:
# Parse each log; order by seed so rows and per-seed sheets come out deterministically.
parsed = [parse_pbt_log(p) for p in INPUT_LOG_PATHS]
parsed = sorted(parsed, key=lambda d: d["seed"])
seed_values = [d["seed"] for d in parsed]


def _concat_nonempty(frames) -> pd.DataFrame:
    """Stack the non-empty frames; return an empty DataFrame when there are none.

    ``pd.concat`` raises ``ValueError`` on an empty list, which happens whenever
    no log produced rows for a table (e.g. no copy events in any seed).
    """
    kept = [frame for frame in frames if len(frame)]
    return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


df_all = pd.concat([d["main"] for d in parsed], ignore_index=True) if parsed else pd.DataFrame()
df_summary_all = _concat_nonempty(d["summary"] for d in parsed)
df_changes_all = _concat_nonempty(d["changes"] for d in parsed)
df_copies_all = _concat_nonempty(d["copies"] for d in parsed)
df_post_all = _concat_nonempty(d["post"] for d in parsed)

# Write CSV
df_all.to_csv(OUTPUT_CSV_PATH, index=False)

def leaderboard_top(df_in: pd.DataFrame, topn: int = 200) -> pd.DataFrame:
    """Return the ``topn`` best snapshots ranked by test accuracy.

    Rows without a ``test_acc_pct`` value are dropped. The rest are ordered by
    accuracy (highest first), ties broken by the earlier epoch, and given a
    1-based ``rank`` column in first position.

    Parameters
    ----------
    df_in : DataFrame with at least ``test_acc_pct`` and ``epoch`` columns.
    topn : maximum number of rows to return.

    Returns
    -------
    DataFrame with a leading ``rank`` column and at most ``topn`` rows. An empty
    DataFrame is returned when ``df_in`` has no rows or no ``test_acc_pct``
    column (e.g. nothing was parsed), instead of raising ``KeyError``.
    """
    if not len(df_in) or "test_acc_pct" not in df_in.columns:
        return pd.DataFrame()
    snap = df_in.dropna(subset=["test_acc_pct"])
    snap = snap.sort_values(["test_acc_pct", "epoch"], ascending=[False, True]).reset_index(drop=True)
    snap.insert(0, "rank", snap.index + 1)
    return snap.head(topn)

def leaderboard_per_member(df_in: pd.DataFrame) -> pd.DataFrame:
    """Pick each (seed, member) pair's single best snapshot.

    Rows lacking ``test_acc_pct`` are discarded. Within every (seed, member)
    group the row with the highest test accuracy is kept, preferring the
    earliest epoch on ties. Output is ordered by seed, then member.

    Parameters
    ----------
    df_in : DataFrame with ``seed``, ``member``, ``epoch`` and ``test_acc_pct``.

    Returns
    -------
    DataFrame with one row per (seed, member) and a fresh 0-based index; an
    empty DataFrame when ``df_in`` has no rows.
    """
    if len(df_in) == 0:
        return pd.DataFrame()

    scored = df_in.dropna(subset=["test_acc_pct"])
    ranked = scored.sort_values(
        by=["seed", "member", "test_acc_pct", "epoch"],
        ascending=[True, True, False, True],
    )
    # After the sort, the first row of each group is that member's best epoch.
    best_rows = ranked.groupby(["seed", "member"], as_index=False).head(1)
    return best_rows.reset_index(drop=True)

# Combined leaderboards (all seeds together); written as workbook sheets below.
lb_combined = leaderboard_top(df_all, topn=200)
lb_per_member = leaderboard_per_member(df_all)

# Sheets that are only written when they have at least one row, in workbook order.
optional_sheets = [
    ("epoch_summary", df_summary_all),
    ("hyperparam_changes", df_changes_all),
    ("copy_events", df_copies_all),
    ("post_update_hparams", df_post_all),
]

# Write XLSX
with pd.ExcelWriter(OUTPUT_XLSX_PATH, engine="openpyxl") as writer:
    df_all.to_excel(writer, sheet_name="epoch_member_metrics", index=False)
    for optional_name, optional_frame in optional_sheets:
        if len(optional_frame):
            optional_frame.to_excel(writer, sheet_name=optional_name, index=False)

    lb_combined.to_excel(writer, sheet_name="leaderboard_top200", index=False)
    lb_per_member.to_excel(writer, sheet_name="leaderboard_per_member", index=False)

    # One top-200 sheet per seed; the sheet name is truncated to 31 characters.
    for seed in seed_values:
        seed_rows = df_all[df_all["seed"] == seed]
        seed_sheet = f"top200_{seed}"[:31]
        leaderboard_top(seed_rows, topn=200).to_excel(writer, sheet_name=seed_sheet, index=False)

print("Seeds parsed:", seed_values)
print("Wrote CSV:", OUTPUT_CSV_PATH.resolve())
print("Wrote XLSX:", OUTPUT_XLSX_PATH.resolve())
df_all.head()


Seeds parsed: [38042, 217401]
Wrote CSV: /Users/etaashpatel/Documents/Final Project/Structured Outputs/PBT/pbt_bs_wd_parsed.csv
Wrote XLSX: /Users/etaashpatel/Documents/Final Project/Structured Outputs/PBT/pbt_bs_wd_parsed.xlsx


Unnamed: 0,seed,epoch,epoch_total,member,train_batch_size,pbt_lr,pbt_weight_decay,pbt_drop_path,pbt_batch_size,lr_sched_start,lr_sched_end,loss,train_acc_pct,test_acc_pct,data_build_s,train_time_s,eval_time_s
0,38042,1,60,0,64,0.000189,0.196476,0.123035,64.0,4.84e-08,3.8e-05,1.8867,29.42,36.69,5.395643,61.999089,11.392097
1,38042,1,60,1,128,0.000261,0.26756,0.119554,128.0,1.34e-07,5.2e-05,1.8981,29.23,35.01,1.661272,63.004851,13.567882
2,38042,2,60,0,64,0.000189,0.196476,0.123035,64.0,3.79e-05,7.6e-05,1.6182,40.38,43.57,1.626179,68.211169,13.086053
3,38042,2,60,1,128,0.000261,0.26756,0.119554,128.0,5.23e-05,0.000105,1.6295,40.16,44.72,1.675981,64.784817,13.043758
4,38042,3,60,0,64,0.000189,0.196476,0.123035,64.0,7.58e-05,0.000114,1.4237,47.84,49.85,1.730894,68.868857,13.099745
