# Convert PBT WD-ablation PDF log → machine-readable CSV + Excel

This notebook parses a **PBT training log saved as a PDF** and exports:
- `../Structured Outputs/PBT/pbt_wd_ablation_parsed.csv`
- `../Structured Outputs/PBT/pbt_wd_ablation_parsed.xlsx`

Parsing rules (conservative / robust to noisy PDF text):
- **LR schedule** is parsed *only* from lines like `LR changed during epoch: a -> b`.
- **Initial hyperparameters** per member are parsed from the 5-line block under  
  `Hyperparameteres for model k at epoch 1`.
- **Hyperparameters are updated only** by the PBT update lines like  
  `Member k: weight_decay changed from ... to ...` under `--- Population Update (Epoch u) ---`.
- Updates are applied starting at epoch `u+1`, and hyperparameters are **carried forward** (held constant) between updates.

> Note: This PDF prints `Hyperparameteres for model ...` at every epoch, but we intentionally **do not rely**
> on those later prints (they can be noisy in PDF text). We reconstruct the piecewise-constant trajectory from
> the explicit update lines + initial settings.


In [1]:
from pathlib import Path

# =============================================================================
# CONFIGURATION
# =============================================================================

# Relative directory where the CSV and Excel outputs are written.
CSV_REL_DIR = "../Structured Outputs/PBT/"

# Input PDF (seed inferred from filename like 'seed_38042')
INPUT_PDF_PATH = Path("../Raw Outputs/PBT/Full Logs/pbt_wd_ablation_experiment_output_seed_38042.pdf")

# Outputs (written inside CSV_REL_DIR)
OUTPUT_CSV_NAME = "pbt_wd_ablation_parsed.csv"
OUTPUT_XLSX_NAME = "pbt_wd_ablation_parsed.xlsx"

# Create the output directory up front so the export cell does not fail on a missing folder.
OUTPUT_DIR = Path(CSV_REL_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

OUTPUT_CSV_PATH = OUTPUT_DIR / OUTPUT_CSV_NAME
OUTPUT_XLSX_PATH = OUTPUT_DIR / OUTPUT_XLSX_NAME

# Echo the resolved paths so the run is self-documenting in the cell output.
print("Input:", INPUT_PDF_PATH)
print("CSV:", OUTPUT_CSV_PATH)
print("XLSX:", OUTPUT_XLSX_PATH)


Input: ../Raw Outputs/PBT/Full Logs/pbt_wd_ablation_experiment_output_seed_38042.pdf
CSV: ../Structured Outputs/PBT/pbt_wd_ablation_parsed.csv
XLSX: ../Structured Outputs/PBT/pbt_wd_ablation_parsed.xlsx


## 1) PDF text extraction (PyMuPDF preferred, pdfplumber fallback)

In [2]:
import re
import pandas as pd

def extract_pdf_text(path: Path) -> str:
    """Extract the text of every page of a PDF, joined with newlines.

    PyMuPDF (``fitz``) is tried first. If it cannot be imported, or raises
    while opening or reading the file, the function falls back to pdfplumber.

    Args:
        path: Location of the PDF file.

    Returns:
        The per-page texts joined by a newline character. In the pdfplumber
        path, a page whose ``extract_text()`` result is falsy contributes an
        empty string.

    Raises:
        Any exception raised by the pdfplumber fallback (including
        ``ImportError`` when pdfplumber is not installed). Errors from the
        PyMuPDF attempt are not propagated; they trigger the fallback.
    """
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(str(path))
        try:
            parts = [page.get_text("text") for page in doc]
        finally:
            # Release the document even when a page fails to extract, so the
            # fallback below does not run with a leaked open handle.
            doc.close()
        return "\n".join(parts)
    except Exception:
        # Deliberate best-effort: any PyMuPDF problem falls back to pdfplumber.
        import pdfplumber
        with pdfplumber.open(str(path)) as pdf:
            parts = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(parts)


## 2) Regex patterns (conservative)

In [3]:
# Compiled patterns used to recognise individual lines of the extracted PDF text.
# Numeric fields mostly use the permissive class [-+0-9.eE]+, which accepts
# scientific notation but does not by itself validate that the text is a number.

# Seed embedded in a name: "seed_" or "seed=" followed by digits (case-insensitive).
SEED_RE = re.compile(r"seed[_=](\d+)", re.IGNORECASE)

# "Epoch <epoch> / <epoch_total>" header and the "--- Training Member k (Batch size: n) ---" banner.
EPOCH_HDR = re.compile(r"\bEpoch\s+(?P<epoch>\d+)\s*/\s*(?P<epoch_total>\d+)\b")
TRAIN_MEMBER = re.compile(r"---\s*Training\s+Member\s+(?P<member>\d+)\s*\(Batch\s+size:\s*(?P<bs>\d+)\)\s*---")

# LR schedule ONLY from this pattern ("LR changed during epoch: <start> -> <end>"):
LR_CHANGED = re.compile(r"LR\s+changed\s+during\s+epoch:\s*(?P<start>[-+0-9.eE]+)\s*->\s*(?P<end>[-+0-9.eE]+)")

# Per-member metrics; the accuracy patterns require a trailing "%".
LOSS = re.compile(r"Loss:\s*(?P<loss>[-+0-9.eE]+)")
TRAIN_ACC = re.compile(r"Train\s+Accuracy:\s*(?P<acc>[-+0-9.]+)\s*%")
TEST_ACC = re.compile(r"Test\s+Accuracy:\s*(?P<acc>[-+0-9.]+)\s*%")

# Timing lines; each captures a number followed by "seconds".
BUILT = re.compile(r"built\s+data\s+in\s+(?P<t>[-+0-9.eE]+)\s+seconds")
TRAIN_TIME = re.compile(r"total\s+runtime\s+to\s+train\s+this\s+model\s+was\s+(?P<t>[-+0-9.eE]+)\s+seconds")
EVAL_TIME = re.compile(r"evaluation\s+in\s+(?P<t>[-+0-9.eE]+)\s+seconds")

# Run-level settings: "Total epochs: N" and "Exploit interval: N epochs".
TOTAL_EPOCHS = re.compile(r"Total\s+epochs:\s*(?P<t>\d+)")
EXPLOIT_INTERVAL = re.compile(r"Exploit\s+interval:\s*(?P<t>\d+)\s+epochs")

# Hyperparameter block: the header (spelled "Hyperparameteres", as matched here)
# and a "key: value" line that must span the whole line (anchored with ^ and $).
HYPER_HDR = re.compile(r"Hyperparameteres\s+for\s+model\s+(?P<member>\d+)\s+at\s+epoch\s+(?P<epoch>\d+)", re.IGNORECASE)
HP_LINE = re.compile(r"^(?P<k>lr|weight_decay|drop_path|warmup_epochs|batch_size)\s*:\s*(?P<v>[-+0-9.eE]+)\s*$", re.IGNORECASE)

# Population-update section: its banner, "Member k: <param> changed from a to b"
# lines (warmup_epochs is not among the accepted params), and "Member k copied from j" lines.
POP_UPDATE = re.compile(r"---\s*Population\s+Update\s+\(Epoch\s+(?P<epoch>\d+)\)\s*---")
CHANGE_LINE = re.compile(
    r"Member\s+(?P<member>\d+):\s*(?P<param>lr|weight_decay|drop_path|batch_size)\s+changed\s+from\s+(?P<old>[-+0-9.eE]+)\s+to\s+(?P<new>[-+0-9.eE]+)",
    re.IGNORECASE,
)
COPIED_LINE = re.compile(r"Member\s+(?P<member>\d+)\s+copied\s+from\s+(?P<src>\d+)", re.IGNORECASE)

# One-line hyperparameter dump "LR=..., WD=..., DropPath=..., Warmup=N epochs, Batch=N";
# the pattern captures no member id.
POST_LINE = re.compile(
    r"LR=(?P<lr>[-+0-9.eE]+),\s*WD=(?P<wd>[-+0-9.eE]+),\s*DropPath=(?P<dp>[-+0-9.eE]+),\s*Warmup=(?P<warm>\d+)\s*epochs,\s*Batch=(?P<bs>\d+)",
    re.IGNORECASE,
)

# "Epoch N Summary:" header and the aggregate lines that are read after it.
SUMMARY_HDR = re.compile(r"Epoch\s+(?P<epoch>\d+)\s+Summary:", re.IGNORECASE)
SUMMARY_TIME = re.compile(r"Time:\s*(?P<time>[-+0-9.]+)s\s*\(Avg\s+member:\s*(?P<avg>[-+0-9.]+)s\)")
POP_MEAN_ACC = re.compile(r"Population\s+Mean\s+Accuracy:\s*(?P<acc>[-+0-9.]+)\s*%")
BEST_MEMBER_ACC = re.compile(r"Best\s+Member\s+Accuracy:\s*(?P<acc>[-+0-9.]+)\s*%")
MEAN_BS = re.compile(r"Mean\s+Batch\s+Size:\s*(?P<bs>\d+)")
MEAN_LR = re.compile(r"Mean\s+Learning\s+Rate:\s*(?P<lr>[-+0-9.eE]+)")
MEAN_WD = re.compile(r"Mean\s+Weight\s+Decay:\s*(?P<wd>[-+0-9.eE]+)")


## 3) Parse the PDF into tables

In [4]:
def parse_seed_from_name(p: Path) -> int:
    """Return the integer seed embedded in the file name of ``p``.

    The seed is the first capture group of the first ``SEED_RE`` match found
    in ``p.name`` (the final path component only, not the directories).

    Raises:
        ValueError: If ``SEED_RE`` does not match anywhere in the file name.
    """
    seed_match = SEED_RE.search(p.name)
    if seed_match is None:
        raise ValueError(f"Could not infer seed from filename: {p.name}")
    return int(seed_match[1])

def _parse_initial_hyperparams(lines: list) -> dict:
    """Return ``{member: {param: value}}`` from the first usable hyperparameter block per member.

    A block is the set of ``HP_LINE`` matches among the (at most) 11 lines that
    follow a ``HYPER_HDR`` line. Scanning stops as soon as all five parameters
    were seen, or when the next ``HYPER_HDR`` line begins, so an incomplete
    block can never be overwritten with the following member's values.
    ``warmup_epochs`` and ``batch_size`` are stored as ints, the rest as floats.
    """
    required = {"lr", "weight_decay", "drop_path", "warmup_epochs", "batch_size"}
    initial_hp = {}
    for idx, line in enumerate(lines):
        header = HYPER_HDR.search(line)
        if not header:
            continue
        member = int(header.group("member"))
        if member in initial_hp:
            # Only the first non-empty block per member is used; later prints are ignored.
            continue
        hp = {}
        for candidate in lines[idx + 1: idx + 12]:
            if HYPER_HDR.search(candidate):
                # The next block started: the current one is incomplete, keep what we have.
                break
            kv = HP_LINE.match(candidate)
            if kv:
                key = kv.group("k").lower()
                value = float(kv.group("v"))
                if key in ("warmup_epochs", "batch_size"):
                    value = int(round(value))
                hp[key] = value
            if required.issubset(hp):
                break
        if hp:
            initial_hp[member] = hp
    return initial_hp


def parse_pbt_pdf(path: Path) -> tuple:
    """Parse one PBT training-log PDF into pandas tables.

    The extracted text is split into stripped, non-empty lines and scanned once
    with a small state machine:

    * an ``EPOCH_HDR`` line (not containing "Summary") sets the current epoch;
    * a ``TRAIN_MEMBER`` line opens a member record, which collects timing, LR
      schedule, loss and accuracies and is closed (and stored) by the first
      ``TEST_ACC`` line; a record that never sees a test accuracy is dropped;
    * outside a member record, population-update, change, copy, post-update and
      summary lines are collected into their own event lists.

    Hyperparameters per (member, epoch) are then reconstructed from the initial
    block of each member plus the ``CHANGE_LINE`` events: a change logged under
    update epoch ``u`` takes effect for epochs ``> u``.

    NOTE(review): copy events are recorded but not applied to the reconstructed
    trajectory; this presumes every effective hyperparameter change is also
    printed as a "changed from ... to ..." line -- confirm against the logger.

    Args:
        path: PDF file; its name must contain a seed recognised by
            ``parse_seed_from_name``.

    Returns:
        A 5-tuple ``(df_main, df_changes, df_copies, df_post, df_summary)``.
        ``df_main`` holds the per-epoch/member metrics left-joined with the
        reconstructed ``pbt_*`` hyperparameter columns (or only the
        hyperparameter rows when no metrics were parsed).

    Raises:
        ValueError: If no seed can be inferred from the file name.
    """
    seed = parse_seed_from_name(path)
    text = extract_pdf_text(path)
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    total_match = TOTAL_EPOCHS.search(text)
    total_epochs = int(total_match.group("t")) if total_match else None

    # (1) Initial hyperparams: take the first "Hyperparameteres for model k ..." block per member.
    initial_hp = _parse_initial_hyperparams(lines)

    # (2) Per-epoch metrics + update events
    records = []
    change_events, copy_events, post_events = [], [], []
    summaries = []

    current_epoch = None
    current_epoch_total = None
    current_block = None
    current_update_epoch = None
    in_summary = False

    for ln in lines:
        m = EPOCH_HDR.search(ln)
        if m and "Summary" not in ln:
            current_epoch = int(m.group("epoch"))
            current_epoch_total = int(m.group("epoch_total"))
            in_summary = False

        m = TRAIN_MEMBER.search(ln)
        if m:
            current_block = {
                "seed": seed,
                "epoch": current_epoch,
                "epoch_total": current_epoch_total,
                "member": int(m.group("member")),
                "train_batch_size": int(m.group("bs")),
            }
            continue

        if current_block is not None:
            # Inside a member record every line is consumed here until Test Accuracy closes it.
            m = BUILT.search(ln)
            if m:
                current_block["data_build_s"] = float(m.group("t"))
            m = LR_CHANGED.search(ln)
            if m:
                # LR schedule ONLY from this line type
                current_block["lr_sched_start"] = float(m.group("start"))
                current_block["lr_sched_end"] = float(m.group("end"))
            m = TRAIN_TIME.search(ln)
            if m:
                current_block["train_time_s"] = float(m.group("t"))
            m = EVAL_TIME.search(ln)
            if m:
                current_block["eval_time_s"] = float(m.group("t"))
            m = LOSS.search(ln)
            if m:
                current_block["loss"] = float(m.group("loss"))
            m = TRAIN_ACC.search(ln)
            if m:
                current_block["train_acc_pct"] = float(m.group("acc"))
            m = TEST_ACC.search(ln)
            if m:
                current_block["test_acc_pct"] = float(m.group("acc"))
                records.append(current_block)
                current_block = None
            continue

        m = POP_UPDATE.search(ln)
        if m:
            current_update_epoch = int(m.group("epoch"))
            continue
        m = CHANGE_LINE.search(ln)
        if m:
            change_events.append({
                "seed": seed,
                "update_epoch": current_update_epoch,
                "member": int(m.group("member")),
                "param": m.group("param").lower(),
                "old": float(m.group("old")),
                "new": float(m.group("new")),
            })
            continue
        m = COPIED_LINE.search(ln)
        if m:
            copy_events.append({
                "seed": seed,
                "update_epoch": current_update_epoch,
                "member": int(m.group("member")),
                "copied_from": int(m.group("src")),
            })
            continue
        m = POST_LINE.search(ln)
        if m:
            post_events.append({
                "seed": seed,
                "update_epoch": current_update_epoch,
                "lr": float(m.group("lr")),
                "weight_decay": float(m.group("wd")),
                "drop_path": float(m.group("dp")),
                "warmup_epochs": int(m.group("warm")),
                "batch_size": int(m.group("bs")),
            })
            continue

        m = SUMMARY_HDR.search(ln)
        if m:
            summaries.append({"seed": seed, "epoch": int(m.group("epoch"))})
            in_summary = True
            continue
        if in_summary and summaries:
            cur = summaries[-1]
            m = SUMMARY_TIME.search(ln)
            if m:
                cur["epoch_time_s"] = float(m.group("time"))
                cur["avg_member_time_s"] = float(m.group("avg"))
            m = POP_MEAN_ACC.search(ln)
            if m:
                cur["pop_mean_acc_pct"] = float(m.group("acc"))
            m = BEST_MEMBER_ACC.search(ln)
            if m:
                cur["best_member_acc_pct"] = float(m.group("acc"))
            m = MEAN_BS.search(ln)
            if m:
                cur["mean_batch_size"] = int(m.group("bs"))
            m = MEAN_LR.search(ln)
            if m:
                cur["mean_lr"] = float(m.group("lr"))
            m = MEAN_WD.search(ln)
            if m:
                cur["mean_weight_decay"] = float(m.group("wd"))

    df_metrics = pd.DataFrame(records)
    df_changes = pd.DataFrame(change_events)
    df_copies = pd.DataFrame(copy_events)
    df_post = pd.DataFrame(post_events)
    df_summary = pd.DataFrame(summaries)

    if total_epochs is None:
        # No "Total epochs:" line: fall back to the largest epoch that has a record.
        epochs_seen = [rec["epoch"] for rec in records if rec["epoch"] is not None]
        total_epochs = max(epochs_seen) if epochs_seen else None

    # (3) Reconstruct hyperparams:
    # Base = initial_hp, then apply changes at update_epoch u for epochs > u.
    members = sorted({rec["member"] for rec in records}) if records else sorted(initial_hp.keys())
    params = ["lr", "weight_decay", "drop_path", "warmup_epochs", "batch_size"]

    change_map = {}
    for event in change_events:
        if event["update_epoch"] is None:
            # Change line seen before any "Population Update" banner: it cannot be
            # placed in time, so it stays in df_changes but is not applied.
            continue
        change_map.setdefault((event["member"], event["param"]), []).append(
            (event["update_epoch"], event["new"])
        )
    for updates in change_map.values():
        updates.sort(key=lambda pair: pair[0])

    hp_rows = []
    # With no known epoch count there is nothing to reconstruct (instead of a TypeError).
    last_epoch = total_epochs if total_epochs is not None else 0
    for mem in members:
        base = initial_hp.get(mem, {})
        for ep in range(1, last_epoch + 1):
            row = {"seed": seed, "epoch": ep, "member": mem}
            for par in params:
                val = base.get(par)
                for u, newv in change_map.get((mem, par), []):
                    if ep > u:
                        val = newv
                    else:
                        break
                if par in ("warmup_epochs", "batch_size") and val is not None:
                    val = int(round(val))
                row[f"pbt_{par}"] = val
            hp_rows.append(row)

    df_hp = pd.DataFrame(hp_rows)
    if len(df_metrics) and len(df_hp):
        df_main = df_metrics.merge(df_hp, on=["seed", "epoch", "member"], how="left")
    elif len(df_metrics):
        # No hyperparameter rows to join (empty frame has no key columns to merge on).
        df_main = df_metrics
    else:
        df_main = df_hp
    if "train_batch_size" in df_main.columns and "pbt_batch_size" in df_main.columns:
        # Where no batch size could be reconstructed, use the one printed in the training banner.
        df_main["pbt_batch_size"] = df_main["pbt_batch_size"].fillna(df_main["train_batch_size"])

    return df_main, df_changes, df_copies, df_post, df_summary

# Parse the configured PDF into the main per-epoch/member table plus the
# change, copy, post-update and epoch-summary tables.
df_main, df_changes, df_copies, df_post, df_summary = parse_pbt_pdf(INPUT_PDF_PATH)
# Show the first rows of the main table as this cell's output.
df_main.head()


Unnamed: 0,seed,epoch,epoch_total,member,train_batch_size,data_build_s,lr_sched_start,lr_sched_end,train_time_s,eval_time_s,loss,train_acc_pct,test_acc_pct,pbt_lr,pbt_weight_decay,pbt_drop_path,pbt_warmup_epochs,pbt_batch_size
0,38042,1,70,0,256,7.242754,1.93e-07,3.8e-05,25.363722,2.166985,2.0628,22.78,30.97,0.000189,0.292952,0.307589,5,256
1,38042,1,70,1,256,1.59843,2.66e-07,5.3e-05,24.825141,3.064337,2.04,23.8,31.2,0.000261,0.43512,0.298885,5,256
2,38042,1,70,2,256,1.541124,4.75e-07,9.4e-05,25.76948,2.314755,2.0106,25.24,31.66,0.000466,0.083222,0.217043,5,256
3,38042,1,70,3,256,1.719979,3.22e-07,6.3e-05,25.652009,3.868828,2.0363,23.82,29.31,0.000316,0.339831,0.206325,5,256
4,38042,1,70,4,256,1.585088,3.64e-07,7.2e-05,26.570324,2.736207,2.0207,24.57,30.69,0.000357,0.433634,0.254329,5,256


## 4) Export CSV + Excel (with leaderboard)

In [5]:
def top_snapshots(df_in: pd.DataFrame, topn=200) -> pd.DataFrame:
    """Build a leaderboard of the best rows by test accuracy.

    Rows without a ``test_acc_pct`` value are discarded. The remainder is
    ordered by ``test_acc_pct`` descending, then ``epoch`` ascending, and a
    1-based ``rank`` column is placed first. ``df_in`` is not modified.

    Args:
        df_in: Frame with at least ``test_acc_pct`` and ``epoch`` columns.
        topn: Number of leading rows to keep.

    Returns:
        A new frame with the ``topn`` best rows and a fresh 0-based index.
    """
    scored = df_in.dropna(subset=["test_acc_pct"]).copy()
    ranked = scored.sort_values(
        by=["test_acc_pct", "epoch"],
        ascending=[False, True],
    )
    ranked = ranked.reset_index(drop=True)
    ranked.insert(0, "rank", ranked.index + 1)
    return ranked.head(topn)

# Flat CSV export of the main per-epoch/member table.
df_main.to_csv(OUTPUT_CSV_PATH, index=False)

# Leaderboard of the 200 best snapshots, written to the workbook below.
lb = top_snapshots(df_main, 200)

# Excel workbook: the main table and the leaderboard are always written;
# the auxiliary sheets are written only when their table has rows.
with pd.ExcelWriter(OUTPUT_XLSX_PATH, engine="openpyxl") as writer:
    df_main.to_excel(writer, sheet_name="epoch_member_metrics", index=False)
    if len(df_summary) > 0:
        df_summary.sort_values(by=["epoch"]).to_excel(
            writer, sheet_name="epoch_summary", index=False
        )
    if len(df_changes) > 0:
        df_changes.sort_values(by=["update_epoch", "member", "param"]).to_excel(
            writer, sheet_name="hyperparam_changes", index=False
        )
    if len(df_copies) > 0:
        df_copies.sort_values(by=["update_epoch", "member"]).to_excel(
            writer, sheet_name="copy_events", index=False
        )
    if len(df_post) > 0:
        df_post.sort_values(by=["update_epoch"]).to_excel(
            writer, sheet_name="post_update_lines", index=False
        )
    lb.to_excel(writer, sheet_name="leaderboard_top200", index=False)

print("Wrote:", OUTPUT_CSV_PATH)
print("Wrote:", OUTPUT_XLSX_PATH)
lb.head(10)


Wrote: ../Structured Outputs/PBT/pbt_wd_ablation_parsed.csv
Wrote: ../Structured Outputs/PBT/pbt_wd_ablation_parsed.xlsx


Unnamed: 0,rank,seed,epoch,epoch_total,member,train_batch_size,data_build_s,lr_sched_start,lr_sched_end,train_time_s,eval_time_s,loss,train_acc_pct,test_acc_pct,pbt_lr,pbt_weight_decay,pbt_drop_path,pbt_warmup_epochs,pbt_batch_size
0,1,38042,69,70,2,256,1.908095,2.08e-06,1e-06,24.33095,3.479003,0.342,87.98,79.46,0.000466,0.083222,0.217043,5,256
1,2,38042,70,70,2,256,1.753601,1.27e-06,1e-06,24.591279,3.234564,0.3463,87.67,79.4,0.000466,0.083222,0.217043,5,256
2,3,38042,66,70,2,256,1.723484,7.74e-06,5e-06,25.304451,2.629037,0.3488,87.62,79.35,0.000466,0.083222,0.217043,5,256
3,4,38042,61,70,1,256,1.618949,2.94e-07,5.8e-05,25.630297,2.290167,0.3906,86.03,79.34,0.000288,0.5,0.344676,5,256
4,5,38042,67,70,2,256,1.762107,5.32e-06,3e-06,24.696815,3.206165,0.3471,87.63,79.31,0.000466,0.083222,0.217043,5,256
5,6,38042,64,70,2,256,1.601513,1.42e-05,1.1e-05,26.066447,2.329392,0.353,87.49,79.29,0.000466,0.083222,0.217043,5,256
6,7,38042,63,70,2,256,1.608932,1.81e-05,1.4e-05,25.692919,2.289454,0.3581,87.23,79.28,0.000466,0.083222,0.217043,5,256
7,8,38042,60,70,2,256,1.60828,3.3e-05,2.8e-05,25.643334,2.217817,0.3736,86.58,79.26,0.000466,0.083222,0.217043,5,256
8,9,38042,61,70,2,256,1.603621,2.76e-05,2.3e-05,25.28257,2.289982,0.3648,86.97,79.26,0.000466,0.083222,0.217043,5,256
9,10,38042,68,70,2,256,1.774074,3.43e-06,2e-06,25.192173,3.032083,0.3427,87.73,79.26,0.000466,0.083222,0.217043,5,256
