# Convert PBT PDF logs → machine-readable CSV + Excel workbook

This notebook parses **PBT training logs saved as PDFs** and exports:
- a combined CSV (`results/pbt_experiment_pdfs_parsed.csv`)
- an Excel workbook (`results/pbt_experiment_pdfs_parsed.xlsx`) with extra sheets + leaderboards

Key robustness rules (matching your requirements):
1. **PDF text can be noisy**, so parsing uses conservative regex patterns.
2. **Learning-rate schedule** is parsed *only* from lines like:  
   `LR changed during epoch: 6.17e-05 -> 5.46e-05`  
   and stored as `lr_sched_start`, `lr_sched_end`.
3. **Initial hyperparameter configuration** per member is taken from the block:
   ```
   Hyperparameteres for model k at epoch 1
   lr: ...
   weight_decay: ...
   drop_path: ...
   warmup_epochs: ...
   batch_size: ...
   ```
4. **Hyperparameter updates** are applied *only* after update blocks that contain lines like:
   `Member k: lr changed from ... to ...`  
   Updates at epoch `u` are applied starting at epoch `u+1`, and each value is carried forward until that member's next update.


In [1]:
from pathlib import Path

# =============================================================================
# CONFIGURATION
# =============================================================================

# Relative directory where all outputs (CSV + XLSX) are written.
CSV_REL_DIR = "../Structured Outputs/PBT/"

# Input PDFs. The seed of each run is inferred later from the filename
# substring 'seed_<digits>' (e.g. 'seed_38042').
COMMON_PATH = Path("../Raw Outputs/PBT/Full Logs/")
INPUT_PDF_PATHS = [
    COMMON_PATH / "pbt_experiment_output_seed_38042.pdf",
    COMMON_PATH / "pbt_experiment_output_seed_217401.pdf",
    COMMON_PATH / "pbt_experiment_output_seed_45921.pdf",
    COMMON_PATH / "pbt_experiment_output_seed_207796.pdf",
    COMMON_PATH / "pbt_experiment_output_seed_637451.pdf",
]

# Output filenames (written inside CSV_REL_DIR)
OUTPUT_CSV_NAME = "pbt_experiment_pdfs_parsed.csv"
OUTPUT_XLSX_NAME = "pbt_experiment_pdfs_parsed.xlsx"

# Create the output directory up front so the export cell cannot fail on a
# missing folder; exist_ok makes re-running this cell harmless.
OUTPUT_DIR = Path(CSV_REL_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_CSV_PATH = OUTPUT_DIR / OUTPUT_CSV_NAME
OUTPUT_XLSX_PATH = OUTPUT_DIR / OUTPUT_XLSX_NAME

print("Inputs:", INPUT_PDF_PATHS)
print("CSV:", OUTPUT_CSV_PATH)
print("XLSX:", OUTPUT_XLSX_PATH)

Inputs: [PosixPath('../Raw Outputs/PBT/Full Logs/pbt_experiment_output_seed_38042.pdf'), PosixPath('../Raw Outputs/PBT/Full Logs/pbt_experiment_output_seed_217401.pdf'), PosixPath('../Raw Outputs/PBT/Full Logs/pbt_experiment_output_seed_45921.pdf'), PosixPath('../Raw Outputs/PBT/Full Logs/pbt_experiment_output_seed_207796.pdf'), PosixPath('../Raw Outputs/PBT/Full Logs/pbt_experiment_output_seed_637451.pdf')]
CSV: ../Structured Outputs/PBT/pbt_experiment_pdfs_parsed.csv
XLSX: ../Structured Outputs/PBT/pbt_experiment_pdfs_parsed.xlsx


## 1) PDF text extraction

We try **PyMuPDF** (`fitz`) first (fast + reliable). If it isn't available — or it raises any error while reading a file — we fall back to `pdfplumber`.
No OCR is used.


In [2]:
import re
import pandas as pd

def extract_pdf_text(path: Path) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    PyMuPDF (``fitz``) is tried first. If importing it or reading the file
    with it raises any exception, the function falls back to ``pdfplumber``.
    No OCR is performed.

    Parameters
    ----------
    path : Path
        Location of the PDF file.

    Returns
    -------
    str
        Page texts in page order, separated by ``"\\n"``. In the pdfplumber
        path, pages with no extractable text contribute an empty string.

    Raises
    ------
    Exception
        Whatever ``pdfplumber`` raises if the fallback also fails (including
        ``ImportError`` when neither library is installed).
    """
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(str(path))
        try:
            parts = [page.get_text("text") for page in doc]
        finally:
            # Release the document handle even when page extraction fails;
            # otherwise the file stays open while the fallback re-reads it.
            doc.close()
        return "\n".join(parts)
    except Exception:
        # Deliberate best-effort: any PyMuPDF problem routes to pdfplumber.
        import pdfplumber

        parts = []
        with pdfplumber.open(str(path)) as pdf:
            for page in pdf.pages:
                parts.append(page.extract_text() or "")
        return "\n".join(parts)


## 2) Conservative regex patterns

We only trust:
- per-epoch LR schedule from `LR changed during epoch: ... -> ...`
- initial hyperparams from the `Hyperparameteres for model ... at epoch 1` block
- hyperparam changes from the `Member k: ... changed from ... to ...` lines


In [3]:
# Seed id embedded in an input filename, e.g. "seed_38042" or "seed=38042" (case-insensitive).
SEED_RE = re.compile(r"seed[_=](\d+)", re.IGNORECASE)

# Epoch header of the form "Epoch <n>/<total>" (case-sensitive, capital "E").
EPOCH_HDR = re.compile(r"\bEpoch\s+(?P<epoch>\d+)\s*/\s*(?P<epoch_total>\d+)\b")
# Start of one member's training block: "--- Training Member <k> (Batch size: <n>) ---".
TRAIN_MEMBER = re.compile(r"---\s*Training\s+Member\s+(?P<member>\d+)\s*\(Batch\s+size:\s*(?P<bs>\d+)\)\s*---")

# Requirement (2): LR schedule is ONLY parsed from this pattern.
# NOTE: the numeric class [-+0-9.eE]+ used throughout is permissive: it accepts any run
# of those characters, so a garbled token can match here and still fail float() later.
LR_CHANGED = re.compile(r"LR\s+changed\s+during\s+epoch:\s*(?P<start>[-+0-9.eE]+)\s*->\s*(?P<end>[-+0-9.eE]+)")

# Per-member metrics printed inside a training block; accuracies must be followed by "%".
LOSS = re.compile(r"Loss:\s*(?P<loss>[-+0-9.eE]+)")
TRAIN_ACC = re.compile(r"Train\s+Accuracy:\s*(?P<acc>[-+0-9.]+)\s*%")
TEST_ACC = re.compile(r"Test\s+Accuracy:\s*(?P<acc>[-+0-9.]+)\s*%")

# Timing lines (values in seconds) printed inside a training block.
BUILT = re.compile(r"built\s+data\s+in\s+(?P<t>[-+0-9.eE]+)\s+seconds")
TRAIN_TIME = re.compile(r"total\s+runtime\s+to\s+train\s+this\s+model\s+was\s+(?P<t>[-+0-9.eE]+)\s+seconds")
EVAL_TIME = re.compile(r"evaluation\s+in\s+(?P<t>[-+0-9.eE]+)\s+seconds")

# Run-level metadata: "Total epochs: <n>" and "Exploit interval: <n> epochs".
TOTAL_EPOCHS = re.compile(r"Total\s+epochs:\s*(?P<t>\d+)")
EXPLOIT_INTERVAL = re.compile(r"Exploit\s+interval:\s*(?P<t>\d+)\s+epochs")

# Requirement (3): initial hyperparams come from these key:value lines
# The header spelling "Hyperparameteres" is intentional: it is the text this pattern matches in the logs.
HYPER_HDR = re.compile(r"Hyperparameteres\s+for\s+model\s+(?P<member>\d+)\s+at\s+epoch\s+(?P<epoch>\d+)", re.IGNORECASE)
# Anchored at both ends, so the whole (stripped) line must be exactly "<key>: <number>".
HP_LINE = re.compile(r"^(?P<k>lr|weight_decay|drop_path|warmup_epochs|batch_size)\s*:\s*(?P<v>[-+0-9.eE]+)\s*$", re.IGNORECASE)

# Requirement (4): hyperparams change only after these update lines
POP_UPDATE = re.compile(r"---\s*Population\s+Update\s+\(Epoch\s+(?P<epoch>\d+)\)\s*---")
# NOTE: warmup_epochs is not among the parameters matched here, so a warmup change
# (if the logs ever print one) would not be captured — TODO confirm that is intended.
CHANGE_LINE = re.compile(r"Member\s+(?P<member>\d+):\s*(?P<param>lr|weight_decay|drop_path|batch_size)\s+changed\s+from\s+(?P<old>[-+0-9.eE]+)\s+to\s+(?P<new>[-+0-9.eE]+)", re.IGNORECASE)
# "Member <k> copied from <src>" event lines.
COPIED_LINE = re.compile(r"Member\s+(?P<member>\d+)\s+copied\s+from\s+(?P<src>\d+)", re.IGNORECASE)

# Optional: consolidated post-update line (sanity checking)
# The pattern has no member group, so a match cannot be attributed to a member from this line alone.
POST_LINE = re.compile(
    r"LR=(?P<lr>[-+0-9.eE]+),\s*WD=(?P<wd>[-+0-9.eE]+),\s*DropPath=(?P<dp>[-+0-9.eE]+),\s*Warmup=(?P<warm>\d+)\s*epochs,\s*Batch=(?P<bs>\d+)",
    re.IGNORECASE,
)


## 3) Parse one PDF into tables

Returns:
- `df_main`: metrics per `(epoch, member)` + reconstructed hyperparams (`pbt_*`)
- `df_changes`, `df_copies`, `df_post` (events)
- `df_summary` (population-level summaries, optional)


In [4]:
def parse_seed_from_name(p: Path) -> int:
    """Extract the integer seed from a log filename.

    Only the final path component (``p.name``) is searched with ``SEED_RE``;
    the first match wins.

    Raises
    ------
    ValueError
        If the filename contains no seed marker.
    """
    seed_match = SEED_RE.search(p.name)
    if seed_match:
        return int(seed_match[1])
    raise ValueError(f"Could not infer seed from filename: {p.name}")

def _parse_initial_hyperparams(lines: list) -> dict:
    """Return ``{member: {param: value}}`` from the first hyperparameter block of each member.

    A block is the run of ``key: value`` lines following a ``HYPER_HDR`` line.
    At most 11 lines after the header are inspected. Scanning stops early when
    all five parameters were found or when another hyperparameter header is
    reached, so an incomplete block (noisy PDF text) can never absorb the next
    member's values. Within a block the first value seen for a key is kept.
    """
    required = {"lr", "weight_decay", "drop_path", "warmup_epochs", "batch_size"}
    initial_hp = {}
    for idx, line in enumerate(lines):
        header = HYPER_HDR.search(line)
        if not header:
            continue
        member = int(header.group("member"))
        if member in initial_hp:
            # Only the first non-empty block per member defines the initial config.
            continue
        hp = {}
        for follow in lines[idx + 1: idx + 12]:
            if HYPER_HDR.search(follow):
                break  # next member's block starts here
            kv = HP_LINE.match(follow)
            if kv:
                key = kv.group("k").lower()
                if key not in hp:
                    value = float(kv.group("v"))
                    if key in ("warmup_epochs", "batch_size"):
                        value = int(round(value))
                    hp[key] = value
            if required.issubset(hp):
                break
        if hp:
            initial_hp[member] = hp
    return initial_hp


def _reconstruct_hyperparams(seed: int, members: list, n_epochs: int, initial_hp: dict, change_events: list) -> list:
    """Build one row per (member, epoch) holding the piecewise-constant ``pbt_*`` hyperparameters.

    Each parameter starts at its initial value (``None`` if unknown) and takes a
    changed value from the epoch *after* the update epoch onward. Change events
    whose update epoch is unknown (``None``) cannot be placed on the timeline
    and are ignored here; they still appear in the ``changes`` event table.
    """
    params = ["lr", "weight_decay", "drop_path", "warmup_epochs", "batch_size"]

    change_map = {}
    for ev in change_events:
        if ev["update_epoch"] is None:
            continue
        change_map.setdefault((ev["member"], ev["param"]), []).append((ev["update_epoch"], ev["new"]))
    for updates in change_map.values():
        updates.sort(key=lambda pair: pair[0])  # stable: log order kept within an epoch

    hp_rows = []
    for mem in members:
        base = initial_hp.get(mem, {})
        for ep in range(1, n_epochs + 1):
            row = {"seed": seed, "epoch": ep, "member": mem}
            for par in params:
                val = base.get(par)
                for u, newv in change_map.get((mem, par), []):
                    if ep > u:
                        val = newv
                    else:
                        break
                if par in ("warmup_epochs", "batch_size") and val is not None:
                    val = int(round(val))
                row[f"pbt_{par}"] = val
            hp_rows.append(row)
    return hp_rows


def parse_pdf_log(path: Path) -> dict:
    """Parse one PBT training-log PDF into metric and event tables.

    Parameters
    ----------
    path : Path
        PDF log file; its name must contain the seed (see ``parse_seed_from_name``).

    Returns
    -------
    dict
        ``meta``    – dict with ``seed``, ``total_epochs``, ``exploit_interval``, ``members``;
        ``main``    – DataFrame, one row per completed training block (epoch, member)
                      with metrics plus reconstructed ``pbt_*`` hyperparameters;
        ``changes`` – DataFrame of "changed from ... to ..." events;
        ``copies``  – DataFrame of "copied from" events;
        ``post``    – DataFrame of consolidated post-update lines;
        ``summary`` – DataFrame of per-epoch population summaries.
        Tables with no matching lines are empty DataFrames.

    Raises
    ------
    ValueError
        If the seed cannot be inferred from the filename, or a matched numeric
        token cannot be converted by ``float``.
    """
    seed = parse_seed_from_name(path)
    text = extract_pdf_text(path)
    lines = [ln.strip() for ln in text.splitlines() if ln.strip() != ""]

    # Read global metadata if present
    total_match = TOTAL_EPOCHS.search(text)
    total_epochs = int(total_match.group("t")) if total_match else None
    interval_match = EXPLOIT_INTERVAL.search(text)
    exploit_interval = int(interval_match.group("t")) if interval_match else None

    # (A) initial hyperparams: first hp print per member (usually epoch 1)
    initial_hp = _parse_initial_hyperparams(lines)

    # (B) event tables + metrics
    change_events, copy_events, post_events, records = [], [], [], []
    current_update_epoch = None
    current_epoch = None
    current_epoch_total = None
    current_block = None
    in_summary = False
    summaries = []

    # summary regex (optional)
    SUMMARY_HDR = re.compile(r"Epoch\s+(?P<epoch>\d+)\s+Summary:", re.IGNORECASE)
    SUMMARY_TIME = re.compile(r"Time:\s*(?P<time>[-+0-9.]+)s\s*\(Avg\s+member:\s*(?P<avg>[-+0-9.]+)s\)")
    POP_MEAN_ACC = re.compile(r"Population\s+Mean\s+Accuracy:\s*(?P<acc>[-+0-9.]+)\s*%")
    BEST_MEMBER_ACC = re.compile(r"Best\s+Member\s+Accuracy:\s*(?P<acc>[-+0-9.]+)\s*%")
    MEAN_BS = re.compile(r"Mean\s+Batch\s+Size:\s*(?P<bs>\d+)")
    MEAN_LR = re.compile(r"Mean\s+Learning\s+Rate:\s*(?P<lr>[-+0-9.eE]+)")
    MEAN_WD = re.compile(r"Mean\s+Weight\s+Decay:\s*(?P<wd>[-+0-9.eE]+)")

    for ln in lines:
        # An "Epoch n/total" header updates the running epoch and ends any summary section.
        m = EPOCH_HDR.search(ln)
        if m and "Summary" not in ln:
            current_epoch = int(m.group("epoch"))
            current_epoch_total = int(m.group("epoch_total"))
            in_summary = False

        # A member header opens a new block. A previous block that never saw its
        # "Test Accuracy" line is replaced here and therefore not recorded.
        m = TRAIN_MEMBER.search(ln)
        if m:
            current_block = {
                "seed": seed,
                "epoch": current_epoch,
                "epoch_total": current_epoch_total,
                "member": int(m.group("member")),
                "train_batch_size": int(m.group("bs")),
            }
            continue

        if current_block is not None:
            # Inside a training block every line is tested against all metric patterns.
            m = BUILT.search(ln)
            if m:
                current_block["data_build_s"] = float(m.group("t"))
            m = LR_CHANGED.search(ln)
            if m:
                # Requirement (2)
                current_block["lr_sched_start"] = float(m.group("start"))
                current_block["lr_sched_end"] = float(m.group("end"))
            m = TRAIN_TIME.search(ln)
            if m:
                current_block["train_time_s"] = float(m.group("t"))
            m = EVAL_TIME.search(ln)
            if m:
                current_block["eval_time_s"] = float(m.group("t"))
            m = LOSS.search(ln)
            if m:
                current_block["loss"] = float(m.group("loss"))
            m = TRAIN_ACC.search(ln)
            if m:
                current_block["train_acc_pct"] = float(m.group("acc"))
            m = TEST_ACC.search(ln)
            if m:
                # "Test Accuracy" is the terminator of a block: store it and close.
                current_block["test_acc_pct"] = float(m.group("acc"))
                records.append(current_block)
                current_block = None
            continue

        m = POP_UPDATE.search(ln)
        if m:
            current_update_epoch = int(m.group("epoch"))
            continue
        m = CHANGE_LINE.search(ln)
        if m:
            change_events.append({
                "seed": seed,
                "update_epoch": current_update_epoch,
                "member": int(m.group("member")),
                "param": m.group("param").lower(),
                "old": float(m.group("old")),
                "new": float(m.group("new")),
            })
            continue
        m = COPIED_LINE.search(ln)
        if m:
            copy_events.append({
                "seed": seed,
                "update_epoch": current_update_epoch,
                "member": int(m.group("member")),
                "copied_from": int(m.group("src")),
            })
            continue
        m = POST_LINE.search(ln)
        if m:
            post_events.append({
                "seed": seed,
                "update_epoch": current_update_epoch,
                "lr": float(m.group("lr")),
                "weight_decay": float(m.group("wd")),
                "drop_path": float(m.group("dp")),
                "warmup_epochs": int(m.group("warm")),
                "batch_size": int(m.group("bs")),
            })
            continue

        m = SUMMARY_HDR.search(ln)
        if m:
            summaries.append({"seed": seed, "epoch": int(m.group("epoch"))})
            in_summary = True
            continue
        if in_summary and summaries:
            cur = summaries[-1]
            m = SUMMARY_TIME.search(ln)
            if m:
                cur["epoch_time_s"] = float(m.group("time"))
                cur["avg_member_time_s"] = float(m.group("avg"))
            m = POP_MEAN_ACC.search(ln)
            if m:
                cur["pop_mean_acc_pct"] = float(m.group("acc"))
            m = BEST_MEMBER_ACC.search(ln)
            if m:
                cur["best_member_acc_pct"] = float(m.group("acc"))
            m = MEAN_BS.search(ln)
            if m:
                cur["mean_batch_size"] = int(m.group("bs"))
            m = MEAN_LR.search(ln)
            if m:
                cur["mean_lr"] = float(m.group("lr"))
            m = MEAN_WD.search(ln)
            if m:
                cur["mean_weight_decay"] = float(m.group("wd"))

    df_metrics = pd.DataFrame(records)
    df_changes = pd.DataFrame(change_events)
    df_copies = pd.DataFrame(copy_events)
    df_post = pd.DataFrame(post_events)
    df_summary = pd.DataFrame(summaries)

    # Fall back to the largest epoch seen in the metrics when the log has no
    # "Total epochs:" line; stays None if no record carries a usable epoch.
    if total_epochs is None and len(df_metrics):
        max_epoch = pd.to_numeric(df_metrics["epoch"], errors="coerce").max()
        if pd.notna(max_epoch):
            total_epochs = int(max_epoch)

    # (C) reconstruct piecewise-constant hyperparams per epoch-member using initial_hp + change events
    members = sorted(df_metrics["member"].unique().tolist()) if len(df_metrics) else sorted(initial_hp.keys())
    n_epochs = total_epochs if total_epochs is not None else 0
    hp_rows = _reconstruct_hyperparams(seed, members, n_epochs, initial_hp, change_events)

    pbt_cols = ["pbt_lr", "pbt_weight_decay", "pbt_drop_path", "pbt_warmup_epochs", "pbt_batch_size"]
    if hp_rows and len(df_metrics):
        df_hp = pd.DataFrame(hp_rows)
        df_main = df_metrics.merge(df_hp, on=["seed", "epoch", "member"], how="left")
    else:
        # Nothing to join (empty log, or no epoch information): keep the metric
        # rows as they are and add all-missing pbt_* columns so the schema is stable.
        base_cols = list(df_metrics.columns) or ["seed", "epoch", "member", "train_batch_size"]
        df_main = df_metrics.reindex(columns=[*base_cols, *pbt_cols])

    # batch size: if missing, fill from training header
    df_main["pbt_batch_size"] = df_main["pbt_batch_size"].fillna(df_main["train_batch_size"])

    return {
        "meta": {"seed": seed, "total_epochs": total_epochs, "exploit_interval": exploit_interval, "members": members},
        "main": df_main,
        "changes": df_changes,
        "copies": df_copies,
        "post": df_post,
        "summary": df_summary,
    }


## 4) Run on all PDFs and export CSV + Excel workbook

Workbook sheets:
- `epoch_member_metrics`
- `epoch_summary`
- `hyperparam_changes`
- `copy_events`
- `post_update_lines`
- `leaderboard_top200` (combined)
- `top200_<seed>` (per-seed)


In [5]:
def _concat_nonempty(frames) -> pd.DataFrame:
    """Concatenate the non-empty frames; return an empty DataFrame when there are none.

    ``pd.concat`` raises ``ValueError`` on an empty list, which would otherwise
    happen whenever no log contains a given kind of event.
    """
    kept = [frame for frame in frames if len(frame)]
    return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


# Parse every input log, ordered by seed so the combined tables are deterministic.
parsed = [parse_pdf_log(p) for p in INPUT_PDF_PATHS]
parsed = sorted(parsed, key=lambda d: d["meta"]["seed"])

df_all = pd.concat([d["main"] for d in parsed], ignore_index=True)
df_changes_all = _concat_nonempty(d["changes"] for d in parsed)
df_copies_all = _concat_nonempty(d["copies"] for d in parsed)
df_post_all = _concat_nonempty(d["post"] for d in parsed)
df_summary_all = _concat_nonempty(d["summary"] for d in parsed)

df_all.to_csv(OUTPUT_CSV_PATH, index=False)

def top_snapshots(df_in: pd.DataFrame, topn=200) -> pd.DataFrame:
    """Return the ``topn`` best rows by test accuracy, with a 1-based ``rank`` column.

    Rows without a ``test_acc_pct`` value are excluded. Ordering is by
    ``test_acc_pct`` descending, then ``epoch`` ascending. The result has a
    fresh 0-based index and ``rank`` as its first column; ``df_in`` is not modified.
    """
    has_score = df_in["test_acc_pct"].notna()
    ranked = (
        df_in.loc[has_score]
        .sort_values(by=["test_acc_pct", "epoch"], ascending=[False, True])
        .reset_index(drop=True)
    )
    ranked.insert(0, "rank", range(1, len(ranked) + 1))
    return ranked.iloc[:topn]

# Leaderboards: one across all seeds, plus one per seed (seeds in ascending order).
lb_combined = top_snapshots(df_all, 200)
lbs_by_seed = {}
for seed in sorted(df_all["seed"].unique().tolist()):
    lbs_by_seed[seed] = top_snapshots(df_all[df_all["seed"] == seed], 200)

# Event/summary sheets are written only when they contain rows.
# Each entry: (sheet name, table, sort keys).
optional_sheets = [
    ("epoch_summary", df_summary_all, ["seed", "epoch"]),
    ("hyperparam_changes", df_changes_all, ["seed", "update_epoch", "member", "param"]),
    ("copy_events", df_copies_all, ["seed", "update_epoch", "member"]),
    ("post_update_lines", df_post_all, ["seed", "update_epoch"]),
]

with pd.ExcelWriter(OUTPUT_XLSX_PATH, engine="openpyxl") as writer:
    df_all.to_excel(writer, sheet_name="epoch_member_metrics", index=False)
    for sheet_name, frame, sort_keys in optional_sheets:
        if len(frame) == 0:
            continue
        frame.sort_values(sort_keys).to_excel(writer, sheet_name=sheet_name, index=False)
    lb_combined.to_excel(writer, sheet_name="leaderboard_top200", index=False)
    for seed, lb in lbs_by_seed.items():
        # Sheet names are truncated to 31 characters.
        lb.to_excel(writer, sheet_name=f"top200_{seed}"[:31], index=False)

print("Wrote CSV:", OUTPUT_CSV_PATH.resolve())
print("Wrote XLSX:", OUTPUT_XLSX_PATH.resolve())
lb_combined.head(10)


Wrote CSV: /Users/etaashpatel/Documents/Final Project/Structured Outputs/PBT/pbt_experiment_pdfs_parsed.csv
Wrote XLSX: /Users/etaashpatel/Documents/Final Project/Structured Outputs/PBT/pbt_experiment_pdfs_parsed.xlsx


Unnamed: 0,rank,seed,epoch,epoch_total,member,train_batch_size,data_build_s,lr_sched_start,lr_sched_end,train_time_s,eval_time_s,loss,train_acc_pct,test_acc_pct,pbt_lr,pbt_weight_decay,pbt_drop_path,pbt_warmup_epochs,pbt_batch_size
0,1,217401,66,70,1,256,1.745984,1.3e-05,9e-06,27.310036,3.417446,0.2195,92.17,79.7,0.000812,0.036872,0.181138,5,256
1,2,217401,69,70,1,256,1.632708,3e-06,1e-06,26.92768,2.405421,0.2101,92.61,79.61,0.000812,0.036872,0.181138,5,256
2,3,217401,68,70,2,256,1.676076,0.000185,0.000174,27.662909,2.487668,0.2381,91.53,79.6,0.00047,0.096679,0.180322,5,256
3,4,217401,62,70,1,256,1.559569,3.9e-05,3.1e-05,27.022432,2.497226,0.2266,91.97,79.58,0.000812,0.036872,0.181138,5,256
4,5,217401,70,70,1,256,1.582054,1e-06,1e-06,26.605139,2.91702,0.2091,92.65,79.57,0.000812,0.036872,0.181138,5,256
5,6,217401,65,70,1,256,1.650762,1.8e-05,1.3e-05,27.17183,2.495758,0.2168,92.22,79.53,0.000812,0.036872,0.181138,5,256
6,7,207796,59,70,4,256,1.611083,5.5e-05,4.6e-05,26.847638,2.552814,0.2145,92.51,79.52,0.000656,0.033785,0.103889,5,256
7,8,217401,67,70,1,256,1.637203,9e-06,5e-06,27.009433,2.423447,0.2096,92.49,79.52,0.000812,0.036872,0.181138,5,256
8,9,217401,68,70,1,256,1.615971,5e-06,3e-06,26.931492,3.588561,0.2144,92.43,79.48,0.000812,0.036872,0.181138,5,256
9,10,207796,57,70,4,256,1.58912,7.3e-05,6.3e-05,27.086997,2.614362,0.2277,92.02,79.47,0.000656,0.033785,0.103889,5,256
