# xgbwwdata Checkpoint Validation Notebook (per-source first model)

This notebook validates that a previously saved multisource checkpoint can be reloaded and reused.
It selects the **first model per data source** from the checkpoint, trains **one additional XGBoost model** per selected dataset, and evaluates holdout accuracy/log loss.


## 1) Imports and runtime settings

In [None]:
import os
import glob
import time
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

from xgbwwdata import Filters, load_dataset

# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Seed reused for the train/holdout split and for XGBoost itself.
RNG = 0
# Fraction of each dataset that is held out for evaluation.
TEST_SIZE = 0.2


## 2) Load the checkpoint created by the multisource experiment

By default this looks for files saved by `XGBWW_Multisource_Experiment.ipynb` under Google Drive; if several files match, the one whose path sorts last is used.
To use a specific file instead, set `CHECKPOINT_PATH` to the path of a Feather file.


In [None]:
# Point this at a specific feather file to bypass the automatic lookup below.
CHECKPOINT_PATH = None

GDRIVE_DIR = "/content/drive/MyDrive/xgboost2ww_runs"
PATTERN = os.path.join(GDRIVE_DIR, "*_multisource_results_*.feather")

# No override supplied: use the matching file whose path sorts last.
if CHECKPOINT_PATH is None:
    files = sorted(glob.glob(PATTERN))
    if len(files) == 0:
        raise FileNotFoundError(f"No checkpoint files found under: {PATTERN}")
    *_, CHECKPOINT_PATH = files

print("Using checkpoint:", CHECKPOINT_PATH)

df_ckpt = pd.read_feather(CHECKPOINT_PATH)
# Quick size report followed by a preview of the first ten rows.
print("Checkpoint rows:", df_ckpt.shape[0], "| columns:", df_ckpt.shape[1])
display(df_ckpt.head(10))


## 3) Pick the first model from each source

In [None]:
# Fail early if the checkpoint lacks the columns the rest of the notebook reads.
required_cols = {"dataset_uid", "source", "rounds"}
missing = required_cols - set(df_ckpt.columns)
if missing:
    raise ValueError(f"Checkpoint is missing required columns: {missing}")

# Record each row's original position so a selection can be traced back to the checkpoint.
df_ordered = df_ckpt.reset_index(drop=False).rename(columns={"index": "checkpoint_row"})

# Keep the first *row* per source.
# `GroupBy.first()` is deliberately avoided: it returns the first non-null value of
# each column independently, so when the earliest row of a source has a missing value
# the result mixes fields from different checkpoint rows (e.g. one row's `dataset_uid`
# paired with another row's `good_test_acc`).
df_first_per_source = (
    df_ordered.dropna(subset=["source"])  # null sources are not counted by nunique() either
    .sort_values("checkpoint_row")
    .drop_duplicates(subset="source", keep="first")
    .sort_values("source")
    .reset_index(drop=True)
)

print("Sources in checkpoint:", df_ckpt["source"].nunique())
print("Rows selected (first per source):", len(df_first_per_source))

# Show only the summary columns that actually exist in this checkpoint.
summary_cols = [
    c
    for c in ["source", "dataset", "dataset_uid", "rounds", "good_test_acc"]
    if c in df_first_per_source.columns
]
display(df_first_per_source[summary_cols])


## 4) Reload each selected dataset, train one extra boost model, and evaluate

In [None]:
# Keyword settings handed to `Filters`; the resulting object is passed to every
# `load_dataset` call made while validating the checkpoint.
filter_kwargs = {
    "min_rows": 200,
    "max_rows": 60_000,
    "max_features": 50_000,
    "max_dense_elements": 200_000_000,
    "preprocess": True,
}
filters = Filters(**filter_kwargs)


def train_extra_model_for_row(rec: pd.Series) -> dict:
    """Reload one checkpointed dataset, train a fresh XGBoost model, and score a holdout.

    Parameters
    ----------
    rec : pd.Series
        One selected checkpoint row. ``dataset_uid``, ``source`` and
        ``checkpoint_row`` are required; ``rounds``, ``dataset`` and
        ``good_test_acc`` are read when present.

    Returns
    -------
    dict
        Identification fields (``source``, ``dataset_uid``, ``dataset``,
        ``checkpoint_row``), the round counts (``checkpoint_rounds``,
        ``extra_rounds``), the checkpoint's ``good_test_acc`` (NaN if absent),
        the new model's holdout ``extra_test_acc`` / ``extra_test_logloss``,
        and the dataset shape (``n_rows``, ``n_features``).

    Raises
    ------
    ValueError
        If the loaded target does not contain exactly two distinct classes.
    """
    dataset_uid = rec["dataset_uid"]
    source = rec["source"]

    X, y, meta = load_dataset(dataset_uid, filters=filters)

    # Encode the two classes as 0/1 (in sorted class order). Targets that are already
    # {0, 1} are unchanged; targets such as {1, 2}, {-1, 1} or strings would otherwise
    # be rejected by `binary:logistic` and by `log_loss(..., labels=[0, 1])`.
    classes, y = np.unique(np.asarray(y), return_inverse=True)
    if len(classes) != 2:
        raise ValueError(f"Dataset is not binary classification: {dataset_uid}")

    # Normalise the feature container *before* selecting rows. The type returned by
    # `load_dataset` is not visible here; CSR supports row selection by index array
    # (unlike e.g. COO), and an ndarray avoids DataFrame-style column lookup.
    if hasattr(X, "tocsr"):
        X = X.tocsr().astype(np.float32)
    else:
        X = np.asarray(X, dtype=np.float32)

    # Stratified holdout split on row positions so X and y stay aligned.
    idx = np.arange(len(y))
    tr_idx, te_idx = train_test_split(idx, test_size=TEST_SIZE, random_state=RNG, stratify=y)
    Xtr, Xte = X[tr_idx], X[te_idx]
    ytr, yte = y[tr_idx], y[te_idx]

    # Train 25 rounds beyond the checkpoint's count, but never fewer than 50.
    rounds = int(rec.get("rounds", 100))
    extra_rounds = max(rounds + 25, 50)

    params = dict(
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist",
        seed=RNG,
        learning_rate=0.05,
        max_depth=5,
        min_child_weight=3.0,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
    )

    dtr = xgb.DMatrix(Xtr, label=ytr)
    dte = xgb.DMatrix(Xte, label=yte)
    bst = xgb.train(params=params, dtrain=dtr, num_boost_round=extra_rounds, verbose_eval=False)

    # Predicted probability of the class encoded as 1, thresholded at 0.5 for accuracy.
    p_te = bst.predict(dte).astype(np.float32)
    yhat = (p_te >= 0.5).astype(int)

    return dict(
        source=source,
        dataset_uid=dataset_uid,
        dataset=meta.get("name", rec.get("dataset", dataset_uid)),
        checkpoint_row=int(rec["checkpoint_row"]),
        checkpoint_rounds=rounds,
        extra_rounds=extra_rounds,
        checkpoint_good_test_acc=float(rec.get("good_test_acc", np.nan)),
        extra_test_acc=float(accuracy_score(yte, yhat)),
        extra_test_logloss=float(log_loss(yte, p_te, labels=[0, 1])),
        n_rows=int(X.shape[0]),
        n_features=int(X.shape[1]),
    )


# Train one extra model per selected checkpoint row, reporting progress as we go.
rows = []
start = time.time()
n_selected = len(df_first_per_source)
for n_done, (_, rec) in enumerate(df_first_per_source.iterrows(), start=1):
    out = train_extra_model_for_row(rec)
    rows.append(out)
    print(
        f"[{n_done}/{n_selected}] {out['source']}: {out['dataset']} | "
        f"checkpoint_acc={out['checkpoint_good_test_acc']:.3f} | extra_acc={out['extra_test_acc']:.3f}",
        flush=True,
    )


# Collect the per-source results into one table ordered by source name.
df_validation = pd.DataFrame(rows)
df_validation = df_validation.sort_values("source").reset_index(drop=True)
elapsed_min = (time.time() - start) / 60
print(f"Done in {elapsed_min:.1f} min.")
display(df_validation)


## 5) Sanity checks and optional save

In [None]:
# Every distinct source in the checkpoint must have produced exactly one validation row.
n_sources = df_ckpt["source"].nunique()
assert len(df_validation) == n_sources, "Expected exactly one validation row per source."

print("Per-source checkpoint validation complete.")
mean_extra_acc = float(df_validation["extra_test_acc"].mean())
print("Mean extra test accuracy:", mean_extra_acc)

# Leave as None to skip saving; set a path to write the validation table as CSV.
OUT_CSV = None  # e.g., "/content/drive/MyDrive/xgboost2ww_runs/checkpoint_validation.csv"
if OUT_CSV:
    df_validation.to_csv(OUT_CSV, index=False)
    print("Saved:", OUT_CSV)
