[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CalculatedContent/xgboost2ww/blob/main/notebooks/XGBWWDataMultiSourceXGBoost2WWHyperparamSweep.ipynb)

# Multi-source xgbwwdata + xgboost2ww Hyperparameter Sweep Experiments

This Colab notebook is a hyperparameter-sweep variant of `XGBWWDataMultiSourceXGBoost2WW.ipynb`.

Key goals:
- Randomly select **10 binary-classification datasets** from multi-source `xgbwwdata` candidates.
- For each dataset, evaluate **10+ distinct hyperparameter settings**.
- For each hyperparameter setting, train **5 independent repeats** for error bars.
- Use a strict hold-out test split to measure train/test accuracy (no tuning on test set).
- Convert each trained model with `xgboost2ww` and run WeightWatcher `analyze(randomize=True, ERG=True, ERG_gap=True, min_evals=10, plot=False)`.
- Measure `alpha`, `ERG_gap`, and `num_traps`, and plot each metric vs train/test accuracy with error bars:
  - **Per model:** 3 plots × 10 models = 30 plots.
  - **Combined:** 3 aggregate plots across all models.


In [None]:
#@title Experiment configuration
# Matrix variant passed to xgboost2ww `convert(..., W=MATRIX)` in the training cell;
# it also prefixes the names of the saved result files.
MATRIX = "W8"  # @param ["W1", "W2", "W7", "W8"]

# Path to xgbwwdata checkpoint/data-file list CSV (set this to your local path in Colab)
CATALOG_CSV = '/content/repo_xgbwwdata/checkpoint/data-file-list.csv'
# Maximum number of catalog rows sampled from each `source` group.
SAMPLES_PER_SOURCE = 200

# Sweep size: datasets x hyperparameter settings x repeated trainings per setting.
N_MODELS = 10
N_HPARAM_SETTINGS = 10
N_REPEATS = 5

# Base seed; catalog sampling, splitting and per-run training seeds are derived from it.
RNG = 7
# Fraction of each dataset held out as the test split.
TEST_SIZE = 0.20

# NOTE(review): these three guards are not referenced by any cell visible in this
# notebook — confirm whether they are consumed elsewhere or can be removed.
MAX_ROWS = 60000
MAX_FEATURES_GUARD = 50_000
MAX_DENSE_ELEMENTS = int(2e8)

# Upper bound on boosting rounds and the early-stopping patience passed to `xgb.train`.
MAX_BOOST_ROUNDS = 1200
EARLY_STOPPING_ROUNDS = 80

# Inclusive range of `n_classes` accepted when filtering catalog records.
MIN_CLASSES = 2
MAX_CLASSES = 2  # binary classification only for accuracy comparability


In [None]:
#@title Mount Google Drive and create output directory
from google.colab import drive
import os
from datetime import datetime

# Mount Google Drive under /content/drive so results can be written to MyDrive.
drive.mount('/content/drive', force_remount=False)
# All result files produced by this notebook are written below this directory.
GDRIVE_DIR = '/content/drive/MyDrive/xgboost2ww_runs'
# exist_ok=True keeps re-running this cell from failing once the directory exists.
os.makedirs(GDRIVE_DIR, exist_ok=True)
print('Saving results under:', GDRIVE_DIR)


In [None]:
#@title Install dependencies and xgbwwdata
# Make sure git is available for the clone below.
!apt-get -qq update && apt-get -qq install -y git

# Upgrade the packaging tools, then install the analysis stack (versions are not pinned).
%pip install -q -U pip setuptools wheel
%pip install -q weightwatcher xgboost2ww xgboost scikit-learn scipy pandas pyarrow matplotlib

# Remove any previous checkout so the clone always starts from a clean directory.
!rm -rf /content/repo_xgbwwdata
!git clone https://github.com/CalculatedContent/xgbwwdata.git /content/repo_xgbwwdata
# Run the repository's own Colab install script against the fresh checkout
# (presumably installs xgbwwdata into this runtime — see the script to confirm).
%run /content/repo_xgbwwdata/scripts/colab_install.py --repo /content/repo_xgbwwdata

# Import the three packages and print where each was loaded from as a sanity check.
import xgbwwdata
import xgboost2ww
import weightwatcher
print('xgbwwdata:', getattr(xgbwwdata, '__file__', None))
print('xgboost2ww:', getattr(xgboost2ww, '__file__', None))
print('weightwatcher:', getattr(weightwatcher, '__file__', None))


In [None]:
#@title Imports and shared helpers
import gc
import time
import warnings
from pathlib import Path

# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below
# (e.g. about training parameters) — consider narrowing the filter.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import weightwatcher as ww

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from xgbwwdata import load_dataset
from xgboost2ww import convert

# Notebook-level NumPy Generator seeded with RNG.
# NOTE(review): none of the cells visible here draw from it (they pass
# random_state / per-run seeds directly) — confirm whether it is still needed.
rng = np.random.default_rng(RNG)


In [None]:
#@title Optional: GPU detection for XGBoost
def xgb_gpu_available() -> bool:
    """Report whether a tiny GPU-configured XGBoost training run succeeds.

    A 256x8 random float32 matrix with a synthetic binary label is trained for
    five rounds using ``tree_method='gpu_hist'`` and
    ``predictor='gpu_predictor'``. Any exception raised while building the data
    or training is treated as "no usable GPU".

    Returns:
        True when the probe training completes, False when anything raises.

    NOTE(review): newer XGBoost releases document ``device='cuda'`` as the
    replacement for these two parameters — confirm against the installed
    version, and keep the training cell's GPU parameters in sync with whatever
    is probed here.
    """
    try:
        features = np.random.randn(256, 8).astype(np.float32)
        labels = (features[:, 0] > 0).astype(np.int32)
        probe = xgb.DMatrix(features, label=labels)
        gpu_params = dict(
            objective='binary:logistic',
            eval_metric='logloss',
            tree_method='gpu_hist',
            predictor='gpu_predictor',
            max_depth=2,
            learning_rate=0.2,
            seed=RNG,
        )
        xgb.train(params=gpu_params, dtrain=probe, num_boost_round=5, verbose_eval=False)
    except Exception:
        # Deliberate best-effort probe: every failure means "fall back to CPU".
        return False
    else:
        return True

# Probe once; the training cell reads USE_GPU to decide whether to switch its
# XGBoost parameters to the GPU tree method.
USE_GPU = xgb_gpu_available()
print('XGBoost GPU available:', USE_GPU)


In [None]:
#@title Load checkpoint/data-file list and sample classification datasets
# Fail early with a clear message when the catalog file is not where expected.
if not Path(CATALOG_CSV).exists():
    raise FileNotFoundError(f'Catalog CSV not found: {CATALOG_CSV}')

df_catalog = pd.read_csv(CATALOG_CSV)
print('Catalog shape:', df_catalog.shape)

# Columns the rest of the notebook reads from every catalog record.
required_cols = {"dataset_uid", "source", "task_type"}
missing = required_cols.difference(df_catalog.columns)
if missing:
    raise ValueError(f"Catalog is missing required columns: {missing}")

# Accuracy is for classification; keep classification-like tasks
# (case-insensitive substring match; missing task types count as non-matching).
is_classification = df_catalog["task_type"].astype(str).str.contains(
    "classification", case=False, na=False
)
df_cls = df_catalog.loc[is_classification].copy()
if df_cls.empty:
    raise ValueError('No classification datasets found in catalog.')

# Sample up to SAMPLES_PER_SOURCE per source
def sample_per_source(group):
    """Return a seeded random sample of at most SAMPLES_PER_SOURCE rows of *group*.

    Groups smaller than the cap are returned in full (in shuffled order).
    """
    n = min(SAMPLES_PER_SOURCE, len(group))
    return group.sample(n=n, random_state=RNG)

# Iterate the groups explicitly instead of `groupby(...).apply(...)`: recent
# pandas versions deprecate (and later stop) passing the grouping column to the
# applied function, which would silently drop `source` from the result even
# though the following cells read it. Iterating a groupby always yields the full
# sub-frame, so `source` is kept on every pandas version and the same rows are
# sampled as before (groups are visited in sorted key order).
sampled_groups = [sample_per_source(group) for _, group in df_cls.groupby('source')]
if not sampled_groups:
    # Only reachable when every classification row has a null `source`.
    raise ValueError('No classification datasets with a non-null source found in catalog.')

df_pick = pd.concat(sampled_groups, ignore_index=True)

print('Selected datasets:', len(df_pick))
# `name` is optional in the catalog, so only preview the columns that exist.
preview_cols = [c for c in ['source', 'dataset_uid', 'name', 'task_type'] if c in df_pick.columns]
display(df_pick[preview_cols].sort_values(['source', 'dataset_uid']))


In [None]:
#@title Select 10 random binary datasets and build hyperparameter grid
def is_binary_record(rec: pd.Series) -> bool:
    """Decide whether a catalog record may be used as a binary dataset.

    Records whose ``n_classes`` entry is absent or missing (NaN/None) are
    accepted here; the training cell re-checks the label count after loading.
    Otherwise the class count must lie within [MIN_CLASSES, MAX_CLASSES].
    """
    n_classes = rec.get('n_classes', None)
    if pd.isna(n_classes):
        return True
    return MIN_CLASSES <= int(n_classes) <= MAX_CLASSES

# Keep only records that pass the binary-class filter.
df_pool = df_pick[df_pick.apply(is_binary_record, axis=1)].copy()
if len(df_pool) < N_MODELS:
    raise RuntimeError(f'Not enough binary datasets ({len(df_pool)}) for N_MODELS={N_MODELS}.')

# Seeded draw of the datasets to sweep; the index is reset so that the row
# position can be used as the per-dataset offset for derived seeds.
df_models = df_pool.sample(n=N_MODELS, random_state=RNG).reset_index(drop=True)

# `name` is not a required catalog column, so preview only the columns that are
# actually present instead of raising KeyError when it is missing.
model_preview_cols = [c for c in ['source', 'dataset_uid', 'name'] if c in df_models.columns]
display(df_models[model_preview_cols].head(N_MODELS))

def sample_hparams(base_seed: int, n_settings: int):
    """Draw ``n_settings`` random XGBoost hyperparameter settings.

    A private Generator seeded with ``base_seed`` is used, so the same seed
    always yields the same list.

    Args:
        base_seed: Seed for the local NumPy Generator.
        n_settings: Number of settings to draw.

    Returns:
        A list of dicts, one per setting, each holding ``hp_id`` (0-based
        position) plus the sampled hyperparameter values as plain Python
        floats/ints.
    """
    gen = np.random.default_rng(base_seed)

    def draw_setting(hp_id):
        # The draws below must stay in this exact order: each one advances the
        # generator, so reordering would change every sampled value.
        learning_rate = float(gen.uniform(0.03, 0.20))
        max_depth = int(gen.integers(3, 8))
        min_child_weight = float(gen.uniform(1.0, 8.0))
        subsample = float(gen.uniform(0.70, 0.95))
        colsample_bytree = float(gen.uniform(0.70, 0.95))
        gamma = float(gen.uniform(0.0, 0.6))
        reg_alpha = float(gen.uniform(0.0, 0.8))
        reg_lambda = float(gen.uniform(0.8, 4.0))
        max_delta_step = int(gen.integers(0, 3))
        return {
            'hp_id': hp_id,
            'learning_rate': learning_rate,
            'max_depth': max_depth,
            'min_child_weight': min_child_weight,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'gamma': gamma,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda,
            'max_delta_step': max_delta_step,
        }

    return [draw_setting(hp_id) for hp_id in range(n_settings)]

# Map each selected dataset_uid to its own list of hyperparameter settings.
# The seed is offset by the row index (i * 97) so different datasets get
# different settings while staying reproducible for a fixed RNG.
# NOTE(review): keyed by dataset_uid only — if the same uid appeared under two
# sources, the later row would overwrite the earlier one.
hparam_bank = {
    rec['dataset_uid']: sample_hparams(base_seed=RNG + i * 97, n_settings=N_HPARAM_SETTINGS)
    for i, rec in df_models.iterrows()
}
print('Prepared hyperparameter settings per dataset:', N_HPARAM_SETTINGS)


In [None]:
#@title Train repeated models, convert with xgboost2ww, run WeightWatcher
def ww_metrics_from_layer(layer):
    """Run WeightWatcher on *layer* and return ``(alpha, num_traps, ERG_gap)``.

    Only the first row of the details table returned by ``analyze`` is read.
    Any metric missing from that row comes back as NaN, and an empty details
    table yields three NaNs.

    Args:
        layer: Object handed to ``ww.WeightWatcher(model=...)``; in this
            notebook it is the output of xgboost2ww ``convert``.

    Returns:
        Tuple of three floats in the order alpha, num_traps, ERG_gap.
    """
    watcher = ww.WeightWatcher(model=layer)
    details = watcher.analyze(
        randomize=True,
        ERG=True,
        ERG_gap=True,
        min_evals=10,
        plot=False,
    )
    if len(details) == 0:
        return np.nan, np.nan, np.nan

    first_row = details.iloc[0]
    alpha, num_traps, erg_gap = (
        float(first_row.get(column, np.nan))
        for column in ('alpha', 'num_traps', 'ERG_gap')
    )
    return alpha, num_traps, erg_gap

# Fraction of each training split reserved for early stopping, so that the
# number of boosting rounds is never selected on the reported test split.
VAL_SIZE = 0.20

rows = []
t0 = time.time()

# One outer iteration per selected dataset; midx is the 0-based row position in
# df_models and is folded into every derived seed.
for midx, rec in df_models.iterrows():
    dataset_uid = rec['dataset_uid']
    source = rec['source']
    name = rec.get('name', dataset_uid)

    # Loading is best-effort: a dataset that cannot be loaded is reported and skipped.
    try:
        X, y, meta = load_dataset(dataset_uid=dataset_uid, source=source, preprocess=True)
    except Exception as e:
        print('SKIP load:', dataset_uid, type(e).__name__, e)
        continue

    # The catalog filter accepts records without n_classes, so re-check here.
    if len(np.unique(y)) != 2:
        print('SKIP non-binary at load-time:', dataset_uid)
        continue

    # Outer split: the test part is only ever used to report accuracy.
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RNG + midx, stratify=y
    )
    # Inner split: the validation part drives early stopping. Previously the test
    # split was the monitored eval set, which tuned the round count on test data.
    Xfit, Xval, yfit, yval = train_test_split(
        Xtr, ytr, test_size=VAL_SIZE, random_state=RNG + midx, stratify=ytr
    )

    # NOTE(review): assumes load_dataset returns labels already encoded as 0/1,
    # as required by binary:logistic and the 0.5 threshold below — confirm.
    dfit = xgb.DMatrix(Xfit, label=yfit)
    dval = xgb.DMatrix(Xval, label=yval)
    dtest = xgb.DMatrix(Xte, label=yte)

    for hp in hparam_bank[dataset_uid]:
        hp_id = hp['hp_id']
        for rep in range(N_REPEATS):
            # Unique, reproducible seed per (dataset, setting, repeat).
            seed = RNG + midx * 10000 + hp_id * 100 + rep
            params = dict(
                objective='binary:logistic',
                eval_metric='logloss',
                tree_method='hist',
                seed=seed,
                learning_rate=hp['learning_rate'],
                max_depth=hp['max_depth'],
                min_child_weight=hp['min_child_weight'],
                subsample=hp['subsample'],
                colsample_bytree=hp['colsample_bytree'],
                gamma=hp['gamma'],
                reg_alpha=hp['reg_alpha'],
                reg_lambda=hp['reg_lambda'],
                max_delta_step=hp['max_delta_step'],
            )
            if USE_GPU:
                # Same GPU parameters as the probe in xgb_gpu_available().
                params.update(tree_method='gpu_hist', predictor='gpu_predictor')

            # Early stopping monitors the LAST entry of `evals`, i.e. the
            # validation split; the test split is not passed to training at all.
            bst = xgb.train(
                params=params,
                dtrain=dfit,
                num_boost_round=MAX_BOOST_ROUNDS,
                evals=[(dfit, 'train'), (dval, 'valid')],
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                verbose_eval=False,
            )

            # Older boosters expose best_ntree_limit; newer ones only best_iteration.
            best_ntree = bst.best_ntree_limit if hasattr(bst, 'best_ntree_limit') else bst.best_iteration + 1
            p_tr = bst.predict(dfit, iteration_range=(0, best_ntree))
            p_te = bst.predict(dtest, iteration_range=(0, best_ntree))
            tr_acc = accuracy_score(yfit, (p_tr >= 0.5).astype(int))
            te_acc = accuracy_score(yte, (p_te >= 0.5).astype(int))

            # Convert using the rows the booster was actually fitted on.
            layer_W = convert(
                model=bst,
                data=Xfit,
                labels=yfit,
                W=MATRIX,
                nfolds=5,
                t_points=120,
                random_state=seed,
                train_params=params,
                num_boost_round=int(best_ntree),
                multiclass='error',
                return_type='torch',
                verbose=False,
            )

            alpha_W, traps_W, ERG_gap_W = ww_metrics_from_layer(layer_W)

            rows.append({
                'dataset_uid': dataset_uid,
                'dataset': meta.get('name', name),
                'source': source,
                'hp_id': hp_id,
                'repeat_id': rep,
                'best_rounds': int(best_ntree),
                'train_acc': float(tr_acc),
                'test_acc': float(te_acc),
                'alpha_W': float(alpha_W),
                'traps_W': float(traps_W),
                'ERG_gap_W': float(ERG_gap_W),
                **{k: v for k, v in hp.items() if k != 'hp_id'},
            })

            # Two adjacent f-strings (implicit concatenation); the previous
            # backslash continuation inside the string literal did not parse.
            print(
                f"model={midx+1}/{N_MODELS} {dataset_uid} hp={hp_id+1}/{N_HPARAM_SETTINGS} rep={rep+1}/{N_REPEATS} "
                f"train={tr_acc:.3f} test={te_acc:.3f} alpha={alpha_W:.2f} traps={traps_W:.1f} ERG={ERG_gap_W:.2f}"
            )

            # Release the per-run model objects before the next training.
            del bst, layer_W
            gc.collect()

    # Release the per-dataset arrays before loading the next dataset.
    del X, y, Xtr, Xte, ytr, yte, Xfit, Xval, yfit, yval, dfit, dval, dtest
    gc.collect()

df_runs = pd.DataFrame(rows)
elapsed_min = (time.time() - t0) / 60.0
print(f'Finished runs: {len(df_runs)} rows in {elapsed_min:.1f} minutes')
display(df_runs.head(10))


In [None]:
#@title Aggregate repeats for error bars (per dataset, per hyperparameter setting)
if len(df_runs) == 0:
    raise RuntimeError('No runs completed.')

# (output prefix, run-level column) pairs; each gets a `_mean` and a `_std` column.
metric_columns = [
    ('train_acc', 'train_acc'),
    ('test_acc', 'test_acc'),
    ('alpha', 'alpha_W'),
    ('traps', 'traps_W'),
    ('ERG_gap', 'ERG_gap_W'),
]
agg_spec = {}
for out_prefix, run_col in metric_columns:
    agg_spec[f'{out_prefix}_mean'] = (run_col, 'mean')
    agg_spec[f'{out_prefix}_std'] = (run_col, 'std')
# Number of distinct repeats that contributed to each aggregated row.
agg_spec['repeats'] = ('repeat_id', 'nunique')

# One aggregated row per (dataset, hyperparameter setting).
runs_by_setting = df_runs.groupby(['dataset_uid', 'dataset', 'source', 'hp_id'], as_index=False)
agg = runs_by_setting.agg(**agg_spec).sort_values(['dataset_uid', 'hp_id'])

# A standard deviation that came out as NaN (e.g. a single repeat) is plotted
# as a zero-length error bar.
std_cols = [c for c in agg.columns if c.endswith('_std')]
agg[std_cols] = agg[std_cols].fillna(0.0)

display(agg.head(20))
print('Aggregated rows:', len(agg))


In [None]:
#@title Per-model plots: metric vs train/test accuracy (3 plots per model, with error bars)
# Each entry is (mean column in `agg`, std column in `agg`, axis label).
# Shared by the per-model and the combined plotting cells.
metrics = [
    ('alpha_mean', 'alpha_std', 'alpha'),
    ('ERG_gap_mean', 'ERG_gap_std', 'ERG_gap'),
    ('traps_mean', 'traps_std', 'num_traps'),
]

def plot_metric_vs_acc(df_sub, x_col, xerr_col, x_label, title_prefix=''):
    """Scatter mean train/test accuracy against one WeightWatcher metric.

    Draws a single 6x4 figure with two error-bar series (train as circles, test
    as squares). Horizontal bars use ``xerr_col``; vertical bars use the
    matching accuracy std column.

    Args:
        df_sub: Aggregated rows to plot; must contain ``x_col``, ``xerr_col``
            and the ``train_acc_*`` / ``test_acc_*`` mean and std columns.
        x_col: Column holding the metric mean (x position).
        xerr_col: Column holding the metric std (x error bar).
        x_label: Axis label, also used in the title.
        title_prefix: Text placed before ": accuracy vs <x_label>" in the title.
    """
    plt.figure(figsize=(6, 4))

    # (accuracy series, marker) — both series share the x values and x errors.
    for split, marker in (('train_acc', 'o'), ('test_acc', 's')):
        plt.errorbar(
            df_sub[x_col],
            df_sub[f'{split}_mean'],
            xerr=df_sub[xerr_col],
            yerr=df_sub[f'{split}_std'],
            fmt=marker,
            capsize=3,
            alpha=0.8,
            label=split,
        )

    plt.xlabel(x_label)
    plt.ylabel('accuracy')
    plt.title(f'{title_prefix}: accuracy vs {x_label}')
    plt.legend()
    plt.tight_layout()
    plt.show()

# One group per dataset (in order of first appearance), three figures per group.
for dataset_uid, df_sub in agg.groupby('dataset_uid', sort=False):
    # All rows of a group share the dataset name, so the first one is used.
    ds_name = df_sub['dataset'].iloc[0]
    title_prefix = f'{ds_name} ({dataset_uid})'
    for x_col, xerr_col, x_label in metrics:
        plot_metric_vs_acc(df_sub, x_col, xerr_col, x_label, title_prefix=title_prefix)


In [None]:
#@title Combined plots across all models (one for each WW metric, with error bars)
# One figure per WeightWatcher metric, pooling every (dataset, setting) row of `agg`.
for x_col, xerr_col, x_label in metrics:
    plt.figure(figsize=(7, 5))

    # Train (circles) and test (squares) accuracy share the x values and x errors.
    for split, marker in (('train_acc', 'o'), ('test_acc', 's')):
        plt.errorbar(
            agg[x_col],
            agg[f'{split}_mean'],
            xerr=agg[xerr_col],
            yerr=agg[f'{split}_std'],
            fmt=marker,
            capsize=2,
            alpha=0.6,
            label=split,
        )

    plt.xlabel(x_label)
    plt.ylabel('accuracy')
    plt.title(f'Combined across all models: accuracy vs {x_label}')
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
#@title Save run-level and aggregated results to Google Drive
# Timestamped file names keep earlier runs and sort chronologically by name,
# which the reload cell relies on to pick the latest file.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
RUNS_FEATHER = os.path.join(GDRIVE_DIR, f'{MATRIX}_multisource_hp_sweep_runs_{ts}.feather')
AGG_FEATHER = os.path.join(GDRIVE_DIR, f'{MATRIX}_multisource_hp_sweep_agg_{ts}.feather')

# Feather is documented as requiring a default index. `agg` comes out of
# sort_values() without a reset, so normalise both frames on the way out rather
# than risk losing a finished sweep to a save-time error. The in-memory frames
# are left untouched.
df_runs.reset_index(drop=True).to_feather(RUNS_FEATHER)
agg.reset_index(drop=True).to_feather(AGG_FEATHER)
print('Saved run-level:', RUNS_FEATHER, '| rows=', len(df_runs))
print('Saved aggregated:', AGG_FEATHER, '| rows=', len(agg))


In [None]:
#@title Reload latest aggregated results and quick summary table
import glob

# Aggregated files for the current MATRIX; the timestamp in the file name makes
# lexicographic order equal to chronological order.
agg_pattern = os.path.join(GDRIVE_DIR, f'{MATRIX}_multisource_hp_sweep_agg_*.feather')
files = sorted(glob.glob(agg_pattern))
if not files:
    raise FileNotFoundError(f'No aggregated files found in {GDRIVE_DIR}')

latest = files[-1]
print('Loading:', latest)
df_latest = pd.read_feather(latest)
display(df_latest.head(20))

# Collapse the per-setting rows to one row per dataset: best mean accuracies
# over the settings, and the WeightWatcher metrics averaged over the settings.
per_dataset = df_latest.groupby(['dataset_uid', 'dataset', 'source'], as_index=False)
summary = per_dataset.agg(
    best_test_acc=('test_acc_mean', 'max'),
    best_train_acc=('train_acc_mean', 'max'),
    mean_alpha=('alpha_mean', 'mean'),
    mean_erg_gap=('ERG_gap_mean', 'mean'),
    mean_traps=('traps_mean', 'mean'),
)
summary = summary.sort_values('best_test_acc', ascending=False)
print('Per-dataset summary:')
display(summary)
