# xgboost2ww Experiment (100–1000 random models, multi-source datasets)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CalculatedContent/xgbwwdata/blob/main/XGBWW_Multisource_Experiment.ipynb)

This notebook keeps the original workflow and cell structure, but **replaces direct OpenML dataset calls** with the
`xgbwwdata` package so datasets can come from multiple sources (`openml`, `pmlb`, `keel`, `libsvm`, `amlb`).

It shows how to:
- Pick a “good” XGBoost model with train-only CV
- Evaluate once on a true holdout test split
- Build W-matrices via `xgboost2ww.convert()`
- Run WeightWatcher diagnostics (`alpha`, `traps`, `ERG_gap`)

In [None]:
# Which W-matrix xgboost2ww.convert() should build: "W1" | "W2" | "W7" | "W8"
MATRIX = "W7"

# Number of datasets to keep. Starter: 100, scale up to 1000
TARGET_DATASETS = 100

# Multi-source scan (NOT OpenML-only)
DATA_SOURCES = ["openml", "pmlb", "keel", "libsvm", "amlb"]

# Scan limit handed to scan_datasets(): three candidates per wanted dataset,
# and never fewer than 300.
SCAN_LIMIT = max(TARGET_DATASETS * 3, 300)

# Fail fast on a mistyped setting. Without this check a bad MATRIX value is only
# noticed inside the experiment loop, after cross-validation and training have
# already been paid for on every dataset.
_VALID_MATRICES = ("W1", "W2", "W7", "W8")  # the options this notebook documents
if MATRIX not in _VALID_MATRICES:
    raise ValueError(f"MATRIX must be one of {_VALID_MATRICES}, got {MATRIX!r}")
if not isinstance(TARGET_DATASETS, int) or TARGET_DATASETS < 1:
    raise ValueError(f"TARGET_DATASETS must be a positive integer, got {TARGET_DATASETS!r}")

Set up a folder on Google Drive to save the final results

In [None]:
from google.colab import drive
import os
from datetime import datetime

# Directory on the mounted Drive that receives the result files of every run.
GDRIVE_DIR = "/content/drive/MyDrive/xgboost2ww_runs"

# Mount Google Drive (an already-mounted Drive is not remounted), then make
# sure the results directory exists before anything tries to write into it.
drive.mount("/content/drive", force_remount=False)
os.makedirs(GDRIVE_DIR, exist_ok=True)

print("Saving results under:", GDRIVE_DIR)

Install dependencies

In [None]:
# System deps: git is required for the fresh clone of xgbwwdata further down
!apt-get -qq update && apt-get -qq install -y git

# Python deps for this notebook
# NOTE: only pandas is pinned here; the other packages resolve to whatever
# version pip selects at install time.
%pip install -q -U pip setuptools wheel
%pip install -q "pandas==2.2.2" xgboost weightwatcher scikit-learn scipy pyarrow xgboost2ww

# Install xgbwwdata from fresh clone (same flow as README / XGBDataTest)
# Any earlier clone is deleted first, so the install script always runs
# against a newly cloned copy of the repository.
!rm -rf /content/repo_xgbwwdata
!git clone https://github.com/CalculatedContent/xgbwwdata.git /content/repo_xgbwwdata
%run /content/repo_xgbwwdata/scripts/colab_install.py --repo /content/repo_xgbwwdata

# Sanity check: import both packages and print the file each was loaded from
import xgboost2ww
import xgbwwdata
print("xgboost2ww:", getattr(xgboost2ww, "__file__", None))
print("xgbwwdata:", getattr(xgbwwdata, "__file__", None))


Imports and settings

In [None]:
import gc
import time
import warnings
import zlib

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

import torch
import weightwatcher as ww

from xgbwwdata import Filters, scan_datasets, load_dataset, enable_logging
from xgboost2ww import convert

# --- Reproducibility -------------------------------------------------------
RNG = 0                           # base seed shared by every seeded call below
rng = np.random.default_rng(RNG)  # notebook-level NumPy generator

# --- Split / cross-validation / trajectory ---------------------------------
TEST_SIZE = 0.2   # fraction of each dataset held out as the test split
NFOLDS = 5        # fold count used for CV and passed to convert()
T_TRAJ = 160      # passed to convert() as t_points

# --- Dataset size guards ----------------------------------------------------
MAX_ROWS = 60_000
MAX_FEATURES_GUARD = 50_000
MAX_DENSE_ELEMENTS = 200_000_000  # rows * features ceiling

# --- "Good model" search ----------------------------------------------------
GOOD_TRIALS = 5            # random hyper-parameter draws per dataset
CV_MAX_ROUNDS = 3_000      # upper bound on boosting rounds during CV
CV_EARLY_STOP = 150        # early-stopping patience (rounds) during CV
MIN_GOOD_TEST_ACC = 0.75   # datasets whose holdout accuracy is lower are dropped

Optional: GPU detection for XGBoost

In [None]:
def xgb_gpu_available() -> bool:
    """Report whether a tiny XGBoost model can be trained with the GPU settings.

    A 256x8 random binary problem is trained for five rounds with
    ``tree_method="gpu_hist"`` / ``predictor="gpu_predictor"``. Any exception
    raised along the way is treated as "no usable GPU".

    Returns:
        True if the probe training finished, False if anything raised.
    """
    # NOTE(review): XGBoost >= 2.0 documents device="cuda" with
    # tree_method="hist" as the replacement for gpu_hist / gpu_predictor.
    # Confirm the installed version still accepts the legacy names; if it
    # rejects them this probe reports False and the notebook runs on CPU.
    try:
        features = np.random.randn(256, 8).astype(np.float32)
        labels = (features[:, 0] > 0).astype(np.int32)
        probe_data = xgb.DMatrix(features, label=labels)
        probe_params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "tree_method": "gpu_hist",
            "predictor": "gpu_predictor",
            "max_depth": 2,
            "learning_rate": 0.2,
            "seed": RNG,
        }
        xgb.train(
            params=probe_params,
            dtrain=probe_data,
            num_boost_round=5,
            verbose_eval=False,
        )
    except Exception:
        return False
    else:
        return True


# Probe once; the result is reused whenever training parameters are built.
USE_GPU = xgb_gpu_available()
print("XGBoost GPU available:", USE_GPU)

## Discover datasets with xgbwwdata (multi-source)

In [None]:
enable_logging()

# Size limits applied to candidate datasets; the same `filters` object is
# reused later when each dataset is actually loaded.
filters = Filters(
    min_rows=200,
    max_rows=MAX_ROWS,
    max_features=MAX_FEATURES_GUARD,
    max_dense_elements=MAX_DENSE_ELEMENTS,
    preprocess=True,
)

# Build the registry of candidate datasets across every configured source.
# NOTE(review): smoke_train=True presumably makes the scan train a quick model
# per candidate -- confirm against the xgbwwdata documentation.
df_registry = scan_datasets(
    sources=DATA_SOURCES,
    limit=SCAN_LIMIT,
    filters=filters,
    smoke_train=True,
    random_state=RNG,
    log_every=25,
)

print("Candidates found:", len(df_registry))
display(df_registry.head(10))

# The experiment loop treats "source" as optional (rec.get("source", "unknown")),
# so do not assume the column exists here either, e.g. for an empty registry.
if "source" in df_registry.columns:
    print(df_registry["source"].value_counts(dropna=False))
else:
    print("Registry has no 'source' column; per-source counts unavailable.")

Pick a “good” XGBoost model using training-only CV

In [None]:
def pick_good_params_via_cv(Xtr, ytr, nfold=5, *, dataset_seed: int):
    """Random-search XGBoost hyper-parameters using stratified CV on training data.

    ``GOOD_TRIALS`` parameter sets are drawn from a generator seeded with
    ``RNG + dataset_seed``. Each set is scored with ``xgb.cv`` (early stopping
    after ``CV_EARLY_STOP`` rounds, at most ``CV_MAX_ROUNDS`` rounds) and the
    set with the lowest final ``test-logloss-mean`` wins.

    Args:
        Xtr: Training features, passed straight to ``xgb.DMatrix``.
        ytr: Training labels for the ``binary:logistic`` objective.
        nfold: Number of CV folds.
        dataset_seed: Per-dataset offset added to ``RNG`` to seed the sampler.

    Returns:
        ``(params, rounds, score)`` of the best trial, where ``rounds`` is the
        length of the CV history and ``score`` its last ``test-logloss-mean``;
        ``None`` if no trial scored below infinity.
    """
    dtrain = xgb.DMatrix(Xtr, label=ytr)
    sampler = np.random.default_rng(RNG + int(dataset_seed))

    winner = None
    lowest_logloss = np.inf

    for _trial in range(GOOD_TRIALS):
        # The draws below must stay in exactly this order: the sequence of
        # random numbers consumed determines every sampled value.
        learning_rate = float(10 ** sampler.uniform(-2.0, -0.6))
        max_depth = int(sampler.integers(2, 7))
        min_child_weight = float(10 ** sampler.uniform(0.0, 2.0))
        subsample = float(sampler.uniform(0.6, 0.9))
        colsample_bytree = float(sampler.uniform(0.6, 0.9))
        reg_lambda = float(10 ** sampler.uniform(0.0, 2.0))
        gamma = float(sampler.uniform(0.0, 0.5))

        candidate = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "tree_method": "gpu_hist" if USE_GPU else "hist",
            "seed": RNG,
            "learning_rate": learning_rate,
            "max_depth": max_depth,
            "min_child_weight": min_child_weight,
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
            "reg_lambda": reg_lambda,
            "gamma": gamma,
        }
        if USE_GPU:
            candidate["predictor"] = "gpu_predictor"

        history = xgb.cv(
            params=candidate,
            dtrain=dtrain,
            num_boost_round=CV_MAX_ROUNDS,
            nfold=nfold,
            stratified=True,
            early_stopping_rounds=CV_EARLY_STOP,
            seed=RNG,
            verbose_eval=False,
        )

        final_logloss = float(history["test-logloss-mean"].iloc[-1])
        n_rounds = int(len(history))

        # Strict "<" keeps the earliest trial when two trials tie.
        if final_logloss < lowest_logloss:
            lowest_logloss = final_logloss
            winner = (candidate, n_rounds, final_logloss)

    return winner


def train_eval_fulltrain(Xtr, ytr, Xte, yte, params, rounds):
    """Fit one booster on the full training split and score it on both splits.

    Args:
        Xtr, ytr: Training features and labels.
        Xte, yte: Holdout features and labels.
        params: XGBoost parameter dict passed to ``xgb.train``.
        rounds: Number of boosting rounds.

    Returns:
        ``(train_acc, test_acc, test_loss, bst)``: accuracies at a 0.5
        probability threshold, the holdout log-loss, and the trained booster.
    """
    train_matrix = xgb.DMatrix(Xtr, label=ytr)
    test_matrix = xgb.DMatrix(Xte, label=yte)

    bst = xgb.train(
        params=params,
        dtrain=train_matrix,
        num_boost_round=rounds,
        verbose_eval=False,
    )

    def _positive_probability(matrix):
        # Raw margins are requested and pushed through the logistic function here.
        margins = bst.predict(matrix, output_margin=True).astype(np.float32)
        return 1.0 / (1.0 + np.exp(-margins))

    proba_train = _positive_probability(train_matrix)
    proba_test = _positive_probability(test_matrix)

    predicted_train = (proba_train >= 0.5).astype(int)
    predicted_test = (proba_test >= 0.5).astype(int)

    train_acc = float(accuracy_score(ytr, predicted_train))
    test_acc = float(accuracy_score(yte, predicted_test))

    # log_loss wants one column per class: [P(class 0), P(class 1)].
    class_probabilities = np.vstack((1 - proba_test, proba_test)).T
    test_loss = float(log_loss(yte, class_probabilities, labels=[0, 1]))

    return train_acc, test_acc, test_loss, bst

WeightWatcher helper

In [None]:
def ww_metrics_from_layer(layer):
    """Run WeightWatcher on one layer and return its headline diagnostics.

    Args:
        layer: Model object handed to ``ww.WeightWatcher(model=...)``.

    Returns:
        ``(alpha, traps, ERG_gap)`` as floats, read from the first row of the
        ``analyze()`` details frame (columns ``alpha``, ``num_traps``,
        ``ERG_gap``). A value is NaN when its column is missing or when the
        details frame has no rows.
    """
    watcher = ww.WeightWatcher(model=layer)
    details_df = watcher.analyze(randomize=True, ERG=True, plot=False)

    def _first_value(column):
        # An empty (or missing) details frame previously raised IndexError on
        # .iloc[0]; treat it like a missing column and report NaN instead.
        if details_df is None or len(details_df) == 0:
            return np.nan
        if column not in details_df.columns:
            return np.nan
        return float(details_df[column].iloc[0])

    alpha = _first_value("alpha")
    traps = _first_value("num_traps")
    ERG_gap = _first_value("ERG_gap")
    return alpha, traps, ERG_gap

Run the experiment (xgbwwdata registry + load_dataset)

In [None]:
def _stable_seed(dataset_uid) -> int:
    """Derive a per-dataset seed that is identical across kernel restarts.

    The built-in ``hash()`` is salted per interpreter process for strings, so
    it cannot be used for reproducible seeding. CRC32 of the uid text is stable.
    """
    return zlib.crc32(str(dataset_uid).encode("utf-8")) % (2**31 - 1)


def _evaluate_dataset(rec):
    """Load, split, tune, train and diagnose a single registry entry.

    Args:
        rec: One registry row; must provide ``dataset_uid`` and may provide
            ``source`` and ``name``.

    Returns:
        A result-row dict when the dataset is binary, passes the size guards,
        reaches ``MIN_GOOD_TEST_ACC`` on the holdout split and yields
        WeightWatcher metrics. ``None`` otherwise; every failure prints a
        ``SKIP ...`` line so one bad dataset never aborts the whole run.
    """
    dataset_uid = rec["dataset_uid"]
    try:
        X, y, meta = load_dataset(dataset_uid, filters=filters)
    except Exception as e:
        print("SKIP load:", dataset_uid, type(e).__name__, e)
        return None

    # Binary classification only. Labels are re-encoded to {0, 1} because the
    # binary:logistic objective and the 0/1 predictions compared in
    # train_eval_fulltrain both require that encoding; {0, 1} input is unchanged.
    classes, y = np.unique(np.asarray(y).ravel(), return_inverse=True)
    if len(classes) != 2:
        return None
    y = y.reshape(-1).astype(np.int32)

    if int(X.shape[1]) > MAX_FEATURES_GUARD:
        return None

    is_sparse = hasattr(X, "tocsr")
    if is_sparse:
        # Convert BEFORE slicing rows: not every sparse format supports indexing.
        X = X.tocsr()

    try:
        tr_idx, te_idx = train_test_split(
            np.arange(len(y)),
            test_size=TEST_SIZE,
            random_state=RNG,
            stratify=y,
        )
    except ValueError as e:
        # e.g. a class with too few members to stratify
        print("SKIP split:", dataset_uid, type(e).__name__, e)
        return None

    Xtr, Xte = X[tr_idx], X[te_idx]
    ytr, yte = y[tr_idx], y[te_idx]

    if is_sparse:
        if int(Xtr.shape[0]) * int(Xtr.shape[1]) > MAX_DENSE_ELEMENTS:
            # conservative guard for convert() if densification happens internally
            return None
        Xtr = Xtr.astype(np.float32)
        Xte = Xte.astype(np.float32)
    else:
        Xtr = np.asarray(Xtr, dtype=np.float32)
        Xte = np.asarray(Xte, dtype=np.float32)

    try:
        good_params, good_rounds, good_cv_logloss = pick_good_params_via_cv(
            Xtr, ytr, nfold=NFOLDS, dataset_seed=_stable_seed(dataset_uid)
        )
        good_train_acc, good_test_acc, good_test_loss, bst = train_eval_fulltrain(
            Xtr, ytr, Xte, yte, good_params, good_rounds
        )
    except Exception as e:
        print("SKIP train:", dataset_uid, type(e).__name__, e)
        return None

    if good_test_acc < MIN_GOOD_TEST_ACC:
        return None

    try:
        layer_W = convert(
            model=bst,
            data=Xtr,
            labels=ytr,
            W=MATRIX,
            nfolds=NFOLDS,
            t_points=T_TRAJ,
            random_state=RNG,
            train_params=good_params,
            num_boost_round=good_rounds,
            multiclass="error",
            return_type="torch",
            verbose=False,
        )
    except Exception as e:
        print("SKIP convert:", dataset_uid, type(e).__name__, e)
        return None

    try:
        alpha_W, traps_W, ERG_gap_W = ww_metrics_from_layer(layer_W)
    except Exception as e:
        print("SKIP weightwatcher:", dataset_uid, type(e).__name__, e)
        return None

    return dict(
        dataset_uid=dataset_uid,
        source=rec.get("source", "unknown"),
        dataset=meta.get("name", rec.get("name", dataset_uid)),
        n_rows_total=int(X.shape[0]),
        n_train=int(Xtr.shape[0]),
        n_test=int(Xte.shape[0]),
        n_features=int(X.shape[1]),
        rounds=int(good_rounds),
        cv_logloss=float(good_cv_logloss),
        good_train_acc=float(good_train_acc),
        good_test_acc=float(good_test_acc),
        good_test_loss=float(good_test_loss),
        alpha_W=float(alpha_W),
        traps_W=float(traps_W),
        ERG_gap_W=float(ERG_gap_W),
    )


def _plot_accuracy_batch(batch_rows, kept_so_far):
    """Draw a grouped train/test accuracy bar chart for the latest ten result rows."""
    batch = pd.DataFrame(batch_rows).reset_index(drop=True)
    x = np.arange(len(batch))
    w = 0.4

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.bar(x - w / 2, batch["good_test_acc"].values, width=w, color="blue", label="test accuracy")
    ax.bar(x + w / 2, batch["good_train_acc"].values, width=w, color="red", label="training accuracy")
    ax.set_xticks(x)
    ax.set_xticklabels(batch["dataset"].values, rotation=90)
    ax.set_ylim(0.0, 1.0)
    ax.set_ylabel("accuracy")
    ax.set_title(f"Train/Test accuracy for models {kept_so_far - 9}-{kept_so_far}")
    ax.legend()
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # many figures are produced over a long run; release each one


rows = []
kept = 0
t0 = time.time()

# Visit the registry in a seeded random order, so the ordering produced by
# scan_datasets() does not decide which datasets end up being kept.
for _, rec in df_registry.sample(frac=1.0, random_state=RNG).iterrows():
    if kept >= TARGET_DATASETS:
        break

    row = _evaluate_dataset(rec)
    gc.collect()  # the per-dataset arrays and booster went out of scope above
    if row is None:
        continue

    rows.append(row)
    kept += 1
    elapsed = (time.time() - t0) / 60.0
    print(
        f"[{kept}/{TARGET_DATASETS}] {row['dataset']} ({row['dataset_uid']}) "
        f"| src={row['source']} | train/test={row['good_train_acc']:.3f}/{row['good_test_acc']:.3f} "
        f"| α(W)={row['alpha_W']:.2f} traps(W)={row['traps_W']:.1f} | elapsed={elapsed:.1f} min",
        flush=True,
    )

    if kept % 10 == 0:
        _plot_accuracy_batch(rows[-10:], kept)

df_good = pd.DataFrame(rows)
print(f"DONE. datasets_kept={df_good['dataset_uid'].nunique() if len(df_good) else 0} rows={len(df_good)}")
display(df_good.head(20))

Plots

In [None]:
import matplotlib.pyplot as plt

if len(df_good) == 0:
    print("No datasets kept. Try lowering MIN_GOOD_TEST_ACC.")
else:
    x = np.arange(len(df_good))
    dataset_labels = df_good["dataset"].values

    # One line plot per WeightWatcher metric, datasets along the x axis.
    for metric_col, y_label, plot_title in (
        ("alpha_W", "alpha", "Alpha across datasets"),
        ("traps_W", "traps", "Traps across datasets"),
    ):
        plt.figure()
        plt.plot(x, df_good[metric_col].values, label=MATRIX)
        plt.xticks(x, dataset_labels, rotation=90)
        plt.ylabel(y_label)
        plt.title(plot_title)
        plt.legend()
        plt.tight_layout()
        plt.show()

    # Holdout accuracy against alpha, one point per kept dataset.
    holdout_acc = df_good["good_test_acc"].values
    alpha_values = df_good["alpha_W"].values
    plt.figure()
    plt.scatter(holdout_acc, alpha_values)
    plt.xlabel("Holdout test accuracy")
    plt.ylabel("alpha_W")
    plt.title(f"Holdout accuracy vs alpha({MATRIX})")
    plt.tight_layout()
    plt.show()

Additional structural diagnostics

In [None]:
if len(df_good) == 0:
    print("No results to plot.")
else:
    # Histogram of each diagnostic; NaN entries are dropped before binning.
    for metric_col, hist_title in (
        ("alpha_W", f"Histogram of alpha({MATRIX})"),
        ("traps_W", f"Histogram of traps({MATRIX})"),
        ("ERG_gap_W", f"Histogram of ERG_gap({MATRIX})"),
    ):
        plt.figure()
        plt.hist(df_good[metric_col].dropna().values, bins=30)
        plt.xlabel(metric_col)
        plt.ylabel("count")
        plt.title(hist_title)
        plt.tight_layout()
        plt.show()

    # alpha against ERG_gap, with reference lines at alpha = 2 and ERG_gap = 0.
    plt.figure()
    plt.scatter(df_good["alpha_W"].values, df_good["ERG_gap_W"].values)
    plt.xlabel("alpha_W")
    plt.ylabel("ERG_gap_W")
    plt.axvline(x=2.0, color="red")
    plt.axhline(y=0.0, color="orange")
    plt.title(f"alpha({MATRIX}) vs ERG_gap({MATRIX})")
    plt.tight_layout()
    plt.show()

Save results to Google Drive

In [None]:
# Second-resolution timestamp in the file name, so runs saved at different
# times end up in different files.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
results_filename = f"{MATRIX}_multisource_results_{ts}.feather"
RESULTS_FEATHER = os.path.join(GDRIVE_DIR, results_filename)

# Persist the kept-dataset table to the Drive folder.
df_good.to_feather(RESULTS_FEATHER)
print(f"Saved {len(df_good)} rows to: {RESULTS_FEATHER}")

Reload the most recent results file from Google Drive and plot it

In [None]:
import glob

# Every results file written for the selected matrix, in lexicographic order.
results_pattern = os.path.join(GDRIVE_DIR, f"{MATRIX}_multisource_results_*.feather")
files = sorted(glob.glob(results_pattern))
if len(files) == 0:
    raise FileNotFoundError(f"No {MATRIX}_multisource_results_*.feather files found in {GDRIVE_DIR}")

# The timestamp in the name sorts chronologically, so the last entry is the newest.
RESULTS_FEATHER = files[-1]
print("Loading:", RESULTS_FEATHER)

df = pd.read_feather(RESULTS_FEATHER)
print("Rows:", len(df), "| Cols:", len(df.columns))
display(df.head(10))

# Best holdout accuracy first (used by the summary table at the end).
if "good_test_acc" in df.columns:
    df = df.sort_values("good_test_acc", ascending=False)

alpha_col = "alpha_W"
traps_col = "traps_W"

# Distribution of each diagnostic, NaN entries dropped.
for hist_col, hist_title in (
    (alpha_col, f"Distribution of alpha({MATRIX})"),
    (traps_col, f"Distribution of traps({MATRIX})"),
):
    plt.figure()
    plt.hist(df[hist_col].dropna().values, bins=30)
    plt.title(hist_title)
    plt.xlabel(hist_col)
    plt.ylabel("count")
    plt.tight_layout()
    plt.show()

# alpha against holdout accuracy.
plt.figure()
plt.scatter(df["good_test_acc"].values, df[alpha_col].values)
plt.xlabel("good_test_acc")
plt.ylabel(alpha_col)
plt.title(f"alpha({MATRIX}) vs test accuracy")
plt.tight_layout()
plt.show()

# alpha against the train-minus-test accuracy gap.
if "good_train_acc" in df.columns:
    gap = df["good_train_acc"].values - df["good_test_acc"].values
    plt.figure()
    plt.scatter(gap, df[alpha_col].values)
    plt.xlabel("train - test accuracy gap")
    plt.ylabel(alpha_col)
    plt.title(f"alpha({MATRIX}) vs generalization gap")
    plt.tight_layout()
    plt.show()

# Summary table restricted to the columns that are actually present.
preferred_cols = [
    "dataset",
    "dataset_uid",
    "source",
    "good_train_acc",
    "good_test_acc",
    alpha_col,
    traps_col,
    "rounds",
]
summary_cols = [c for c in preferred_cols if c in df.columns]
print("Top 15 by test accuracy:")
display(df[summary_cols].head(15))