<a href="https://colab.research.google.com/github/CalculatedContent/xgboost2ww/blob/main/notebooks/XGBWW_Catalog_SingleSource10x10_XGBoost_Accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBWW catalog single-source 10x10 XGBoost benchmark

This notebook:
1. Loads the dataset catalog checkpoint.
2. Selects **one source** and samples **10 random classification datasets** from that source.
3. Trains XGBoost with **10 hyperparameter settings per dataset** (100 total runs) and evaluates each run on a hold-out test split.
4. Runs `xgboost2ww` + `WeightWatcher.analyze(ERG=True, randomize=True)` on each trained model.
5. Tracks `alpha`, `ERG_gap`, and `rand_distance` against test accuracy.
6. Produces scatter plots per model (one "model" per sampled dataset, across its 10 hyperparameter runs) and then a combined multi-model view.
7. Saves all outputs to an experiment checkpoint directory on Google Drive.

## 1) Mount Google Drive and configure experiment paths

In [None]:
from google.colab import drive
from pathlib import Path
from datetime import datetime
import json

# Make Google Drive visible under /content/drive; every path below lives there.
drive.mount('/content/drive')

# ----------------------------------------------------------------------
# User-tunable settings: adjust these before running the notebook.
# ----------------------------------------------------------------------
# Input catalog and the folder that collects one sub-directory per run.
CATALOG_CSV = Path('/content/drive/MyDrive/xgbwwdata/catalog_checkpoint/dataset_catalog.csv')
EXPERIMENT_ROOT = Path('/content/drive/MyDrive/xgbwwdata/experiment_checkpoints')
EXPERIMENT_NAME = 'single_source_10models_10hp_xgboost_accuracy'

# Reproducibility and hold-out split fraction.
RANDOM_SEED = 42
TEST_SIZE = 0.20

# Source selection and experiment size.
TARGET_SOURCE = None  # Example: 'openml'. If None, auto-pick a source with >= MODELS_PER_SOURCE datasets.
MODELS_PER_SOURCE = 10
HP_SETTINGS_PER_MODEL = 10
# ----------------------------------------------------------------------

# Each run gets its own timestamped checkpoint directory: <name>_<YYYYmmdd_HHMMSS>.
EXPERIMENT_ID = '_'.join([EXPERIMENT_NAME, datetime.now().strftime('%Y%m%d_%H%M%S')])
CHECKPOINT_DIR = EXPERIMENT_ROOT.joinpath(EXPERIMENT_ID)
PLOTS_DIR = CHECKPOINT_DIR.joinpath('plots')
# Creating the plots folder with parents=True also creates CHECKPOINT_DIR.
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print('Catalog path:', CATALOG_CSV)
print('Checkpoint directory:', CHECKPOINT_DIR)

## 2) Install dependencies

In [None]:
# Remove any previous clone so the clone below always starts from a fresh directory.
!rm -rf /content/repo_xgbwwdata
!git clone https://github.com/CalculatedContent/xgbwwdata.git /content/repo_xgbwwdata
# Run the Colab install script shipped in the cloned repo, pointing it at that clone.
%run /content/repo_xgbwwdata/scripts/colab_install.py --repo /content/repo_xgbwwdata

# Install the remaining third-party packages into the kernel's environment.
# NOTE(review): versions are unpinned, so results may drift between runs — consider pinning.
%pip install -q openml pmlb keel-ds xgboost scikit-learn xgboost2ww weightwatcher

## 3) Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import weightwatcher as ww

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgbwwdata import Filters, load_dataset
from xgboost2ww import convert

## 4) Load catalog and sample 10 random datasets from one source

In [None]:
# Fail fast when the catalog checkpoint has not been produced yet.
if not CATALOG_CSV.exists():
    raise FileNotFoundError(f'Catalog not found: {CATALOG_CSV}. Run dataset catalog checkpoint notebook first.')

df_catalog = pd.read_csv(CATALOG_CSV)
print('Catalog shape:', df_catalog.shape)

# Only these columns are strictly required; 'name' is treated as optional below.
required_cols = {'dataset_uid', 'source', 'task_type'}
missing = required_cols - set(df_catalog.columns)
if missing:
    raise ValueError(f'Catalog is missing required columns: {missing}')

# Keep rows whose task_type mentions "classification" (case-insensitive).
df_cls = df_catalog[df_catalog['task_type'].astype(str).str.contains('classification', case=False, na=False)].copy()
if df_cls.empty:
    raise ValueError('No classification datasets found in catalog.')

# Unique classification datasets per source, largest source first.
source_counts = df_cls.groupby('source')['dataset_uid'].nunique().sort_values(ascending=False)
display(source_counts.to_frame('n_classification_datasets'))

if TARGET_SOURCE is None:
    # Auto-pick: the source with the most datasets among those with enough of them.
    eligible = source_counts[source_counts >= MODELS_PER_SOURCE]
    if eligible.empty:
        raise ValueError(f'No source has at least {MODELS_PER_SOURCE} classification datasets.')
    chosen_source = eligible.index[0]
else:
    chosen_source = TARGET_SOURCE

source_pool = df_cls[df_cls['source'] == chosen_source].drop_duplicates('dataset_uid').copy()
if len(source_pool) < MODELS_PER_SOURCE:
    raise ValueError(f"Source '{chosen_source}' has only {len(source_pool)} datasets; need {MODELS_PER_SOURCE}.")

# Seeded sampling so the same datasets are picked on every run.
df_pick = source_pool.sample(n=MODELS_PER_SOURCE, random_state=RANDOM_SEED).reset_index(drop=True)
print(f"Chosen source: {chosen_source}")
print(f"Selected {len(df_pick)} datasets from source='{chosen_source}'")

# 'name' is not a required catalog column, so only show the columns that exist
# instead of raising KeyError after the sampling has already succeeded.
display_cols = [c for c in ['source', 'dataset_uid', 'name', 'task_type'] if c in df_pick.columns]
display(df_pick[display_cols].sort_values('dataset_uid'))

## 5) Define robust hyperparameter settings (10 per model)

In [None]:
# Moderately regularized settings intended to limit severe overfitting while
# still spanning a useful range of learning rates, depths and sampling ratios.
# Each tuple lists one setting in the column order given to zip() below.
hyperparameter_grid = [
    dict(zip(
        ('learning_rate', 'max_depth', 'min_child_weight', 'subsample',
         'colsample_bytree', 'reg_lambda', 'reg_alpha'),
        values,
    ))
    for values in (
        (0.03, 3, 3.0, 0.80, 0.80, 3.0, 0.0),
        (0.05, 4, 2.0, 0.85, 0.85, 2.0, 0.0),
        (0.07, 4, 3.0, 0.90, 0.90, 2.0, 0.0),
        (0.05, 5, 4.0, 0.80, 0.85, 4.0, 0.05),
        (0.08, 5, 3.0, 0.85, 0.80, 3.0, 0.10),
        (0.04, 6, 5.0, 0.75, 0.75, 5.0, 0.10),
        (0.06, 6, 4.0, 0.80, 0.80, 4.0, 0.15),
        (0.10, 4, 2.0, 0.90, 0.90, 1.5, 0.00),
        (0.03, 7, 6.0, 0.70, 0.70, 6.0, 0.20),
        (0.09, 5, 2.0, 0.85, 0.85, 2.5, 0.05),
    )
]

# The grid size must agree with the configured number of settings per model.
assert len(hyperparameter_grid) == HP_SETTINGS_PER_MODEL
pd.DataFrame(hyperparameter_grid)

## 6) Train 10x10 models, evaluate hold-out accuracy, and compute WW metrics

In [None]:
# Dataset size limits handed to load_dataset() for every picked dataset.
# NOTE(review): the exact meaning of each limit is defined by xgbwwdata.Filters — confirm there.
filters = Filters(min_rows=200, max_rows=60000, max_features=50000, max_dense_elements=int(2e8))


def pick_metric(details_df: pd.DataFrame, candidates):
    """Return the first non-null value from the first usable candidate column.

    Candidates are tried in order. A column is usable when it exists in
    ``details_df`` and holds at least one non-null entry; the first such
    entry is returned as a ``float``. When no candidate is usable the result
    is ``np.nan``.
    """
    for column in candidates:
        if column not in details_df.columns:
            continue
        non_null = details_df[column].dropna()
        if non_null.empty:
            continue
        return float(non_null.iloc[0])
    return np.nan


def train_eval_one_setting(X_train, X_test, y_train, y_test, n_classes, hp, seed):
    """Train one XGBoost model, score it, and compute WeightWatcher metrics.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by ``xgb.DMatrix``.
    y_train, y_test : integer-encoded class labels for the two splits.
    n_classes : number of distinct classes; 2 selects the binary objective,
        anything else the multi-class soft-probability objective.
    hp : dict of XGBoost hyperparameters merged on top of the base params.
    seed : seed passed to XGBoost and to ``convert``.

    Returns
    -------
    dict with ``train_accuracy``, ``test_accuracy``, ``overfit_gap``
    (train minus test accuracy), ``best_iteration`` (number of boosting
    rounds up to and including the best one), and the WeightWatcher metrics
    ``alpha``, ``ERG_gap`` and ``rand_distance`` (``np.nan`` when absent).
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    is_binary = n_classes == 2
    if is_binary:
        base_params = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}
    else:
        base_params = {'objective': 'multi:softprob', 'num_class': n_classes, 'eval_metric': 'mlogloss'}
    # hp is merged last so a grid entry can override any base parameter.
    params = {**base_params, 'tree_method': 'hist', 'seed': seed, **hp}

    # NOTE(review): dtest is part of the early-stopping evals, so the stopping
    # round is chosen using the same split that test_accuracy is reported on,
    # which makes test_accuracy optimistically biased. Consider a separate
    # validation split for early stopping.
    evals = [(dtrain, 'train'), (dtest, 'test')]
    bst = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=1200,
        evals=evals,
        early_stopping_rounds=40,
        verbose_eval=False,
    )

    # Number of rounds up to and including the best one; used both for
    # prediction and for the model handed to convert(), so the accuracies and
    # the WeightWatcher metrics describe the same model.
    best_round = int(bst.best_iteration + 1)

    # Predict explicitly with the best-iteration trees rather than relying on
    # the XGBoost version's default, which may include the extra rounds that
    # were trained after the best one.
    train_prob = bst.predict(dtrain, iteration_range=(0, best_round))
    test_prob = bst.predict(dtest, iteration_range=(0, best_round))

    if is_binary:
        # binary:logistic yields one probability per row; threshold at 0.5.
        yhat_train = (train_prob >= 0.5).astype(int)
        yhat_test = (test_prob >= 0.5).astype(int)
    else:
        # multi:softprob yields one probability per class; take the most likely.
        yhat_train = np.argmax(train_prob, axis=1)
        yhat_test = np.argmax(test_prob, axis=1)

    train_acc = accuracy_score(y_train, yhat_train)
    test_acc = accuracy_score(y_test, yhat_test)

    # Build the layer object that WeightWatcher analyzes from the trained booster.
    layer = convert(
        bst,
        X_train,
        y_train,
        W='W7',
        return_type='torch',
        nfolds=5,
        t_points=40,
        random_state=seed,
        train_params=params,
        num_boost_round=best_round,
    )

    watcher = ww.WeightWatcher(model=layer)
    details_df = watcher.analyze(ERG=True, randomize=True, plot=False)

    alpha = pick_metric(details_df, ['alpha'])
    erg_gap = pick_metric(details_df, ['ERG_gap'])
    # Several column names are tried for the randomized-distance metric.
    rand_distance = pick_metric(details_df, ['rand_distance', 'random_distance', 'rand_dist'])

    return {
        'train_accuracy': float(train_acc),
        'test_accuracy': float(test_acc),
        'overfit_gap': float(train_acc - test_acc),
        'best_iteration': best_round,
        'alpha': alpha,
        'ERG_gap': erg_gap,
        'rand_distance': rand_distance,
    }


# Accumulators: one dict per successful run, one per failed run or skipped dataset.
results, errors = [], []

# One "model" per picked dataset; each dataset is trained with every grid entry.
for model_idx, row in df_pick.reset_index(drop=True).iterrows():
    dataset_uid, source = row['dataset_uid'], row['source']

    try:
        X, y, _meta = load_dataset(dataset_uid, filters=filters)
        y = np.asarray(y)
        # Re-encode the labels as integer codes 0..n_classes-1.
        classes, y_enc = np.unique(y, return_inverse=True)
        n_classes = len(classes)
        if n_classes < 2:
            raise ValueError('dataset has <2 classes')

        # Single stratified hold-out split shared by all settings of this dataset.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_enc, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_enc
        )

        for hp_idx, hp in enumerate(hyperparameter_grid):
            # Seed varies with both the dataset index and the setting index.
            run_seed = RANDOM_SEED + model_idx * 100 + hp_idx
            try:
                out = train_eval_one_setting(X_train, X_test, y_train, y_test, n_classes, hp, run_seed)
                record = {
                    'source': source,
                    'dataset_uid': dataset_uid,
                    'dataset_name': row.get('name', str(dataset_uid)),
                    'model_index': int(model_idx),
                    'hp_index': int(hp_idx),
                    'n_classes': int(n_classes),
                    'n_train': int(X_train.shape[0]),
                    'n_test': int(X_test.shape[0]),
                    'n_features': int(X_train.shape[1]),
                    **out,
                    'params_json': json.dumps(hp, sort_keys=True),
                }
                results.append(record)
                print(f"[OK] dataset={dataset_uid} hp={hp_idx} test_acc={out['test_accuracy']:.4f} gap={out['overfit_gap']:.4f}")
            except Exception as run_exc:
                # A failing setting is recorded and the remaining settings still run.
                errors.append({'dataset_uid': dataset_uid, 'hp_index': hp_idx, 'error': str(run_exc)})
                print(f"[ERR] dataset={dataset_uid} hp={hp_idx}: {run_exc}")

    except Exception as dataset_exc:
        # Loading or splitting failed: record it and skip the whole dataset.
        errors.append({'dataset_uid': dataset_uid, 'hp_index': None, 'error': str(dataset_exc)})
        print(f"[SKIP] dataset={dataset_uid}: {dataset_exc}")

results_df, errors_df = pd.DataFrame(results), pd.DataFrame(errors)

print('Successful runs:', len(results_df))
print('Failed runs:', len(errors_df))

# Preview whichever of the two frames has content, results first.
for preview_df in (results_df, errors_df):
    if not preview_df.empty:
        display(preview_df.head())

## 7) Select best hyperparameter setting per model and persist checkpoint files

In [None]:
if results_df.empty:
    print('No successful runs to summarize.')
else:
    # Rank runs within each model by highest test accuracy, then smallest
    # overfit gap, and keep the top row of each model.
    # drop_duplicates keeps whole rows. groupby(...).first() would instead take
    # the first non-null value of each column separately, so a NaN metric on
    # the best run would be silently filled from a different run.
    best_per_model = (
        results_df
        .sort_values(['model_index', 'test_accuracy', 'overfit_gap'], ascending=[True, False, True])
        .drop_duplicates(subset='model_index', keep='first')
        .sort_values('model_index')
        .reset_index(drop=True)
    )
    # Keep 'model_index' as the leading column of the saved CSV.
    best_per_model = best_per_model[
        ['model_index'] + [c for c in best_per_model.columns if c != 'model_index']
    ]

    print('Best hyperparameter setting for each model (based on hold-out test accuracy):')
    display(best_per_model[[
        'model_index', 'dataset_uid', 'dataset_name', 'hp_index',
        'train_accuracy', 'test_accuracy', 'overfit_gap',
        'alpha', 'ERG_gap', 'rand_distance'
    ]])

    print('Overall summary:')
    display(results_df[['train_accuracy','test_accuracy','overfit_gap','alpha','ERG_gap','rand_distance']].describe())

    # Settings and run counts stored next to the results for provenance.
    experiment_config = {
        'experiment_id': EXPERIMENT_ID,
        'experiment_name': EXPERIMENT_NAME,
        'catalog_csv': str(CATALOG_CSV),
        'chosen_source': chosen_source,
        'random_seed': RANDOM_SEED,
        'test_size': TEST_SIZE,
        'models_per_source': MODELS_PER_SOURCE,
        'hp_settings_per_model': HP_SETTINGS_PER_MODEL,
        'successful_runs': int(len(results_df)),
        'failed_runs': int(len(errors_df)),
    }

    results_path = CHECKPOINT_DIR / 'results_all_runs.csv'
    best_path = CHECKPOINT_DIR / 'best_per_model.csv'
    errors_path = CHECKPOINT_DIR / 'errors.csv'
    config_path = CHECKPOINT_DIR / 'experiment_config.json'

    results_df.to_csv(results_path, index=False)
    best_per_model.to_csv(best_path, index=False)
    errors_df.to_csv(errors_path, index=False)
    config_path.write_text(json.dumps(experiment_config, indent=2))

    print('Saved:')
    print('-', results_path)
    print('-', best_path)
    print('-', errors_path)
    print('-', config_path)

## 8) Scatter plots per model: WW metrics vs hold-out test accuracy

In [None]:
if not results_df.empty:
    metrics = ['alpha', 'ERG_gap', 'rand_distance']

    # One figure per model, with one panel per WeightWatcher metric.
    for model_idx, g in results_df.groupby('model_index'):
        g = g.sort_values('hp_index')
        panel_title = f'Model {model_idx} ({g.iloc[0]["dataset_uid"]})'
        fig, axes = plt.subplots(1, 3, figsize=(18, 4.8), squeeze=False)

        for ax, m in zip(axes[0], metrics):
            ax.scatter(g[m], g['test_accuracy'], s=65, alpha=0.85)
            # Tag every point with the index of its hyperparameter setting.
            for hp_index, x_val, y_val in zip(g['hp_index'], g[m], g['test_accuracy']):
                ax.annotate(f"hp{int(hp_index)}", (x_val, y_val), fontsize=8, alpha=0.8)
            ax.set_xlabel(m)
            ax.set_ylabel('Hold-out test accuracy')
            ax.set_title(panel_title)
            ax.grid(alpha=0.2)

        fig.tight_layout()
        out_path = PLOTS_DIR / f'model_{int(model_idx):02d}_scatter.png'
        fig.savefig(out_path, dpi=160, bbox_inches='tight')
        plt.show()
        # Release the figure so looping over many models does not accumulate memory.
        plt.close(fig)

    print('Saved per-model scatter plots to:', PLOTS_DIR)
else:
    print('No successful runs to plot.')

## 9) Combined plot across all models and all hyperparameter runs

In [None]:
if results_df.empty:
    print('No successful runs to plot.')
else:
    metrics = ['alpha', 'ERG_gap', 'rand_distance']
    n_models = results_df['model_index'].nunique()

    # Size the grid from the number of models instead of a fixed 3x4, which
    # would raise IndexError for more than 12 models. At least 3 rows are kept
    # so the layout is unchanged for 12 or fewer models.
    n_cols = 4
    n_rows = max(3, -(-n_models // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(22, 14 * n_rows / 3), squeeze=False)
    axes_flat = axes.flatten()

    # groupby iterates groups in sorted model_index order, one panel per model.
    for ax, (model_idx, g) in zip(axes_flat, results_df.groupby('model_index')):
        # All three metrics share the panel's x-axis; colour identifies the metric.
        for m in metrics:
            ax.scatter(g[m], g['test_accuracy'], label=m, alpha=0.8, s=25)
        ax.set_title(f"Model {model_idx}: {g.iloc[0]['dataset_uid']}")
        ax.set_xlabel('Metric value')
        ax.set_ylabel('Hold-out test acc')
        ax.grid(alpha=0.2)

    # Hide the panels that have no model assigned.
    for unused_ax in axes_flat[n_models:]:
        unused_ax.axis('off')

    # One shared legend, taken from the first panel (every panel uses the same labels).
    handles, labels = axes_flat[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper center', ncol=3)
    fig.suptitle('Combined WW-metric trends vs hold-out test accuracy (all models)', y=0.98)
    fig.tight_layout(rect=[0, 0, 1, 0.95])

    combined_path = PLOTS_DIR / 'combined_all_models_scatter.png'
    fig.savefig(combined_path, dpi=170, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    print('Saved combined plot:', combined_path)