<a href="https://colab.research.google.com/github/CalculatedContent/xgbwwdata/blob/main/XGBWW_Catalog_Random5_XGBoost_Accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# XGBWW catalog-driven random-per-source XGBoost benchmark

This notebook:
1. Loads the catalog DataFrame produced by `XGBWW_Dataset_Catalog_Checkpoint.ipynb`.
2. Randomly samples **up to 5 datasets per source** (classification only); a source with fewer than 5 classification datasets contributes all of them.
3. Trains one XGBoost model per sampled dataset, choosing the number of boosting rounds with 5-fold cross-validation and early stopping.
4. Reports train and test accuracies per dataset and aggregated by source.


## 1) Mount Google Drive and configure paths


In [None]:
from google.colab import drive
from pathlib import Path

# ===== USER CONFIG =====
# Catalog CSV produced by XGBWW_Dataset_Catalog_Checkpoint.ipynb (stored on Google Drive).
CATALOG_CSV = Path("/content/drive/MyDrive/xgbwwdata/catalog_checkpoint/dataset_catalog.csv")
# Single seed reused for dataset sampling, the train/test split and XGBoost.
RANDOM_SEED = 42
# Upper bound on the number of datasets drawn from each source.
SAMPLES_PER_SOURCE = 5
# Fraction of each dataset held out as the test split.
TEST_SIZE = 0.2
# =======================

# The catalog lives on Drive, so Drive has to be mounted before it can be read.
drive.mount("/content/drive")
print(f"Catalog path: {CATALOG_CSV}")


## 2) Install dependencies

Use the same repository-install flow as the other Colab notebooks (no `pip install xgbwwdata`).


In [None]:
# Install xgbwwdata from a fresh clone using the repository installer script.
# Any previous clone is deleted first so re-running this cell never reuses a stale checkout.
!rm -rf /content/repo_xgbwwdata
!git clone https://github.com/CalculatedContent/xgbwwdata.git /content/repo_xgbwwdata
# Run the repository's Colab installer inside this kernel, pointing it at the clone.
%run /content/repo_xgbwwdata/scripts/colab_install.py --repo /content/repo_xgbwwdata

# Notebook-specific dependencies.
# NOTE(review): versions are not pinned, so results may differ between runs as
# new releases of these packages appear — consider pinning for reproducibility.
%pip install -q openml pmlb keel-ds xgboost scikit-learn


## 3) Imports


In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgbwwdata import Filters, load_dataset


## 4) Load catalog and pick 5 random datasets per source


In [None]:
# Fail fast with a pointer to the producing notebook when the catalog is absent.
if not CATALOG_CSV.exists():
    raise FileNotFoundError(f"Catalog not found: {CATALOG_CSV}. Run XGBWW_Dataset_Catalog_Checkpoint.ipynb first.")

df_catalog = pd.read_csv(CATALOG_CSV)
print("Catalog shape:", df_catalog.shape)

# Columns every later cell relies on.
required_cols = {"dataset_uid", "source", "task_type"}
missing = required_cols.difference(df_catalog.columns)
if missing:
    raise ValueError(f"Catalog is missing required columns: {missing}")

# Accuracy only makes sense for classification, so keep rows whose task_type
# mentions "classification" (case-insensitive; missing values never match).
is_classification = (
    df_catalog["task_type"]
    .astype(str)
    .str.contains("classification", case=False, na=False)
)
df_cls = df_catalog.loc[is_classification].copy()
if df_cls.empty:
    raise ValueError("No classification datasets found in catalog.")

# Sample up to SAMPLES_PER_SOURCE per source
def sample_per_source(group):
    """Return a seeded random sample of at most SAMPLES_PER_SOURCE rows of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        The catalog rows belonging to a single source.

    Returns
    -------
    pandas.DataFrame
        ``min(SAMPLES_PER_SOURCE, len(group))`` rows drawn with
        ``random_state=RANDOM_SEED``.
    """
    n = min(SAMPLES_PER_SOURCE, len(group))
    return group.sample(n=n, random_state=RANDOM_SEED)


# Iterate over the groups explicitly instead of groupby(...).apply(...):
# passing the grouping column into apply is deprecated in pandas, and once it is
# excluded the "source" column would vanish from the result. Each group yielded
# here keeps all of its columns, and the rows picked are the same because every
# group is still sampled with the same seed, in the same (sorted-by-key) order.
sampled_groups = [sample_per_source(group) for _, group in df_cls.groupby("source")]

# groupby skips rows whose source is missing; if that leaves no group at all,
# fall back to an empty frame with the catalog's columns rather than letting
# pd.concat raise on an empty list.
df_pick = (
    pd.concat(sampled_groups, ignore_index=True)
    if sampled_groups
    else df_cls.iloc[0:0].copy()
)

print("Selected datasets:", len(df_pick))

# "name" is not among the columns the catalog is validated for, so only show the
# preview columns that are actually present instead of failing with a KeyError.
preview_cols = [
    col
    for col in ("source", "dataset_uid", "name", "task_type")
    if col in df_pick.columns
]
display(df_pick[preview_cols].sort_values(["source", "dataset_uid"]))


## 5) Train one XGBoost model per sampled dataset and report accuracies


In [None]:
# Size limits passed to load_dataset for every sampled dataset.
# NOTE(review): presumably datasets outside these bounds are rejected by
# load_dataset — confirm against the xgbwwdata documentation.
filters = Filters(
    min_rows=200,
    max_rows=60_000,
    max_features=50_000,
    max_dense_elements=200_000_000,
)


def _xgb_params(n_classes: int) -> dict:
    """Build the XGBoost parameter dict for a binary or multiclass problem."""
    params = {
        "tree_method": "hist",
        "learning_rate": 0.05,
        "seed": RANDOM_SEED,
    }
    if n_classes == 2:
        params.update(
            {
                "objective": "binary:logistic",
                "eval_metric": "logloss",
                "max_depth": 6,
                "subsample": 0.85,
                "colsample_bytree": 0.85,
                "min_child_weight": 2.0,
                "reg_lambda": 2.0,
            }
        )
    else:
        params.update(
            {
                "objective": "multi:softprob",
                "num_class": n_classes,
                "eval_metric": "mlogloss",
                "max_depth": 7,
                "subsample": 0.9,
                "colsample_bytree": 0.9,
                "min_child_weight": 1.0,
                "reg_lambda": 1.0,
            }
        )
    return params


def _predict_labels(model, dmatrix, n_classes: int):
    """Turn the booster's probability output into hard class labels."""
    proba = model.predict(dmatrix)
    if n_classes == 2:
        # binary:logistic yields one probability per row; threshold at 0.5.
        return (proba >= 0.5).astype(int)
    # multi:softprob yields one column per class; take the most likely one.
    return np.argmax(proba, axis=1)


def fit_and_score(dataset_uid: str, source: str):
    """Train one XGBoost classifier on a catalog dataset and report its accuracy.

    The dataset is loaded with the notebook-level ``filters``, labels are encoded
    as consecutive integers, and a stratified train/test split is made using
    ``TEST_SIZE`` and ``RANDOM_SEED``. The number of boosting rounds is the length
    of the evaluation history returned by a 5-fold stratified ``xgb.cv`` run with
    early stopping; a final model is then trained on the full training split.

    Parameters
    ----------
    dataset_uid : str
        Identifier passed to ``load_dataset``.
    source : str
        Source label copied into the result row.

    Returns
    -------
    dict
        Keys: ``source``, ``dataset_uid``, ``dataset_name``, ``n_rows``,
        ``n_features``, ``n_classes``, ``rounds``, ``train_accuracy``,
        ``test_accuracy``.

    Raises
    ------
    ValueError
        If the loaded target has fewer than two distinct classes. Errors raised
        by ``load_dataset``, scikit-learn or XGBoost propagate unchanged.
    """
    X, y, meta = load_dataset(dataset_uid, filters=filters)

    y = np.asarray(y)
    classes, y_enc = np.unique(y, return_inverse=True)
    n_classes = len(classes)
    if n_classes < 2:
        raise ValueError(f"Dataset {dataset_uid} has <2 classes after loading.")

    # At least two classes are guaranteed here, so the split is always stratified.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_enc
    )

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    params = _xgb_params(n_classes)
    # The binary and multiclass setups differ only in their parameters and in
    # the early-stopping patience (50 vs 60 rounds).
    patience = 50 if n_classes == 2 else 60
    cv = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=1200,
        nfold=5,
        stratified=True,
        early_stopping_rounds=patience,
        seed=RANDOM_SEED,
        verbose_eval=False,
    )
    rounds = len(cv)
    model = xgb.train(params=params, dtrain=dtrain, num_boost_round=rounds, verbose_eval=False)

    yhat_tr = _predict_labels(model, dtrain, n_classes)
    yhat_te = _predict_labels(model, dtest, n_classes)

    # Fall back to values measured from the data when the metadata entry is
    # missing or None; the fallback is computed only when it is actually needed.
    n_rows = meta.get("n_rows")
    if n_rows is None:
        n_rows = len(y)
    n_features = meta.get("n_features")
    if n_features is None:
        n_features = X.shape[1] if hasattr(X, "shape") else -1

    return {
        "source": source,
        "dataset_uid": dataset_uid,
        "dataset_name": meta.get("name"),
        "n_rows": int(n_rows),
        "n_features": int(n_features),
        "n_classes": int(n_classes),
        "rounds": int(rounds),
        "train_accuracy": float(accuracy_score(y_train, yhat_tr)),
        "test_accuracy": float(accuracy_score(y_test, yhat_te)),
    }


results = []
errors = []

# Train on every sampled dataset. A failure on one dataset is recorded and the
# loop moves on, so a single bad dataset cannot abort the whole benchmark.
for row in df_pick.itertuples(index=False):
    uid = row.dataset_uid
    source = row.source
    print(f"Training: {uid}")
    try:
        results.append(fit_and_score(uid, source))
    except Exception as e:
        errors.append({"source": source, "dataset_uid": uid, "error": str(e)})
        print(f"  Skipped {uid}: {e}")

results_df = pd.DataFrame(results)
errors_df = pd.DataFrame(errors)

print("\nCompleted:", len(results_df), "datasets")
if not errors_df.empty:
    print("Errors:", len(errors_df))
    display(errors_df.head(20))

# With no successful run, results_df has no columns at all and sorting by
# "source"/"test_accuracy" would raise a KeyError, so only sort when there is
# something to show.
if results_df.empty:
    print("No datasets were trained successfully.")
else:
    display(results_df.sort_values(["source", "test_accuracy"], ascending=[True, False]))


## 6) Summary tables (train/test accuracies)


In [None]:
if results_df.empty:
    print("No successful trainings.")
else:
    # Aggregate with "source" as the index, flatten the (metric, statistic)
    # column pairs, and only then turn "source" back into a column. This way the
    # labels cannot be shifted by wherever the grouping key happens to land in
    # the aggregated frame.
    summary = results_df.groupby("source")[["train_accuracy", "test_accuracy"]].agg(
        ["mean", "std", "min", "max"]
    )
    summary.columns = [f"{metric}_{stat}" for metric, stat in summary.columns]
    summary = summary.reset_index()

    print("Per-dataset results:")
    display(results_df.sort_values(["source", "test_accuracy"], ascending=[True, False]))

    print("\nPer-source summary:")
    # std is NaN for a source with a single successfully trained dataset.
    display(summary.sort_values("test_accuracy_mean", ascending=False))
