<a href="https://colab.research.google.com/github/CalculatedContent/xgbwwdata/blob/main/XGBWW_Dataset_Catalog_Checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# xgbwwdata Dataset Catalog Builder (with Drive checkpoint + resume)

This notebook scans one or many `xgbwwdata` sources, builds a **dataset catalog DataFrame**, and stores it in Google Drive as a checkpoint.

The catalog includes per-dataset metadata such as:
- source + source dataset ID
- unique identifier
- dataset name
- number of rows/features/classes
- experiment type (regression, binary classification, multiclass classification, single-class)

It supports:
- **Test mode** (e.g., only 2–3 datasets per source)
- **Full mode** (scan all datasets)
- **Resume mode** (continue from previous checkpoint without restarting from scratch)
- **Source selection** (all sources or a specific subset)


## 1) Mount Google Drive and configure paths/options


In [None]:
from google.colab import drive
from pathlib import Path

# ====== USER CONFIG ======
# Sources to scan. None means every supported source:
# openml, pmlb, keel, amlb, libsvm
TARGET_SOURCES = None                  # e.g. ["openml", "pmlb"]

# In test mode the scan cell keeps only the first TEST_PER_SOURCE
# not-yet-processed dataset ids of each source.
TEST_MODE = True
TEST_PER_SOURCE = 3                    # used only when TEST_MODE=True

SMOKE_TRAIN = False                    # True = try 1-round XGBoost train validation
SAVE_EVERY = 10                        # checkpoint save frequency (processed datasets)

# Structural filters (same spirit as xgbwwdata defaults); these values are
# passed to xgbwwdata.Filters by the scan cell.
MIN_ROWS = 200
MAX_ROWS = 60000
MAX_FEATURES = 50000
MAX_DENSE_ELEMENTS = int(2e8)

# Drive output folder: the catalog CSV and the progress JSON together form
# the checkpoint that a later run resumes from.
DRIVE_BASE = Path("/content/drive/MyDrive/xgbwwdata/catalog_checkpoint")
CATALOG_CSV = DRIVE_BASE / "dataset_catalog.csv"
PROGRESS_JSON = DRIVE_BASE / "scan_progress.json"
# ==========================

# Drive must be mounted before the output folder can be created on it.
drive.mount('/content/drive')
DRIVE_BASE.mkdir(parents=True, exist_ok=True)
print(f"Catalog CSV: {CATALOG_CSV}")
print(f"Progress JSON: {PROGRESS_JSON}")


## 2) Install dependencies


In [None]:
# Install xgbwwdata from a fresh clone using the Colab installer script
!rm -rf /content/repo_xgbwwdata
!git clone https://github.com/CalculatedContent/xgbwwdata.git /content/repo_xgbwwdata
%run /content/repo_xgbwwdata/scripts/colab_install.py --repo /content/repo_xgbwwdata

# Notebook-specific dependencies
%pip install -q openml pmlb keel-ds xgboost tqdm pyarrow

## 3) Imports and helpers


In [None]:
import json
import time
from datetime import datetime, timezone
from typing import Dict, Iterable, List, Optional

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from xgbwwdata import Filters, enable_logging
from xgbwwdata.registry import LibSVMIndex, _libsvm_load, _smoke_train_1round
from xgbwwdata.sources.openml import OpenMLSource
from xgbwwdata.sources.pmlb import PMLBSource
from xgbwwdata.sources.keel import KEELSource
from xgbwwdata.sources.amlb import AMLBSource

enable_logging()


def utc_now_iso() -> str:
    """Return the current time in UTC as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()


def infer_experiment_type(task_type: str, n_classes) -> str:
    """Translate a task type and class count into a catalog experiment label.

    Args:
        task_type: Task label; ``"regression"`` short-circuits to
            ``"regression"``, anything else is treated as classification.
        n_classes: Number of target classes, or a missing value
            (``None`` / NaN) when it is unknown.

    Returns:
        One of ``"regression"``, ``"classification_unknown"``,
        ``"single_class"``, ``"binary_classification"`` or
        ``"multiclass_classification"``.
    """
    if task_type == "regression":
        return "regression"
    if pd.isna(n_classes):
        return "classification_unknown"

    class_count = int(n_classes)
    if class_count > 2:
        return "multiclass_classification"
    return "binary_classification" if class_count == 2 else "single_class"


def source_factories() -> Dict[str, callable]:
    """Return a mapping from source name to a zero-argument factory.

    Each factory is lazy: the source object is only constructed when the
    factory is called. The ``"libsvm"`` factory returns ``None``.
    """
    registry: Dict[str, callable] = {}
    registry["openml"] = lambda: OpenMLSource()
    registry["pmlb"] = lambda: PMLBSource(include_regression=True)
    registry["keel"] = lambda: KEELSource()
    registry["amlb"] = lambda: AMLBSource()
    registry["libsvm"] = lambda: None
    return registry


def normalize_sources(target_sources: Optional[List[str]]) -> List[str]:
    """Validate and normalise the requested source names.

    Args:
        target_sources: Source names to scan, or ``None`` for every source
            known to ``source_factories()``.

    Returns:
        The lower-cased, whitespace-stripped names in the order given, or
        all known source names when ``target_sources`` is ``None``.

    Raises:
        ValueError: If any normalised name is not a known source.
    """
    known = list(source_factories())
    if target_sources is None:
        return known

    requested = []
    unknown = []
    for raw in target_sources:
        label = raw.lower().strip()
        requested.append(label)
        if label not in known:
            unknown.append(label)

    if unknown:
        raise ValueError(f"Unknown sources: {unknown}. Allowed: {known}")
    return requested


def load_checkpoint(catalog_csv, progress_json):
    """Load the catalog DataFrame and progress record of a previous scan.

    Args:
        catalog_csv: Path-like object (must provide ``.exists()``) of the
            catalog CSV.
        progress_json: Path-like object (must provide ``.exists()``) of the
            progress JSON file.

    Returns:
        A ``(catalog_df, progress)`` tuple. ``catalog_df`` is the CSV content,
        or an empty DataFrame with the catalog columns when the CSV does not
        exist. ``progress`` is a dict that always contains the keys
        ``"processed"`` (dataset_uid -> {status, updated_at, error?}) and
        ``"last_save_utc"``.
    """
    if catalog_csv.exists():
        catalog_df = pd.read_csv(catalog_csv)
    else:
        catalog_df = pd.DataFrame(columns=[
            "source",
            "source_dataset_id",
            "dataset_uid",
            "unique_id",
            "name",
            "task_type",
            "experiment_type",
            "n_rows",
            "n_features",
            "n_classes",
            "scan_timestamp_utc",
        ])

    progress = {}
    if progress_json.exists():
        with open(progress_json, "r", encoding="utf-8") as f:
            progress = json.load(f)

    # Guarantee the expected keys even when the file on disk is missing them
    # (e.g. an empty or hand-edited JSON object); otherwise the backfill below
    # and callers indexing progress["processed"] would raise KeyError.
    progress.setdefault("processed", {})   # dataset_uid -> {status, updated_at, error?}
    progress.setdefault("last_save_utc", None)

    # Backfill from existing CSV if needed: every uid already present in the
    # catalog counts as processed, without overwriting an existing entry.
    if "dataset_uid" in catalog_df.columns:
        for uid in catalog_df["dataset_uid"].dropna().astype(str):
            progress["processed"].setdefault(uid, {"status": "ok", "updated_at": utc_now_iso()})

    return catalog_df, progress


def save_checkpoint(catalog_df, progress, catalog_csv, progress_json):
    """Write the catalog CSV and the progress JSON to disk.

    The catalog is de-duplicated on ``dataset_uid`` (the last occurrence
    wins) and sorted by source and uid before it is written. ``progress`` is
    updated in place with a fresh ``last_save_utc`` timestamp and then dumped
    as indented JSON.

    Args:
        catalog_df: Catalog DataFrame; not modified in place.
        progress: Progress dict; its ``"last_save_utc"`` entry is overwritten.
        catalog_csv: Destination of the catalog CSV.
        progress_json: Destination of the progress JSON.
    """
    ordered = (
        catalog_df
        .drop_duplicates(subset=["dataset_uid"], keep="last")
        .sort_values(["source", "dataset_uid"])
        .reset_index(drop=True)
    )
    ordered.to_csv(catalog_csv, index=False)

    progress["last_save_utc"] = utc_now_iso()
    with open(progress_json, "w") as handle:
        json.dump(progress, handle, indent=2)


## 4) Run scan with checkpoint/resume support


In [None]:
# Scan every selected source, append one catalog row per accepted dataset and
# checkpoint to Drive every SAVE_EVERY processed datasets. Dataset ids that are
# already recorded in the progress file (ok or error) are skipped, which is
# what makes a re-run resume instead of restarting.
filters = Filters(
    min_rows=MIN_ROWS,
    max_rows=MAX_ROWS,
    max_features=MAX_FEATURES,
    max_dense_elements=MAX_DENSE_ELEMENTS,
)

selected_sources = normalize_sources(TARGET_SOURCES)
catalog_df, progress = load_checkpoint(CATALOG_CSV, PROGRESS_JSON)
# Alias into the progress dict: updates below are persisted by save_checkpoint.
processed = progress["processed"]

print("Selected sources:", selected_sources)
print("Existing catalog rows:", len(catalog_df))
print("Existing processed entries:", len(processed))

new_rows = []              # rows collected since the last checkpoint
processed_since_save = 0   # datasets handled (ok or error) since the last checkpoint

for source_name in selected_sources:
    print(f"\n=== Scanning source: {source_name} ===")

    # Build source iterator
    if source_name == "libsvm":
        lib_idx = LibSVMIndex()
        uid_iter = list(lib_idx.iter_uids())
        src_obj = None
    else:
        src_obj = source_factories()[source_name]()
        uid_iter = list(src_obj.iter_ids())

    # Test mode: keep just first N not-yet-processed from this source
    if TEST_MODE:
        remaining = [uid for uid in uid_iter if uid not in processed][:TEST_PER_SOURCE]
        uid_iter = remaining
        print(f"Test mode ON -> scanning up to {len(uid_iter)} dataset(s) for {source_name}")
    else:
        print(f"Full mode -> candidate dataset count: {len(uid_iter)}")

    for uid in tqdm(uid_iter, desc=f"{source_name} datasets"):
        if uid in processed:
            continue

        try:
            if source_name == "libsvm":
                X, y, meta = _libsvm_load(uid, filters)
                # libsvm entries are always catalogued as classification here.
                task_type = "classification"
                raw_n_classes = meta.get("n_classes", np.nan)
                n_classes = np.nan if pd.isna(raw_n_classes) else int(raw_n_classes)
                name = str(meta.get("name", uid.split(":", 1)[1]))
                n_rows = int(meta.get("n_rows", X.shape[0]))
                n_features = int(meta.get("n_features", X.shape[1]))

                if SMOKE_TRAIN:
                    if not _smoke_train_1round(X, y, task_type, n_classes if not pd.isna(n_classes) else None, seed=0):
                        raise RuntimeError("smoke_train_failed")
            else:
                ok, info, X, y, task_type, n_classes, name = src_obj.validate_and_prepare(uid, filters)
                if not ok:
                    raise RuntimeError(f"filtered_out:{info}")

                if SMOKE_TRAIN:
                    if not _smoke_train_1round(X, y, task_type, n_classes, seed=0):
                        raise RuntimeError("smoke_train_failed")

                # Prefer the sizes reported in info; fall back to the shape of X.
                info = info if isinstance(info, dict) else {}
                n_rows = int(info.get("n_rows", X.shape[0]))
                n_features = int(info.get("n_features", X.shape[1]))

            # A uid has the form "<source>:<source dataset id>".
            src, src_id = uid.split(":", 1)
            experiment_type = infer_experiment_type(task_type, n_classes)

            row = {
                "source": src,
                "source_dataset_id": src_id,
                "dataset_uid": uid,
                "unique_id": f"{src}|{src_id}",
                "name": name,
                "task_type": task_type,
                "experiment_type": experiment_type,
                "n_rows": n_rows,
                "n_features": n_features,
                "n_classes": n_classes,
                "scan_timestamp_utc": utc_now_iso(),
            }
            new_rows.append(row)

            processed[uid] = {
                "status": "ok",
                "source": src,
                "updated_at": utc_now_iso(),
            }

        except Exception as e:
            # Best effort: record the failure (including filtered-out datasets)
            # so the scan continues and a resumed run does not retry this uid.
            processed[uid] = {
                "status": "error",
                "source": source_name,
                "error": str(e),
                "updated_at": utc_now_iso(),
            }

        processed_since_save += 1
        if processed_since_save >= SAVE_EVERY:
            if new_rows:
                catalog_df = pd.concat([catalog_df, pd.DataFrame(new_rows)], ignore_index=True)
                new_rows = []
            save_checkpoint(catalog_df, progress, CATALOG_CSV, PROGRESS_JSON)
            processed_since_save = 0
            print(f"Checkpoint saved at {utc_now_iso()} | catalog rows={len(catalog_df)}")

# Final save
if new_rows:
    catalog_df = pd.concat([catalog_df, pd.DataFrame(new_rows)], ignore_index=True)

save_checkpoint(catalog_df, progress, CATALOG_CSV, PROGRESS_JSON)
print("\nScan complete.")
print("Catalog rows:", len(catalog_df.drop_duplicates(subset=['dataset_uid'])))
print("Checkpoint files updated.")


## 5) Inspect the resulting dataset catalog


In [None]:
# Reload the catalog from the checkpoint CSV so this cell also works on its
# own after a kernel restart.
catalog_df = pd.read_csv(CATALOG_CSV)
print("Catalog shape:", catalog_df.shape)

# Columns shown in the preview table
preview_cols = [
    "source",
    "source_dataset_id",
    "dataset_uid",
    "name",
    "task_type",
    "experiment_type",
    "n_rows",
    "n_features",
    "n_classes",
]

# Per source, largest datasets first
catalog_df = (
    catalog_df
    .sort_values(["source", "n_rows"], ascending=[True, False])
    .reset_index(drop=True)
)
display(catalog_df[preview_cols].head(30))

# Dataset counts per (source, experiment type); missing values form their own group
counts_by_type = catalog_df.groupby(["source", "experiment_type"], dropna=False).size()
summary = counts_by_type.reset_index(name="dataset_count")
display(summary.sort_values(["source", "dataset_count"], ascending=[True, False]))


## 6) Notes

- Re-running this notebook continues from the checkpoint in Drive.
- To scan only one source, set `TARGET_SOURCES = ["openml"]` (or another source).
- To run full scan, set `TEST_MODE = False`.
- The saved checkpoint CSV is your reusable dataset registry for future experiments.
