# OpenML Numeric-Only Classification Benchmarks — Demo

This notebook demonstrates the **OpenML Numeric-Only Classification Benchmarks** dataset artifact.

**What it does:** Loads 8 all-numeric tabular classification datasets from OpenML, covering diverse sizes (208–2310 samples), feature counts (4–60), and class counts (2–7). All features are z-score standardized, and every example is assigned to one of 5 stratified cross-validation folds.

**Datasets:** diabetes, heart-statlog, ionosphere, sonar, vehicle, segment, glass, banknote-authentication.

**Domains:** medical, physics, signal processing, computer vision, forensics, image processing.

- **Part 1 — Quick Demo** uses a small curated subset (~16 examples) for fast exploration.
- **Part 2 — Full Run** loads the complete dataset (6,339 examples) with original parameters.

In [None]:
import json
import os
import time
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Raw-content URLs of the demo JSON payloads; both files sit in the same
# repository folder, so they share one base URL.
_GITHUB_DEMO_DIR_URL = (
    "https://raw.githubusercontent.com/AMGrobelnik/"
    "ai-invention-ac2586-synergy-guided-oblique-splits-using-part/"
    "main/openml_classif/demo"
)
GITHUB_FULL_DATA_URL = _GITHUB_DEMO_DIR_URL + "/full_demo_data.json"
GITHUB_MINI_DATA_URL = _GITHUB_DEMO_DIR_URL + "/mini_demo_data.json"

def _load_json(url, local_path, timeout=30.0):
    """Load a JSON document from ``url``, falling back to ``local_path``.

    The remote copy is tried first. If downloading or parsing it fails for
    any reason, the local file is read instead when it exists.

    Args:
        url: Location handed to ``urllib.request.urlopen``.
        local_path: Path of the local fallback file.
        timeout: Seconds to wait on blocking network operations before the
            download attempt is abandoned.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: If the download failed and ``local_path`` does not
            exist. The download error is chained as the cause and included
            in the message.
    """
    import urllib.request

    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return json.loads(response.read().decode("utf-8"))
    except Exception as exc:
        # Deliberate best-effort: any download/parse failure triggers the
        # local fallback, but the error is kept so it can be reported.
        download_error = exc

    if os.path.exists(local_path):
        with open(local_path, encoding="utf-8") as f:
            return json.load(f)

    raise FileNotFoundError(
        f"Could not load {local_path} "
        f"(download from {url} failed: {download_error!r})"
    ) from download_error

def load_mini():
    """Return the mini demo data, from GitHub or the local ``mini_demo_data.json``."""
    return _load_json(url=GITHUB_MINI_DATA_URL, local_path="mini_demo_data.json")

def load_full():
    """Return the full demo data, from GitHub or the local ``full_demo_data.json``."""
    return _load_json(url=GITHUB_FULL_DATA_URL, local_path="full_demo_data.json")

## Part 1 — Quick Demo (Mini Data)

In [None]:
# Fetch the small curated subset that Part 1 works with.
data = load_mini()

### Dataset Configuration

The 8 selected OpenML datasets span diverse domains and complexity levels. This mirrors the original script's `DATASETS` configuration dict.

In [None]:
# The eight datasets that were finally selected: all-numeric features, no
# missing values, 2-7 classes and 200-2500 samples. Each entry maps the
# dataset name to its OpenML data id and a domain tag.
DATASETS = {
    dataset_name: {"data_id": openml_id, "domain": domain_tag}
    for dataset_name, openml_id, domain_tag in (
        ("diabetes", 37, "medical"),
        ("heart-statlog", 53, "medical"),
        ("ionosphere", 59, "physics"),
        ("sonar", 40, "signal_processing"),
        ("vehicle", 54, "computer_vision"),
        ("segment", 36, "computer_vision"),
        ("glass", 41, "forensics"),
        ("banknote-authentication", 1462, "image_processing"),
    )
}

# Fold count and random seed (presumably the values the original script used
# to build the cross-validation split; they are not consumed in this notebook).
N_FOLDS = 5
RANDOM_STATE = 42

### Explore Data Structure

Print a summary table of all datasets: name, number of examples, features, classes, domain, and OpenML ID.

In [None]:
all_datasets = data["datasets"]

# Header and separator; the rows below are built with matching column widths.
print(f"{'Dataset':<25s} | {'#Ex':>4s} | {'#Feat':>5s} | {'#Cls':>4s} | {'Domain':<18s} | {'OpenML ID':>9s}")
print("-" * 80)

for ds in all_datasets:
    examples = ds["examples"]
    # Dataset-level metadata is read from the first example.
    ex0 = examples[0]
    columns = [
        f"{ds['dataset']:<25s}",
        f"{len(examples):>4d}",
        f"{len(ex0['metadata_feature_names']):>5d}",
        f"{ex0['metadata_n_classes']:>4d}",
        f"{ex0['metadata_domain']:<18s}",
        f"{ex0['metadata_openml_id']:>9d}",
    ]
    print(" | ".join(columns))

# Grand total over every dataset in the collection.
example_counts = [len(ds["examples"]) for ds in all_datasets]
total_examples = sum(example_counts)
print(f"\nTotal: {len(all_datasets)} datasets, {total_examples} examples")

### Inspect Individual Examples

Each example has JSON-serialized feature values as `input`, the class label as `output`, plus metadata (fold index, feature names, task type, domain, OpenML ID).

In [None]:
# Preview the first example of every dataset
for ds in all_datasets:
    first_example = ds["examples"][0]
    features = json.loads(first_example["input"])
    # Only the first four features are printed to keep the output short.
    preview = {key: features[key] for key in list(features)[:4]}
    print(f"--- {ds['dataset']} ---")
    print(f"  Label: {first_example['output']}")
    print(f"  Fold:  {first_example['metadata_fold']}")
    print(f"  Features ({len(features)}): {preview}...")
    print()

### Analyze Class Distribution and Feature Statistics

Parse feature values from JSON strings and compute per-dataset statistics, matching the original script's standardization output.

In [None]:
for ds in all_datasets:
    examples = ds["examples"]

    # Number of examples per class label and per fold index
    class_counts = Counter(ex["output"] for ex in examples)
    fold_counts = Counter(ex["metadata_fold"] for ex in examples)

    # Feature values decoded from each example's JSON string, one row per example
    feat_matrix = np.array(
        [list(json.loads(ex["input"]).values()) for ex in examples]
    )

    print(f"--- {ds['dataset']} ---")
    print(f"  Classes: {dict(class_counts)}")
    print(f"  Folds:   {dict(sorted(fold_counts.items()))}")
    if feat_matrix.size:
        # Statistics are pooled over all features of the dataset.
        overall_mean = feat_matrix.mean()
        overall_std = feat_matrix.std()
        print(f"  Feature stats (z-scored): mean={overall_mean:.4f}, std={overall_std:.4f}")
    print()

### Visualization

A reusable visualization function that draws dataset overview bar charts (samples, features, classes) and per-dataset box plots of the feature values, then prints a summary table.

In [None]:
def visualize_datasets(datasets_list, title_prefix=""):
    """Plot an overview of a dataset collection and print a summary table.

    Two figures are shown: horizontal bar charts of the example, feature and
    class counts per dataset, and one box plot per dataset of its feature
    values (at most the first 10 features). A text summary is printed last.

    Args:
        datasets_list: Sequence of dicts with a ``"dataset"`` name and an
            ``"examples"`` list. Every example provides a JSON-encoded
            ``"input"`` mapping plus ``metadata_feature_names``,
            ``metadata_n_classes`` and ``metadata_domain``. A dataset with
            no examples is reported with zero features/classes and an empty
            domain instead of raising.
        title_prefix: Text prepended to the figure titles and the summary
            heading.

    Returns:
        None. Figures are displayed with ``plt.show()`` and the table is
        written to standard output.
    """
    names = []
    n_examples_list = []
    n_features_list = []
    n_classes_list = []
    domains = []
    feat_matrices = []

    for ds in datasets_list:
        examples = ds["examples"]
        names.append(ds["dataset"])
        n_examples_list.append(len(examples))

        if examples:
            # Dataset-level metadata is read from the first example.
            ex0 = examples[0]
            n_features_list.append(len(ex0["metadata_feature_names"]))
            n_classes_list.append(ex0["metadata_n_classes"])
            domains.append(ex0["metadata_domain"])
        else:
            # Nothing to read metadata from; use neutral placeholders.
            n_features_list.append(0)
            n_classes_list.append(0)
            domains.append("")

        # Decode the JSON feature mapping of every example into one row.
        feat_matrices.append(
            np.array([list(json.loads(ex["input"]).values()) for ex in examples])
        )

    # --- Figure 1: Dataset overview ---
    fig, axes = plt.subplots(1, 3, figsize=(14, 4))
    fig.suptitle(f"{title_prefix}Dataset Overview", fontsize=13, fontweight="bold")

    y_pos = np.arange(len(names))
    short_names = [n[:12] for n in names]

    # (values, bar colour, x-axis label, panel title) for each of the 3 panels
    panels = [
        (n_examples_list, "steelblue", "# Examples", "Sample Count"),
        (n_features_list, "coral", "# Features", "Feature Count"),
        (n_classes_list, "mediumseagreen", "# Classes", "Class Count"),
    ]
    for ax, (values, color, xlabel, panel_title) in zip(axes, panels):
        ax.barh(y_pos, values, color=color)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(short_names, fontsize=8)
        ax.set_xlabel(xlabel)
        ax.set_title(panel_title)
        # First dataset at the top, matching the order of the summary table.
        ax.invert_yaxis()

    plt.tight_layout()
    plt.show()

    # --- Figure 2: Feature value distributions (box plots) ---
    # Size the grid from the number of datasets (up to 4 per row) so that any
    # collection size works; 8 datasets give the 2x4 grid at 14x6 inches.
    n_datasets = len(names)
    n_cols = max(1, min(4, n_datasets))
    n_rows = max(1, -(-n_datasets // n_cols))  # ceiling division
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(3.5 * n_cols, 3 * n_rows), squeeze=False
    )
    fig.suptitle(f"{title_prefix}Feature Value Distributions (z-scored)", fontsize=13, fontweight="bold")
    axes = axes.flatten()

    for ax, name, feat_mat in zip(axes, names, feat_matrices):
        # Skip datasets without examples (1-D empty array) or without features.
        if feat_mat.ndim == 2 and feat_mat.size > 0:
            # Show up to 10 features for readability
            n_show = min(feat_mat.shape[1], 10)
            # Boxes are vertical by default, so no orientation flag is passed.
            ax.boxplot(feat_mat[:, :n_show], widths=0.6,
                       patch_artist=True,
                       boxprops=dict(facecolor="lightblue", alpha=0.7))
        ax.set_title(name[:15], fontsize=9)
        ax.set_xlabel("Feature idx", fontsize=7)
        ax.set_ylabel("Value", fontsize=7)
        ax.tick_params(labelsize=6)

    # Hide grid cells that have no dataset assigned to them.
    for ax in axes[n_datasets:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # --- Summary table ---
    print(f"\n{title_prefix}Summary:")
    print(f"{'Dataset':<25s} | {'#Ex':>5s} | {'#Feat':>5s} | {'#Cls':>4s} | {'Domain':<18s}")
    print("-" * 70)
    for name, n_ex, n_feat, n_cls, domain in zip(
        names, n_examples_list, n_features_list, n_classes_list, domains
    ):
        print(f"{name:<25s} | {n_ex:>5d} | {n_feat:>5d} | {n_cls:>4d} | {domain:<18s}")
    total = sum(n_examples_list)
    print(f"\nTotal: {len(names)} datasets, {total} examples")

In [None]:
# Plot and summarize the mini subset; the prefix tags the figure titles and summary heading.
visualize_datasets(all_datasets, title_prefix="[Mini] ")

## Part 2 — Full Run (Original Parameters)

Load the complete dataset with all 6,339 examples across 8 datasets and run the same analysis with original parameters.

In [None]:
# Replace the mini data with the complete collection for the rest of Part 2.
data = load_full()
all_datasets = data["datasets"]

total_examples = sum(len(entry["examples"]) for entry in all_datasets)
print(f"Loaded {len(all_datasets)} datasets")
print(f"Total examples: {total_examples}")

### Full Dataset Exploration

Run the same analysis on the complete dataset with all examples and original parameters.

In [None]:
# Summary table for the full collection
print(f"{'Dataset':<25s} | {'#Ex':>5s} | {'#Feat':>5s} | {'#Cls':>4s} | {'Domain':<18s} | {'OpenML ID':>9s}")
print("-" * 80)

# One template for all rows keeps the columns aligned with the header above.
row_template = "{:<25s} | {:>5d} | {:>5d} | {:>4d} | {:<18s} | {:>9d}"
for ds in all_datasets:
    examples = ds["examples"]
    # Dataset-level metadata is read from the first example.
    ex0 = examples[0]
    print(
        row_template.format(
            ds["dataset"],
            len(examples),
            len(ex0["metadata_feature_names"]),
            ex0["metadata_n_classes"],
            ex0["metadata_domain"],
            ex0["metadata_openml_id"],
        )
    )

example_counts = [len(ds["examples"]) for ds in all_datasets]
total_examples = sum(example_counts)
print(f"\nTotal: {len(all_datasets)} datasets, {total_examples} examples")

In [None]:
# Class distribution, fold distribution and pooled feature statistics (full data)
for ds in all_datasets:
    examples = ds["examples"]

    # Tally labels and fold indices in a single pass over the examples,
    # decoding each example's JSON feature mapping into one row as we go.
    class_counts = Counter()
    fold_counts = Counter()
    rows = []
    for ex in examples:
        class_counts[ex["output"]] += 1
        fold_counts[ex["metadata_fold"]] += 1
        rows.append(list(json.loads(ex["input"]).values()))
    feat_matrix = np.array(rows)

    print(f"--- {ds['dataset']} ---")
    print(f"  Classes: {dict(class_counts)}")
    print(f"  Folds:   {dict(sorted(fold_counts.items()))}")
    if feat_matrix.size != 0:
        # Mean and standard deviation are taken over every value in the matrix.
        print(f"  Feature stats (z-scored): mean={feat_matrix.mean():.4f}, std={feat_matrix.std():.4f}")
    print()

In [None]:
# Same plots and summary as in Part 1, now for the full data loaded above.
visualize_datasets(all_datasets, title_prefix="[Full] ")