# OpenML Mixed-Type Classification Benchmarks — Demo

This notebook demonstrates the **OpenML Mixed-Type Classification Benchmarks** dataset, which contains two pre-processed OpenML datasets for Partial Information Decomposition (PID) synergy computation research:

| Dataset | OpenML ID | Samples | Features | Types |
|---------|-----------|---------|----------|-------|
| credit_g | 31 | 1,000 | 20 | 7 num + 13 cat |
| australian | 40981 | 690 | 14 | 6 num + 8 cat |

Both datasets have categorical features ordinal-encoded and numeric features standardized. Each example includes 5-fold stratified CV fold assignments and feature type metadata.

- **Part 1 — Quick Demo**: Loads a small curated subset (5 examples per dataset) for fast exploration.
- **Part 2 — Full Run**: Loads the complete dataset (1,690 examples) with original parameters.

In [None]:
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "scikit-learn>=1.3.0",
#     "pandas>=2.0.0",
#     "numpy>=1.26.0,<2.3.0",
#     "matplotlib>=3.7.0",
# ]
# ///
import json
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Silence every warning category for the rest of the session
# (presumably to keep notebook output uncluttered — confirm this is intended).
warnings.filterwarnings("ignore")

In [None]:
# Remote locations of the demo data files. load_full() / load_mini() hand these
# to _load_json, which tries the URL before looking for a local file.
GITHUB_FULL_DATA_URL = "https://raw.githubusercontent.com/AMGrobelnik/ai-invention-ac2586-synergy-guided-oblique-splits-using-part/main/openml_mixed_ty/demo/full_demo_data.json"
GITHUB_MINI_DATA_URL = "https://raw.githubusercontent.com/AMGrobelnik/ai-invention-ac2586-synergy-guided-oblique-splits-using-part/main/openml_mixed_ty/demo/mini_demo_data.json"
import json, os

def _load_json(url, local_path):
    """Load a JSON document from *url*, falling back to *local_path*.

    The download is best-effort: any failure (unreachable host, HTTP error,
    malformed payload, ...) is remembered instead of raised, and the local
    file is tried next.

    Args:
        url: Location passed to ``urllib.request.urlopen``.
        local_path: Filesystem path of the fallback copy.

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: If the download fails and *local_path* does not
            exist. The download error is summarised in the message and
            attached as ``__cause__``.
    """
    import urllib.request

    try:
        # A timeout keeps the notebook from hanging on an unresponsive host.
        with urllib.request.urlopen(url, timeout=30) as response:
            return json.loads(response.read().decode("utf-8"))
    except Exception as err:  # deliberate best-effort: fall through to the local copy
        # `err` is unbound once the except clause ends, so keep a reference.
        download_error = err

    if os.path.exists(local_path):
        with open(local_path, encoding="utf-8") as f:
            return json.load(f)

    raise FileNotFoundError(
        f"Could not load {local_path} "
        f"(download from {url} failed: {download_error!r})"
    ) from download_error

def load_mini():
    """Return the mini demo data via ``_load_json`` (URL first, then ``mini_demo_data.json``)."""
    fallback_file = "mini_demo_data.json"
    return _load_json(url=GITHUB_MINI_DATA_URL, local_path=fallback_file)

def load_full():
    """Return the full demo data via ``_load_json`` (URL first, then ``full_demo_data.json``)."""
    fallback_file = "full_demo_data.json"
    return _load_json(url=GITHUB_FULL_DATA_URL, local_path=fallback_file)

---
## Part 1 — Quick Demo (Mini Data)

Load a curated subset of 5 examples per dataset for fast exploration.

In [None]:
# Bind the mini subset to `data`; the Part 1 cells below read from this name.
data = load_mini()

### Dataset Summary

Print high-level statistics for each dataset: number of examples, features, feature types, class distribution, and fold assignments.

In [None]:
# Benchmark configuration carried over from the original script:
# (openml_id, name) pairs picked for their mix of numeric and categorical features.
DATASETS = [
    (31, "credit_g"),
    (40981, "australian"),
]

N_FOLDS = 5
RANDOM_STATE = 42

rule = "=" * 60
print(rule)
print("OpenML Mixed-Type Classification Benchmarks")
print(rule)

for ds in data["datasets"]:
    examples = ds["examples"]
    name = ds["dataset"]
    n_examples = len(examples)

    # Dataset-level metadata is read off the first example.
    ex0 = examples[0]
    feature_names = ex0["metadata_feature_names"]
    feature_types = ex0["metadata_feature_types"]
    n_classes = ex0["metadata_n_classes"]

    # Tally feature kinds; a Counter yields 0 for a kind that never occurs.
    type_tally = Counter(feature_types)
    n_cat = type_tally["categorical"]
    n_num = type_tally["continuous"]

    # Examples per class label and per CV fold.
    label_counts = Counter(ex["output"] for ex in examples)
    fold_counts = Counter(ex["metadata_fold"] for ex in examples)

    print(f"\n{rule}")
    print(f"Dataset: {name}")
    print(f"  Examples: {n_examples}")
    print(f"  Features: {len(feature_names)} ({n_num} numeric + {n_cat} categorical)")
    print(f"  Classes: {n_classes}, distribution: {dict(label_counts)}")
    print(f"  Folds: {dict(sorted(fold_counts.items()))}")
    print(f"  Task type: {ex0['metadata_task_type']}")

### Inspect Individual Examples

Parse the JSON `input` field of each example to see the feature values, and display a few examples as a DataFrame.

In [None]:
# Decode each example's JSON-encoded `input` into a row and preview every dataset
for ds in data["datasets"]:
    examples = ds["examples"]
    name = ds["dataset"]

    # One row per example: decoded features plus the example's label and fold
    rows = [
        {
            **json.loads(ex["input"]),
            "__label__": ex["output"],
            "__fold__": ex["metadata_fold"],
        }
        for ex in examples
    ]

    df = pd.DataFrame(rows)
    n_preview = min(3, len(df))
    divider = "=" * 60
    print(f"\n{divider}")
    print(f"Dataset: {name} — first {n_preview} examples")
    print(divider)
    print(df.head(3).to_string(max_cols=10))
    print(f"\nShape: {df.shape}")

### Feature Type Analysis

Analyze the mix of categorical vs. continuous features in each dataset — this is critical for PID synergy computation, which needs mixed-type inputs.

In [None]:
# Feature type breakdown per dataset
for ds in data["datasets"]:
    examples = ds["examples"]
    name = ds["dataset"]
    ex0 = examples[0]
    feature_names = ex0["metadata_feature_names"]
    feature_types = ex0["metadata_feature_types"]

    # Decode the first example's features once per dataset (not once per
    # feature); every line below shows its value from this example as a sample.
    features = json.loads(ex0["input"])

    print(f"\n{'='*60}")
    print(f"Dataset: {name} — Feature Types")
    print(f"{'='*60}")

    for fname, ftype in zip(feature_names, feature_types):
        val = features[fname]
        print(f"  {fname:30s} {ftype:12s}  sample={val}")

### Visualization

Visualize the dataset characteristics: feature type distribution, class balance, and numeric feature distributions, followed by a printed summary table.

In [None]:
def visualize_datasets(data, title_prefix=""):
    """Plot per-dataset characteristics and print a summary table.

    One row of three panels is drawn per dataset: a pie chart of feature
    types, a bar chart of class counts, and box plots of the continuous
    features. Feature names and types are taken from each dataset's first
    example.

    Args:
        data: Mapping with a "datasets" list. Each entry has a "dataset" name
            and an "examples" list whose items carry "input" (a JSON string of
            feature values), "output", "metadata_feature_names",
            "metadata_feature_types" and "metadata_n_classes".
        title_prefix: Text prepended to the figure title.

    Returns:
        None. The figure is shown with ``plt.show()`` and the table is printed.
        When ``data["datasets"]`` is empty, only a short notice is printed.
    """
    datasets = data["datasets"]
    if not datasets:
        # A figure grid needs at least one row, so report and stop early.
        print("No datasets to visualize.")
        return

    n_datasets = len(datasets)
    # squeeze=False keeps `axes` two-dimensional even for a single dataset.
    fig, axes = plt.subplots(
        n_datasets, 3, figsize=(14, 4 * n_datasets), squeeze=False
    )

    for i, ds in enumerate(datasets):
        examples = ds["examples"]
        name = ds["dataset"]
        ex0 = examples[0]
        feature_types = ex0["metadata_feature_types"]
        feature_names = ex0["metadata_feature_names"]

        # --- Panel 1: Feature type distribution (pie chart) ---
        type_counts = Counter(feature_types)
        ax1 = axes[i, 0]
        labels = list(type_counts.keys())
        sizes = list(type_counts.values())
        colors = ["#4C72B0", "#DD8452"]
        ax1.pie(sizes, labels=labels, autopct="%1.0f%%", colors=colors[:len(labels)],
                startangle=90, textprops={"fontsize": 10})
        ax1.set_title(f"{name}\nFeature Types ({len(feature_names)} total)")

        # --- Panel 2: Class distribution (bar chart) ---
        label_counts = Counter(ex["output"] for ex in examples)
        ax2 = axes[i, 1]
        classes = sorted(label_counts)
        counts = [label_counts[c] for c in classes]
        bars = ax2.bar(classes, counts, color=["#55A868", "#C44E52"])
        ax2.set_title(f"{name}\nClass Distribution")
        ax2.set_xlabel("Class")
        ax2.set_ylabel("Count")
        # Padding is given in points, so the gap above each bar is the same
        # whether the counts are 5 or 700.
        ax2.bar_label(bars, labels=[str(c) for c in counts], padding=2, fontsize=10)

        # --- Panel 3: Numeric feature value distributions (box plot) ---
        df = pd.DataFrame([json.loads(ex["input"]) for ex in examples])

        # Select only continuous features
        num_cols = [fname for fname, ftype in zip(feature_names, feature_types)
                    if ftype == "continuous"]
        ax3 = axes[i, 2]
        if not num_cols:
            # Report the real reason instead of blaming the example count.
            ax3.text(0.5, 0.5, "No continuous features",
                     ha="center", va="center", transform=ax3.transAxes)
            ax3.set_title(f"{name}\nNumeric Features")
        elif len(df) <= 1:
            ax3.text(0.5, 0.5, f"Only {len(df)} examples\n(need more for boxplot)",
                     ha="center", va="center", transform=ax3.transAxes)
            ax3.set_title(f"{name}\nNumeric Features")
        else:
            df[num_cols].boxplot(ax=ax3, rot=45, grid=False)
            ax3.set_title(f"{name}\nNumeric Feature Distributions")

    fig.suptitle(f"{title_prefix}OpenML Mixed-Type Classification Benchmarks",
                 fontsize=14, fontweight="bold", y=1.02)
    plt.tight_layout()
    plt.show()

    _print_summary_table(data)


def _print_summary_table(data):
    """Print one row of counts per dataset, then the overall example total."""
    print(f"\n{'Dataset':<15} {'Examples':>8} {'Features':>8} {'Numeric':>8} {'Categorical':>11} {'Classes':>8}")
    print("-" * 68)
    for ds in data["datasets"]:
        ex0 = ds["examples"][0]
        ft = ex0["metadata_feature_types"]
        n_num = sum(1 for t in ft if t == "continuous")
        n_cat = sum(1 for t in ft if t == "categorical")
        print(f"{ds['dataset']:<15} {len(ds['examples']):>8} {len(ft):>8} {n_num:>8} {n_cat:>11} {ex0['metadata_n_classes']:>8}")
    total = sum(len(ds["examples"]) for ds in data["datasets"])
    print(f"\nTotal examples: {total}")

# Plot the data currently bound to `data`; the prefix tags the figure title.
visualize_datasets(data, title_prefix="[Mini] ")

---
## Part 2 — Full Run (Original Parameters)

Load the complete dataset with all 1,690 examples (1,000 credit_g + 690 australian) and re-run the same analysis with original parameters.

In [None]:
# Rebind `data` to the full dataset; the Part 2 cells below read from this name.
data = load_full()

### Full Dataset Summary

Print statistics for the complete dataset — all 1,690 examples across both datasets with 5-fold CV.

In [None]:
# Same parameters as the original data.py (left unchanged)
DATASETS = [
    (31, "credit_g"),
    (40981, "australian"),
]

N_FOLDS = 5
RANDOM_STATE = 42

separator = "=" * 60
print(separator)
print("OpenML Mixed-Type Classification Benchmarks — FULL DATA")
print(separator)

for ds in data["datasets"]:
    examples = ds["examples"]
    name = ds["dataset"]
    n_examples = len(examples)

    # The first example supplies the dataset-level metadata
    ex0 = examples[0]
    feature_names = ex0["metadata_feature_names"]
    feature_types = ex0["metadata_feature_types"]
    n_classes = ex0["metadata_n_classes"]

    # Each matching entry contributes True (== 1) to the sum
    n_cat = sum(t == "categorical" for t in feature_types)
    n_num = sum(t == "continuous" for t in feature_types)

    # How many examples fall into each class and each CV fold
    label_counts = Counter(ex["output"] for ex in examples)
    fold_counts = Counter(ex["metadata_fold"] for ex in examples)
    folds_in_order = dict(sorted(fold_counts.items()))

    print(f"\n{separator}")
    print(f"Dataset: {name}")
    print(f"  Examples: {n_examples}")
    print(f"  Features: {len(feature_names)} ({n_num} numeric + {n_cat} categorical)")
    print(f"  Classes: {n_classes}, distribution: {dict(label_counts)}")
    print(f"  Folds: {folds_in_order}")
    print(f"  Task type: {ex0['metadata_task_type']}")

### Full Dataset Visualization

Re-run the same visualization with all 1,690 examples — now the box plots and class distributions show the true data characteristics.

In [None]:
# Plot the data currently bound to `data`; the prefix tags the figure title.
visualize_datasets(data, title_prefix="[Full] ")