# Sklearn Built-in Tabular Classification Datasets — Demo

This notebook demonstrates the data collection pipeline for **sklearn built-in tabular classification datasets**. The original script loads the Breast Cancer Wisconsin (569 samples, 30 features, binary classification) and Wine (178 samples, 13 features, 3-class) datasets from `sklearn.datasets`, then:

1. **Standardizes** continuous features with `StandardScaler`
2. **Discretizes** features using `KBinsDiscretizer` (5-bin and 10-bin quantile)
3. **Generates** 5-fold `StratifiedKFold` cross-validation indices
4. **Outputs** each row as a separate example with JSON feature dicts and class labels

**Part 1** previews a small curated subset of pre-generated examples and runs a quick demo of the pipeline with reduced parameters (`n_folds=2`).
**Part 2** runs the full pipeline with all original parameters.

In [None]:
import json

import numpy as np
from sklearn.datasets import load_breast_cancer, load_wine
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

import matplotlib.pyplot as plt

In [None]:
# Remote locations of the pre-generated demo JSON files. The loaders below try
# these URLs first and fall back to same-named files in the working directory.
GITHUB_FULL_DATA_URL = "https://raw.githubusercontent.com/AMGrobelnik/ai-invention-ac2586-synergy-guided-oblique-splits-using-part/main/sklearn_tabular/demo/full_demo_data.json"
GITHUB_MINI_DATA_URL = "https://raw.githubusercontent.com/AMGrobelnik/ai-invention-ac2586-synergy-guided-oblique-splits-using-part/main/sklearn_tabular/demo/mini_demo_data.json"
import json, os

def _load_json(url, local_path, timeout=30.0):
    """Load a JSON document from ``url``, falling back to ``local_path``.

    The remote fetch is best-effort: any failure (network error, timeout,
    malformed payload) triggers the local fallback instead of propagating.

    Args:
        url: Location of the JSON document, opened with ``urllib``.
        local_path: Path of a local copy used when the remote fetch fails.
        timeout: Seconds to wait on the remote connection before giving up,
            so an unresponsive host cannot hang the caller indefinitely.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: If the remote fetch failed and ``local_path`` does
            not exist. The remote failure is attached as ``__cause__``.
    """
    import urllib.request

    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return json.loads(response.read().decode("utf-8"))
    except Exception as exc:
        # Deliberately broad: every remote failure should fall through to the
        # local copy. Keep the error so it can be reported if that fails too.
        remote_error = exc

    if os.path.exists(local_path):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(local_path, encoding="utf-8") as f:
            return json.load(f)

    raise FileNotFoundError(f"Could not load {local_path}") from remote_error

def load_mini():
    """Return the mini demo data, fetched remotely or read from ``mini_demo_data.json``."""
    return _load_json(
        url=GITHUB_MINI_DATA_URL,
        local_path="mini_demo_data.json",
    )

def load_full():
    """Return the full demo data, fetched remotely or read from ``full_demo_data.json``."""
    return _load_json(
        url=GITHUB_FULL_DATA_URL,
        local_path="full_demo_data.json",
    )

## Part 1 — Quick Demo (Mini Data)

Load a small curated subset (11 examples across 2 datasets) to preview the output schema. The processing pipeline is then run on the sklearn datasets themselves, with reduced parameters for a fast demo.

In [None]:
# Load the pre-generated mini dataset; `data` is reused by the inspection cell below.
data = load_mini()

# Report how many datasets were loaded and how many examples each one holds.
print(f"Number of datasets: {len(data['datasets'])}")
for ds in data["datasets"]:
    print(f"  {ds['dataset']}: {len(ds['examples'])} examples")
# Show the per-example schema using the first example of the first dataset.
print(f"\nFirst example keys: {list(data['datasets'][0]['examples'][0].keys())}")

### Process Dataset

The core `process_dataset` function from the original script loads a sklearn dataset, applies StandardScaler, KBinsDiscretizer (5 and 10 bins), and assigns StratifiedKFold indices. Each row becomes a separate example with JSON feature dict as input and class label as output.

For the quick demo, we use **n_folds=2** (instead of 5) to speed things up.

In [None]:
def process_dataset(
    name: str,
    loader_func,
    n_folds: int = 5,
    random_state: int = 42,
) -> dict:
    """Turn one sklearn dataset into a list of per-row example records.

    Args:
        name: Label stored under the ``"dataset"`` key of the result.
        loader_func: Zero-argument callable returning an object exposing
            ``data``, ``target``, ``feature_names`` and ``target_names``.
        n_folds: Number of stratified cross-validation folds to assign.
        random_state: Seed for the shuffled fold split.

    Returns:
        ``{"dataset": name, "examples": [...]}`` with one example per row.
        Each example holds the raw features as a JSON string (``"input"``),
        the class name (``"output"``) and ``metadata_*`` fields carrying the
        fold index, standardized values and 5-/10-bin discretized values.
    """
    bunch = loader_func()
    raw = bunch.data
    targets = bunch.target
    feature_names = list(map(str, bunch.feature_names))
    target_names = list(map(str, bunch.target_names))
    n_samples, n_features = raw.shape
    n_classes = len(target_names)

    print(f"  Processing {name}: {n_samples} samples, {n_features} features, {n_classes} classes")

    # Standardized view of the continuous features.
    standardized = StandardScaler().fit_transform(raw)

    def _quantile_bins(n_bins):
        """Ordinal quantile-bin codes of the raw features for ``n_bins`` bins."""
        binner = KBinsDiscretizer(
            n_bins=n_bins,
            encode="ordinal",
            strategy="quantile",
            subsample=None,
        )
        return binner.fit_transform(raw)

    binned_5 = _quantile_bins(5)
    binned_10 = _quantile_bins(10)

    # Each row appears in exactly one test split; record that split's index.
    splitter = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=random_state,
    )
    fold_of_row = np.zeros(n_samples, dtype=int)
    for fold_no, (_train_rows, held_out_rows) in enumerate(splitter.split(raw, targets)):
        fold_of_row[held_out_rows] = fold_no

    def _build_example(row):
        """Assemble the record for a single row index."""
        # The input string carries the original (unscaled) feature values.
        raw_features = {
            fname: round(float(raw[row, col]), 6)
            for col, fname in enumerate(feature_names)
        }
        return {
            "input": json.dumps(raw_features),
            "output": str(target_names[targets[row]]),
            "metadata_fold": int(fold_of_row[row]),
            "metadata_feature_names": feature_names,
            "metadata_task_type": "classification",
            "metadata_n_classes": n_classes,
            "metadata_row_index": row,
            "metadata_n_features": n_features,
            "metadata_n_samples": n_samples,
            "metadata_target_names": target_names,
            "metadata_standardized_values": [round(float(v), 6) for v in standardized[row]],
            "metadata_discretized_5bin": [int(v) for v in binned_5[row]],
            "metadata_discretized_10bin": [int(v) for v in binned_10[row]],
        }

    examples = [_build_example(row) for row in range(n_samples)]

    print(f"  -> Generated {len(examples)} examples for {name}")
    return {"dataset": name, "examples": examples}

### Run Processing (Quick Demo)

Run the pipeline on both datasets with **n_folds=2** for a quick demo. The original script uses `n_folds=5`.

In [None]:
print("=" * 60)
print("Loading sklearn built-in tabular classification datasets")
print("=" * 60)

# (output name, sklearn loader) pairs to process.
datasets_config = [
    ("breast_cancer_wisconsin", load_breast_cancer),
    ("wine", load_wine),
]

# Quick demo: n_folds=2 (original: n_folds=5)
# `all_datasets` is consumed by the visualization cell further down.
all_datasets = []
for name, loader in datasets_config:
    dataset_entry = process_dataset(name=name, loader_func=loader, n_folds=2)
    all_datasets.append(dataset_entry)

# Same top-level shape as the pre-generated JSON files: {"datasets": [...]}.
output = {"datasets": all_datasets}

# Summary
total_examples = sum(len(d["examples"]) for d in all_datasets)
print(f"\nTotal datasets: {len(all_datasets)}")
print(f"Total examples: {total_examples}")
for d in all_datasets:
    print(f"  {d['dataset']}: {len(d['examples'])} examples")

### Inspect Pre-generated Data

Inspect the first example of each dataset in the pre-generated mini dataset loaded from JSON, for comparison with the freshly processed output above.

In [None]:
# Inspect a pre-generated example from the mini dataset
# (`data` is whatever the most recent load cell assigned).
for ds in data["datasets"]:
    ex = ds["examples"][0]
    print(f"--- {ds['dataset']} (first example) ---")
    print(f"  Output label: {ex['output']}")
    print(f"  Fold: {ex['metadata_fold']}")
    print(f"  Task type: {ex['metadata_task_type']}")
    print(f"  N classes: {ex['metadata_n_classes']}")
    print(f"  Target names: {ex['metadata_target_names']}")
    print(f"  N features: {ex['metadata_n_features']}")
    # "input" is stored as a JSON string, so decode it before slicing.
    features = json.loads(ex["input"])
    print(f"  Input features (first 5): {dict(list(features.items())[:5])}")
    print(f"  Standardized (first 5): {ex['metadata_standardized_values'][:5]}")
    print(f"  Discretized 5-bin (first 5): {ex['metadata_discretized_5bin'][:5]}")
    print(f"  Discretized 10-bin (first 5): {ex['metadata_discretized_10bin'][:5]}")
    print()

### Visualize Results

Reusable visualization: class distribution, feature value distributions (standardized), and fold assignment counts for each dataset.

In [None]:
def visualize_results(datasets_list, title_prefix=""):
    """Plot per-dataset diagnostics and print a summary table.

    For every dataset one row of three panels is drawn: class distribution,
    box plots of the first (up to five) standardized features, and the number
    of examples per fold. A text table with example/feature/class/fold counts
    is printed afterwards.

    Args:
        datasets_list: Sequence of ``{"dataset": str, "examples": list}``
            dicts whose examples carry ``"output"``, ``"metadata_fold"``,
            ``"metadata_standardized_values"``, ``"metadata_feature_names"``,
            ``"metadata_n_features"`` and ``"metadata_n_classes"``.
        title_prefix: Text prepended to the figure title.

    Returns:
        None. Datasets without examples are skipped; if nothing remains, a
        short notice is printed and no figure is created.
    """
    # Empty datasets have nothing to plot and would fail on examples[0];
    # an empty list would fail in plt.subplots (zero rows).
    plottable = [ds for ds in datasets_list if ds["examples"]]
    if not plottable:
        print("No examples to visualize.")
        return

    n_datasets = len(plottable)
    # squeeze=False keeps `axes` two-dimensional even for a single dataset.
    fig, axes = plt.subplots(
        n_datasets, 3, figsize=(15, 4 * n_datasets), squeeze=False
    )

    for idx, ds in enumerate(plottable):
        name = ds["dataset"]
        examples = ds["examples"]
        ax_classes, ax_features, ax_folds = axes[idx]

        # 1. Class distribution
        labels = [ex["output"] for ex in examples]
        unique_labels, counts = np.unique(labels, return_counts=True)
        ax_classes.bar(unique_labels, counts, color=plt.cm.Set2.colors[:len(unique_labels)])
        ax_classes.set_title(f"{name}\nClass Distribution")
        ax_classes.set_ylabel("Count")
        ax_classes.tick_params(axis="x", rotation=15)

        # 2. Standardized feature distributions (first 5 features)
        std_vals = np.array([ex["metadata_standardized_values"] for ex in examples])
        feature_names = examples[0]["metadata_feature_names"]
        n_show = min(5, std_vals.shape[1])
        bp = ax_features.boxplot(
            [std_vals[:, j] for j in range(n_show)],
            patch_artist=True,
        )
        # Label ticks explicitly: boxplot's `labels=` keyword is deprecated in
        # newer Matplotlib, and its replacement is missing from older versions.
        ax_features.set_xticklabels([fn[:12] for fn in feature_names[:n_show]])
        for patch, color in zip(bp["boxes"], plt.cm.Set2.colors):
            patch.set_facecolor(color)
        ax_features.set_title(f"{name}\nStandardized Features (first {n_show})")
        ax_features.tick_params(axis="x", rotation=30)

        # 3. Fold distribution
        folds = [ex["metadata_fold"] for ex in examples]
        unique_folds, fold_counts = np.unique(folds, return_counts=True)
        ax_folds.bar(unique_folds.astype(str), fold_counts, color="steelblue")
        ax_folds.set_title(f"{name}\nFold Distribution")
        ax_folds.set_xlabel("Fold")
        ax_folds.set_ylabel("Count")

    fig.suptitle(f"{title_prefix}Dataset Processing Results", fontsize=14, fontweight="bold", y=1.02)
    plt.tight_layout()
    plt.show()

    # Print summary table
    print(f"\n{'Dataset':<30} {'Examples':>8} {'Features':>8} {'Classes':>8} {'Folds':>6}")
    print("-" * 62)
    for ds in plottable:
        ex0 = ds["examples"][0]
        # Count the folds actually present rather than trusting a parameter.
        n_folds = len({ex["metadata_fold"] for ex in ds["examples"]})
        print(f"{ds['dataset']:<30} {len(ds['examples']):>8} {ex0['metadata_n_features']:>8} {ex0['metadata_n_classes']:>8} {n_folds:>6}")

# Visualize quick demo results (the n_folds=2 run stored in `all_datasets`)
visualize_results(all_datasets, title_prefix="Quick Demo: ")

## Part 2 — Full Run (Original Parameters)

Load the complete dataset (747 examples across 2 datasets) and re-run the processing pipeline with **all original parameters** restored: `n_folds=5` (5-fold StratifiedKFold), `random_state=42`.

In [None]:
# Rebinds `data` (previously the mini dataset) to the full pre-generated dataset.
data = load_full()

# Report how many datasets were loaded and how many examples each one holds.
print(f"Full dataset: {len(data['datasets'])} datasets")
for ds in data["datasets"]:
    print(f"  {ds['dataset']}: {len(ds['examples'])} examples")

In [None]:
print("=" * 60)
print("Full Run — Original Parameters (n_folds=5)")
print("=" * 60)

# (output name, sklearn loader) pairs; repeated here so this cell can run
# without the quick-demo cell having been executed.
datasets_config = [
    ("breast_cancer_wisconsin", load_breast_cancer),
    ("wine", load_wine),
]

# Full run: original n_folds=5
# Results go to `all_datasets_full` so the quick-demo results stay available.
all_datasets_full = []
for name, loader in datasets_config:
    dataset_entry = process_dataset(name=name, loader_func=loader, n_folds=5)
    all_datasets_full.append(dataset_entry)

# Same top-level shape as the pre-generated JSON files: {"datasets": [...]}.
output_full = {"datasets": all_datasets_full}

# Summary
total_examples = sum(len(d["examples"]) for d in all_datasets_full)
print(f"\nTotal datasets: {len(all_datasets_full)}")
print(f"Total examples: {total_examples}")
for d in all_datasets_full:
    print(f"  {d['dataset']}: {len(d['examples'])} examples")

In [None]:
# Visualize full run results (the n_folds=5 run stored in `all_datasets_full`)
visualize_results(all_datasets_full, title_prefix="Full Run: ")