# Tabular Benchmark Dataset Standardizer for DOTS

This notebook demonstrates `data.py`, which loads the **imodels/tabular-benchmark-797-classification**
dataset (from the OpenML-797 benchmark suite used in RO-FIGS) and outputs standardized examples
for Dictionary-Constrained Oblique Tree Sums (DOTS) evaluation.

Each example represents a tabular data row as a binary classification task:
- **input**: Feature vector as structured text (44 numeric features: F1R–F22R, F1S–F22S)
- **context**: Full feature dictionary + metadata
- **output**: Binary classification label (`"0"` or `"1"`)

---

**Part 1 — Quick Demo:** Runs on a small 15-example subset loaded from `demo_data.json`.  
**Part 2 — Full Run:** Shows the original `main()` pipeline that processes all 200 examples.

## Part 1 — Quick Demo

### 1.1 Load Demo Data

Load a 15-sample demo subset. Tries the GitHub raw URL first (for Colab), then falls back to the local file.

In [None]:
# Remote copy of the demo payload; load_data() tries this URL first.
GITHUB_RAW_URL = "https://raw.githubusercontent.com/AMGrobelnik/ai-invention-54ecf4-dictionary-constrained-oblique-tree-sums/main/dataset_iter1_tabular_bench/demo/demo_data.json"
# Relative path that load_data() falls back to when the download fails.
LOCAL_FILE = "demo_data.json"
import json, os
def load_data():
    """Load the demo payload from GitHub, falling back to the local file.

    The download from ``GITHUB_RAW_URL`` is best-effort: any failure (network,
    timeout, bad JSON) is remembered and the function then tries ``LOCAL_FILE``.

    Returns:
        The parsed JSON document. The calling cell indexes it with
        ``data['examples']``, so a dict is presumably expected.

    Raises:
        FileNotFoundError: if the download failed and ``LOCAL_FILE`` does not
            exist. The download error is attached as ``__cause__`` so the
            reason for the fallback is visible in the traceback.
    """
    remote_error = None
    try:
        import urllib.request
        with urllib.request.urlopen(GITHUB_RAW_URL, timeout=5) as response:
            return json.loads(response.read().decode("utf-8"))
    except Exception as err:
        # Deliberately broad: the remote copy is optional. Keep the error so
        # it can be reported if the local fallback is missing too.
        remote_error = err

    if os.path.exists(LOCAL_FILE):
        # JSON is UTF-8 by convention; do not depend on the platform default.
        with open(LOCAL_FILE, encoding="utf-8") as f:
            return json.load(f)
    raise FileNotFoundError(
        "Could not load data from GitHub or local file"
    ) from remote_error
# Load once at cell scope; later cells read the examples from `data`.
data = load_data()
print(f"Loaded {len(data['examples'])} examples")
# "metadata" may be absent from the payload, hence the 'N/A' fallback.
print(f"Metadata: {data.get('metadata', 'N/A')}")

### 1.2 Imports

Original imports from `data.py`.

In [None]:
import json
import random
from pathlib import Path
from typing import Any

### 1.3 Constants

Original constants from `data.py`. File paths are not used in the demo (data is already loaded above).

In [None]:
# AIDEV-NOTE: Working directory is the script's parent directory
# WD = Path(__file__).parent  # Not applicable in notebook
# DATASETS_DIR = WD / "temp" / "datasets"  # Not applicable in notebook
# OUTPUT_FILE = WD / "data_out.json"  # Not applicable in notebook
# Passed by main() to process_tabular_benchmark() as n_examples.
EXAMPLES_PER_DATASET = 200
# Seed for the random.Random instance the processors use when sampling rows.
RANDOM_SEED = 42

### 1.4 Helper Functions

`load_json` loads a JSON file. `clean_feature_name` sanitizes feature names to valid Python identifiers.
`format_features_as_text` converts a feature dict into a readable text prompt for the model input.

In [None]:
def load_json(filepath: Path) -> list[dict[str, Any]]:
    """Read *filepath* as UTF-8 text and return the parsed JSON content."""
    raw_text = filepath.read_text(encoding="utf-8")
    return json.loads(raw_text)


def clean_feature_name(name: str) -> str:
    """Sanitize a feature name so it only contains alphanumerics and underscores.

    Every character that is neither alphanumeric nor ``_`` (spaces, hyphens,
    dots, punctuation, ...) becomes ``_``. A result that starts with a digit
    is prefixed with ``f_``. An empty name is returned unchanged.
    """
    sanitized = "".join(
        ch if (ch == "_" or ch.isalnum()) else "_" for ch in name
    )
    # Slicing (rather than indexing) keeps the empty string safe.
    if sanitized[:1].isdigit():
        return "f_" + sanitized
    return sanitized


def format_features_as_text(features: dict[str, Any]) -> str:
    """Render a feature mapping as the text prompt used for the input field.

    The prompt is a fixed header line followed by one indented
    ``name: value`` line per feature, in the mapping's iteration order.
    Float values are shown with four decimal places; everything else uses
    its default string form.
    """
    header = "Classify the following tabular data sample:\n"
    rendered = (
        f"  {name}: {value:.4f}" if isinstance(value, float) else f"  {name}: {value}"
        for name, value in features.items()
    )
    return header + "\n".join(rendered)

### 1.5 Core Processor: `process_tabular_benchmark`

Processes the imodels/tabular-benchmark-797-classification dataset.
This dataset has 44 numeric features (F1R–F22R, F1S–F22S) + a binary target.
Already fully numeric — no preprocessing needed beyond field standardization.

In [None]:
def process_tabular_benchmark(
    records: list[dict[str, Any]],
    n_examples: int,
) -> list[dict[str, Any]]:
    """Process imodels/tabular-benchmark-797-classification dataset.

    This dataset has 44 numeric features (F1R..F22R, F1S..F22S) + binary target.
    Already fully numeric; no preprocessing needed beyond field standardization.

    Args:
        records: Raw rows as dicts. The keys of the first row define the
            feature columns; every row must provide those keys plus a
            ``"target"`` value that ``int()`` accepts.
        n_examples: Maximum number of rows to convert. When ``records`` holds
            more rows than this, a sample of ``n_examples`` rows is drawn with
            a ``random.Random(RANDOM_SEED)`` generator; otherwise all rows are
            used in their original order.

    Returns:
        One dict per selected row with the keys ``input``, ``context``,
        ``output``, ``dataset`` and ``split``. An empty ``records`` list
        yields an empty list.

    Raises:
        KeyError: if a selected row lacks ``"target"`` or a feature column.
    """
    # Nothing to standardize, and records[0] below would raise IndexError.
    if not records:
        return []

    # AIDEV-NOTE: Remove index columns that are not features
    drop_cols = {"Unnamed: 0.1", "Unnamed: 0", "target"}
    feature_cols = [c for c in records[0].keys() if c not in drop_cols]
    feature_cols_clean = {c: clean_feature_name(c) for c in feature_cols}

    # Sample n_examples deterministically
    rng = random.Random(RANDOM_SEED)
    if len(records) > n_examples:
        sampled = rng.sample(records, n_examples)
    else:
        sampled = records

    examples = []
    for row in sampled:
        target = int(row["target"])
        features = {
            feature_cols_clean[c]: row[c] for c in feature_cols
        }

        example = {
            "input": format_features_as_text(features),
            "context": {
                "features": features,
                "n_features": len(features),
                "task_type": "binary_classification",
                "dataset_source": "OpenML-797 benchmark suite",
                "feature_type": "numeric",
                "preprocessing": "none_needed_already_numeric",
            },
            "output": str(target),
            "dataset": "imodels/tabular-benchmark-797-classification",
            "split": "test",
        }
        examples.append(example)

    return examples

### 1.6 Secondary Processor: `process_churn_prediction`

Alternative processor for the scikit-learn/churn-prediction dataset (not used in the final output,
but included for completeness). Handles mixed categorical + numeric features with one-hot encoding.

In [None]:
def process_churn_prediction(
    records: list[dict[str, Any]],
    n_examples: int,
) -> list[dict[str, Any]]:
    """Process scikit-learn/churn-prediction dataset.

    This dataset has mixed features (categorical + numeric) + binary Churn target.
    Preprocessing: one-hot encode categoricals, convert target to 0/1.

    Args:
        records: Raw rows as dicts keyed by the original column names.
        n_examples: Maximum number of rows to sample. When ``records`` holds
            more rows than this, ``n_examples`` rows are drawn with a
            ``random.Random(RANDOM_SEED)`` generator; otherwise all rows are
            used in order.

    Returns:
        One dict per sampled row that has a non-None ``"Churn"`` value, with
        the keys ``input``, ``context``, ``output``, ``dataset`` and ``split``.
        Rows without a target are skipped, so the result may be shorter than
        ``n_examples``.
    """
    # AIDEV-NOTE: customerID (identifier, not a feature) is excluded simply
    # because it appears in neither column list below.
    target_col = "Churn"

    # Identify categorical vs numeric features
    categorical_cols = [
        "gender", "Partner", "Dependents", "PhoneService",
        "MultipleLines", "InternetService", "OnlineSecurity",
        "OnlineBackup", "DeviceProtection", "TechSupport",
        "StreamingTV", "StreamingMovies", "Contract",
        "PaperlessBilling", "PaymentMethod",
    ]
    numeric_cols = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]
    source_cols = numeric_cols + categorical_cols

    # Cleaned numeric names do not depend on the row: compute them once.
    numeric_names = {col: clean_feature_name(col) for col in numeric_cols}

    # Encode target: Yes=1, No=0
    target_map = {"Yes": 1, "No": 0}

    # Sample n_examples deterministically
    rng = random.Random(RANDOM_SEED)
    if len(records) > n_examples:
        sampled = rng.sample(records, n_examples)
    else:
        sampled = records

    examples = []
    for row in sampled:
        target_val = row.get(target_col)
        if target_val is None:
            continue
        label_text = str(target_val)
        if label_text in target_map:
            target = target_map[label_text]
        elif label_text.isdigit():
            target = int(target_val)
        else:
            # NOTE(review): any unrecognized label silently becomes class 0;
            # confirm this is intended rather than skipping the row.
            target = 0

        # Build feature dict with one-hot encoding for categoricals
        features: dict[str, Any] = {}

        # Numeric features
        for col, name in numeric_names.items():
            val = row.get(col)
            if val is None or val == "" or val == " ":
                # AIDEV-NOTE: Impute missing with 0.0 (median would require full dataset scan)
                features[name] = 0.0
            else:
                try:
                    features[name] = float(val)
                except (ValueError, TypeError):
                    # Unparseable values get the same 0.0 imputation.
                    features[name] = 0.0

        # One-hot encode categorical features
        for col in categorical_cols:
            # "Unknown" only applies when the key is absent; a present None
            # value is rendered as the string "None".
            val = str(row.get(col, "Unknown"))
            # AIDEV-NOTE: Set current category to 1, others implicitly 0 in context
            features[clean_feature_name(f"{col}_{val}")] = 1

        # Original (pre-encoding) values for context
        original_features = {col: row.get(col) for col in source_cols}

        example = {
            "input": format_features_as_text(features),
            "context": {
                "features": features,
                "original_features": original_features,
                "n_features_original": len(source_cols),
                "n_features_encoded": len(features),
                "task_type": "binary_classification",
                "dataset_source": "scikit-learn/churn-prediction (IBM Telco)",
                "feature_type": "mixed_categorical_numeric",
                "preprocessing": "one_hot_encoded_categoricals",
                # Fresh copy per example so examples never share a dict.
                "target_encoding": dict(target_map),
            },
            "output": str(target),
            "dataset": "scikit-learn/churn-prediction",
            "split": "train",
        }
        examples.append(example)

    return examples

### 1.7 Run on Demo Data

Use the pre-processed examples from `demo_data.json` and run the summary statistics
from the original `main()` function.

In [None]:
# Use the loaded demo examples directly (already processed by data.py)
tabular_examples = data["examples"][:15]  # DEMO: use 15 examples. Original: all 200 examples

print(f"Total examples: {len(tabular_examples)}")

# Summary statistics (from original main())
# Each example's "output" is a string label; convert to int so it can be summed.
targets = [int(e["output"]) for e in tabular_examples]
# Fraction of class-1 labels; falls back to 0 when there are no examples.
class_1_frac = sum(targets) / len(targets) if targets else 0
print(f"\n  tabular-benchmark-797-classification (SELECTED):")
print(f"    Source: imodels/tabular-benchmark-797-classification (OpenML-797 suite)")
print(f"    Task: binary classification")
# Reads the feature count from the first example, so this line needs at least one example.
print(f"    Features: {tabular_examples[0]['context']['n_features']} (all numeric)")
print(f"    Class balance: {class_1_frac:.3f} (class=1)")
print(f"    Examples: {len(tabular_examples)}")

### 1.8 Inspect an Example

Show the structure of a single processed example.

In [None]:
# Inspect the first demo example (requires a non-empty tabular_examples).
ex = tabular_examples[0]
print("=== Example 0 ===")
# Only the first 300 characters of the prompt are shown to keep the output short.
print(ex["input"][:300], "...")
print(f"\nOutput (target): {ex['output']}")
print(f"Dataset: {ex['dataset']}")
print(f"Split: {ex['split']}")
print(f"Context keys: {list(ex['context'].keys())}")
print(f"Number of features: {ex['context']['n_features']}")
print(f"Feature names (first 6): {list(ex['context']['features'].keys())[:6]}")

---

## Part 2 — Full Run — Original Parameters

The cell below shows how to restore all original parameters to reproduce the full pipeline results.
Uncommenting those lines will use the complete dataset (200 examples) instead of the 15-example demo subset.

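The subsequent cell contains the original `main()` function, which loads raw records from disk,
processes up to 200 examples via `process_tabular_benchmark`, and writes `data_out.json`.
In this notebook it is kept for reference only: calling it raises a `RuntimeError` before any of that code runs, so use `python data.py` to run the full pipeline.
The full pipeline requires the raw dataset file at `temp/datasets/full_imodels_tabular-benchmark-797-classification_test.json`.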

> **Note:** The full run may take significantly longer depending on data size and available resources.

In [None]:
# Uncomment to run with original parameters:
# tabular_examples = data["examples"]  # all 200 examples (DEMO used: 15)
# EXAMPLES_PER_DATASET = 200  # Original: 200 (already set above)

In [None]:
def main() -> None:
    """Main processing pipeline (reference copy; not runnable in this notebook).

    AIDEV-NOTE: Only outputs the BEST dataset (tabular-benchmark-797-classification).
    Selected over churn-prediction because it's directly from the OpenML-797 benchmark
    suite used in RO-FIGS, has pure numeric features, and needs no preprocessing.

    Raises:
        RuntimeError: always. The guard on the first statement fires before
            any of the pipeline code below it can run.
    """
    # NOTE: This cell is for reference only. It requires raw dataset files not included in the demo.
    # To run the full pipeline, use: python data.py
    raise RuntimeError("Full pipeline requires raw dataset files. Run 'python data.py' instead.")

    # Everything below is unreachable because of the raise above; it is kept
    # to show what the original pipeline does.
    DATASETS_DIR = Path("temp") / "datasets"
    OUTPUT_FILE = Path("data_out.json")

    print(f"Loading datasets from: {DATASETS_DIR}")

    # Load the selected best dataset
    tabular_file = DATASETS_DIR / "full_imodels_tabular-benchmark-797-classification_test.json"
    tabular_records = load_json(tabular_file)
    print(f"  tabular-benchmark-797: {len(tabular_records)} records loaded")

    # Process selected dataset (at most EXAMPLES_PER_DATASET examples)
    tabular_examples = process_tabular_benchmark(
        records=tabular_records,
        n_examples=EXAMPLES_PER_DATASET,
    )
    print(f"  tabular-benchmark-797: {len(tabular_examples)} examples extracted")

    # Build output matching exp_sel_data_out.json schema
    output = {"examples": tabular_examples}

    # Save to OUTPUT_FILE (data_out.json)
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\nOutput saved to: {OUTPUT_FILE}")
    print(f"Total examples: {len(tabular_examples)}")

    # Summary statistics
    targets = [int(e["output"]) for e in tabular_examples]
    # Fraction of class-1 labels; falls back to 0 when there are no examples.
    class_1_frac = sum(targets) / len(targets) if targets else 0
    print(f"\n  tabular-benchmark-797-classification (SELECTED):")
    print(f"    Source: imodels/tabular-benchmark-797-classification (OpenML-797 suite)")
    print(f"    Task: binary classification")
    # Reads the feature count from the first example, so it needs at least one.
    print(f"    Features: {tabular_examples[0]['context']['n_features']} (all numeric)")
    print(f"    Class balance: {class_1_frac:.3f} (class=1)")
    print(f"    Examples: {len(tabular_examples)}")

---

## Visualization

Feature distributions and class balance for the demo subset.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Extract feature matrix and labels
# Column order is taken from the first example's feature dict; every example
# is indexed with those same names, so all must share that key set.
feature_names = list(tabular_examples[0]["context"]["features"].keys())
X = np.array([[ex["context"]["features"][f] for f in feature_names] for ex in tabular_examples])
y = np.array([int(ex["output"]) for ex in tabular_examples])

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# --- Plot 1: Class distribution ---
counts = [int(np.sum(y == 0)), int(np.sum(y == 1))]
bars = axes[0, 0].bar(["Class 0", "Class 1"], counts, color=["#4C72B0", "#DD8452"])
axes[0, 0].set_title("Class Distribution")
axes[0, 0].set_ylabel("Count")
# Write each count just above its bar.
for bar, c in zip(bars, counts):
    axes[0, 0].text(bar.get_x() + bar.get_width()/2, c + 0.15, str(c),
                    ha="center", fontweight="bold")

# --- Plot 2: Mean feature values by class ---
# A class with no rows gets an all-zero mean vector instead of NaNs.
mean_0 = X[y == 0].mean(axis=0) if np.any(y == 0) else np.zeros(X.shape[1])
mean_1 = X[y == 1].mean(axis=0) if np.any(y == 1) else np.zeros(X.shape[1])
x_pos = np.arange(len(feature_names))
# Offset the two bar series by +/-0.2 so they sit side by side per feature.
axes[0, 1].bar(x_pos - 0.2, mean_0, 0.4, label="Class 0", color="#4C72B0", alpha=0.8)
axes[0, 1].bar(x_pos + 0.2, mean_1, 0.4, label="Class 1", color="#DD8452", alpha=0.8)
axes[0, 1].set_title("Mean Feature Values by Class")
axes[0, 1].set_xlabel("Feature Index")
axes[0, 1].set_ylabel("Mean Value")
axes[0, 1].legend(fontsize=8)
# Label only every 4th feature to keep the axis readable.
axes[0, 1].set_xticks(x_pos[::4])
axes[0, 1].set_xticklabels([feature_names[i] for i in range(0, len(feature_names), 4)],
                           rotation=45, ha="right", fontsize=7)

# --- Plot 3: Feature value spread (box plot, first 10 features) ---
# NOTE(review): newer Matplotlib releases (3.9+) rename boxplot's `labels=`
# to `tick_labels=` and deprecate the old name; confirm against the
# installed version before changing.
bp = axes[1, 0].boxplot(X[:, :10], labels=feature_names[:10], patch_artist=True)
for patch in bp["boxes"]:
    patch.set_facecolor("#A1C9F4")
axes[1, 0].set_title("Feature Value Spread (First 10 Features)")
axes[1, 0].tick_params(axis="x", rotation=45)

# --- Plot 4: Per-feature std deviation ---
stds = X.std(axis=0)
axes[1, 1].barh(feature_names, stds, color="#FFBE7A", edgecolor="#DD8452")
axes[1, 1].set_title("Feature Std. Deviation")
axes[1, 1].set_xlabel("Std Dev")
# Flip the y-axis so the first feature is drawn at the top.
axes[1, 1].invert_yaxis()
axes[1, 1].tick_params(axis="y", labelsize=6)

plt.tight_layout()
# Saved into the current working directory before the figure is shown.
plt.savefig("demo_visualization.png", dpi=120, bbox_inches="tight")
plt.show()

print(f"\n{'='*50}")
print(f"Dataset Summary")
print(f"{'='*50}")
print(f"  Feature matrix shape: {X.shape}")
print(f"  Value range: [{X.min():.2f}, {X.max():.2f}]")
print(f"  Class counts: 0={counts[0]}, 1={counts[1]}")
# sum(counts) only covers labels 0 and 1; it is zero (division error) if neither occurs.
print(f"  Class 1 fraction: {counts[1]/sum(counts):.3f}")
print(f"  Feature names: {feature_names[:6]} ... ({len(feature_names)} total)")