# KAIROS Benchmark Evaluation

**This notebook produces the metrics reported in the README.**

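It provides a rigorous comparison of the **KAIROS Hybrid Ensemble** against two standard industry baselines. The three models evaluated are:
1. **Logistic Regression** (Linear Baseline)
2. **Random Forest** (Bagging Baseline)
3. **KAIROS** (Calibration + Risk Policy)

Note: the baselines are scored on every test sample, whereas KAIROS precision is computed only on the samples where its policy does not abstain. The **Automation Rate** column reports that covered fraction, so read the two columns together.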

## 1. Setup & Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
import sys
import os

# Add project root to path
sys.path.append("..")

from src.kairos.data.loader import load_adult_data
from src.kairos.core.pipeline import KairosInferenceEngine
from src.kairos.core.policy import KairosPolicy

# Plot defaults for the whole notebook: seaborn whitegrid theme, 10x6 inch figures.
# The rc override is applied by set_theme after the theme itself is installed.
sns.set_theme(style="whitegrid", rc={"figure.figsize": [10, 6]})

In [None]:
# Rebuild the held-out test split. random_state=42 is meant to reproduce the
# split used at training time (NOTE(review): confirm it matches main.py).
df = load_adult_data("../data")
y = df["target"]
X = df.drop(columns=["target", "income", "fnlwgt", "education"])

# The baselines need numeric inputs: swap every object-dtype column for its
# integer category codes (a simple ordinal encoding), leaving X itself untouched.
object_cols = X.select_dtypes("object").columns
X_baseline = X.assign(
    **{name: X[name].astype("category").cat.codes for name in object_cols}
)

# A single call splits the encoded frame, the raw frame and the labels with the
# same shuffled indices, so X_test (baselines) and X_raw_test (KAIROS) contain
# exactly the same rows in the same order.
X_train, X_test, _, X_raw_test, y_train, y_test = train_test_split(
    X_baseline, X, y, test_size=0.2, random_state=42
)

## 2. Train Baselines

In [None]:
# Both baselines follow the same recipe: fit on the ordinal-encoded training
# frame, predict the test frame, record precision. They never abstain, so
# their automation rate is fixed at 1.0.
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

baseline_preds = {}
for label, estimator in (("Logistic Regression", lr), ("Random Forest", rf)):
    print(f"Training {label}...")
    estimator.fit(X_train, y_train)
    baseline_preds[label] = estimator.predict(X_test)

# Keep the per-model prediction names available for ad-hoc inspection.
y_pred_lr = baseline_preds["Logistic Regression"]
y_pred_rf = baseline_preds["Random Forest"]

# One row per baseline, in training order (dicts preserve insertion order).
metrics = [
    {
        "Model": label,
        "Precision": precision_score(y_test, preds),
        "Automation Rate": 1.0,
    }
    for label, preds in baseline_preds.items()
]

## 3. Evaluate KAIROS (Full Stack)

In [None]:
# Load Pre-trained Artifact (fail early with a hint if it has not been generated)
MODEL_PATH = "../outputs/kairos_model"
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(
        f"Please run 'python main.py' first to generate {MODEL_PATH}"
    )

engine = KairosInferenceEngine.load(MODEL_PATH)
probs = engine.predict_calibrated(X_raw_test)

# Apply Risk Policy
policy = KairosPolicy(tau_low=0.2, tau_high=0.8)
# Coerce to an ndarray: the comparisons below must be element-wise and the
# resulting mask must index y_test by position (y_test keeps the shuffled
# index from the split, so a label-aligned mask would not match it).
decisions = np.asarray(policy.predict_with_policy(probs))
if len(decisions) != len(y_test):
    raise ValueError(
        f"Policy returned {len(decisions)} decisions for {len(y_test)} test rows"
    )

# Calculate Coverage Metrics
# Covered = every sample the policy did not abstain on; among those, "ACCEPT"
# is scored as the positive class and any other decision as the negative class.
covered_mask = decisions != "ABSTAIN"
y_covered = y_test[covered_mask]
preds_covered = np.where(decisions[covered_mask] == "ACCEPT", 1, 0)

# Precision is undefined when the policy abstains on everything; report NaN
# instead of scoring empty arrays.
if covered_mask.any():
    kairos_precision = precision_score(y_covered, preds_covered)
else:
    kairos_precision = float("nan")

kairos_row = {
    "Model": "KAIROS (Risk-Aware)",
    "Precision": kairos_precision,
    "Automation Rate": np.mean(covered_mask),
}

# Replace (rather than append) any earlier KAIROS row so that re-running this
# cell does not accumulate duplicate entries in the results table.
metrics = [row for row in metrics if row["Model"] != kairos_row["Model"]]
metrics.append(kairos_row)

results_df = pd.DataFrame(metrics).set_index("Model")
results_df

## 4. Visualization of Lift

In [None]:
# Bar chart of precision per model, saved for the docs and shown inline.
fig, ax1 = plt.subplots(figsize=(10, 6))

colors = ["#bdc3c7", "#95a5a6", "#2ecc71"]  # KAIROS gets green
results_df["Precision"].plot(kind="bar", ax=ax1, color=colors, alpha=0.8, rot=0)

ax1.set_ylabel("Precision Score", fontsize=12)
ax1.set_title("Precision Lift vs Industry Baselines", fontsize=14, pad=20)
ax1.set_ylim(0.5, 1.0)

# Annotate each bar with its precision as a percentage, just above the bar top.
for i, v in enumerate(results_df["Precision"]):
    ax1.text(i, v + 0.01, f"{v:.1%}", ha="center", fontweight="bold")

# Make sure the target directory exists so saving works on a fresh checkout.
chart_path = "../docs/assets/benchmark_chart.png"
os.makedirs(os.path.dirname(chart_path), exist_ok=True)

plt.tight_layout()
plt.savefig(chart_path)
plt.show()