In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Ridge, LogisticRegression, RidgeClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.svm import SVR
from xgboost import XGBRegressor, XGBClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score, roc_auc_score, accuracy_score

# --- Category mapping ---
# Macro-category name -> the "Source" tags that are grouped under it.
category_map = dict(
    Summaries=["summaries"],
    English=["english_1", "english_2"],
    Math=["math_1", "math_2"],
    Coding=["coding_1", "coding_2", "coding_3"],
    Reasoning=["reasoning_1", "reasoning_2", "reasoning_3"],
)

# Model identifiers. A model's position in this list is the position of its
# score inside every per-prompt score vector built below.
llm_names = [
    "openai/gpt-4o",
    "anthropic/claude-3.5-sonnet",
    "deepseek/deepseek-chat",
    "perplexity/sonar",
]

# --- Load Data ---
# NOTE: pickle.load can execute arbitrary code; only load pickles you produced
# yourself or otherwise trust.
with open("df_processed.pkl", "rb") as f:
    df_processed = pickle.load(f)
df_raw = pd.read_csv("ranked_responses_final.csv")

# Lookup Series: Prompt -> Source. De-duplicate on "Prompt" alone (first
# occurrence wins) so the index is guaranteed unique; Series.map cannot look
# values up through a Series whose index contains duplicates.
prompt_to_source = (
    df_raw[["Prompt", "Source"]]
    .drop_duplicates(subset="Prompt")
    .set_index("Prompt")["Source"]
)

# Attach the Source tag to every processed prompt and drop the rows for which
# no Source was found.
df_processed["Source"] = df_processed["prompt"].map(prompt_to_source)
df_processed = df_processed.dropna(subset=["Source"]).reset_index(drop=True)

# --- FIX: Ensure 'scores' is always a list of floats in the correct LLM order ---
def scores_to_list(val):
    """Normalize one raw ``scores`` entry to a list of floats ordered like ``llm_names``.

    Args:
        val: Either a dict keyed by LLM name, or a list/tuple/``np.ndarray``
            holding one score per entry of ``llm_names`` (already in that order).

    Returns:
        A list of ``len(llm_names)`` Python floats. A missing dict key, a
        ``None``/NaN/non-numeric value, a sequence of the wrong length, or any
        other input type yields ``0.0`` in the affected position(s).
    """
    n_llms = len(llm_names)

    def _as_float(raw):
        # Coerce a single score; anything that is not a usable number becomes
        # 0.0, the same placeholder used for a missing score.
        try:
            number = float(raw)
        except (TypeError, ValueError):
            return 0.0
        return 0.0 if np.isnan(number) else number

    if isinstance(val, dict):
        return [_as_float(val.get(name, 0.0)) for name in llm_names]
    if isinstance(val, (list, tuple, np.ndarray)) and len(val) == n_llms:
        return [_as_float(raw) for raw in val]
    return [0.0] * n_llms

# Run every row's raw "scores" entry through scores_to_list and store the
# result back in the same column.
normalized_scores = df_processed["scores"].apply(scores_to_list)
df_processed["scores"] = normalized_scores

# --- Models to Evaluate ---
# Base regressors, listed in evaluation order. Each one is wrapped in a
# MultiOutputRegressor when the dictionary is built.
_base_regressors = [
    ("Random Forest", RandomForestRegressor(n_estimators=200, max_depth=25, min_samples_leaf=5, random_state=42, n_jobs=-1)),
    ("XGBoost", XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.1, objective="reg:squarederror", random_state=42, n_jobs=-1)),
    ("MLP", MLPRegressor(hidden_layer_sizes=(256, 128), activation='relu', solver='adam', max_iter=300, random_state=42)),
    ("Ridge", Ridge(alpha=1.0)),
    ("SVR", SVR(kernel='rbf', C=1.0, epsilon=0.1)),
]
regression_models = {
    label: MultiOutputRegressor(estimator) for label, estimator in _base_regressors
}

# Classifiers that predict the best LLM for a prompt.
# NOTE(review): `multi_class` (LogisticRegression) and `use_label_encoder`
# (XGBClassifier) may be deprecated or ignored by recent library releases —
# confirm against the installed versions.
classifier_models = {}
classifier_models["Random Forest"] = RandomForestClassifier(n_estimators=150, max_depth=25, random_state=42, n_jobs=-1)
classifier_models["Logistic Regression"] = LogisticRegression(max_iter=250, multi_class="ovr")
classifier_models["XGBoost"] = XGBClassifier(n_estimators=150, max_depth=6, use_label_encoder=False, eval_metric="mlogloss")
classifier_models["MLP"] = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=250, random_state=42)
classifier_models["Ridge"] = RidgeClassifier()

# --- Evaluation Loop ---
# For every macro-category: take an 80/20 split of its prompts, fit each
# regressor on the per-LLM score vectors and each classifier on the index of
# the best-scoring LLM, then print the metrics.
n_llms = len(llm_names)

for category, sources in category_map.items():
    print(f"\n===== Category: {category} =====")

    # Filter category data
    df_cat = df_processed[df_processed["Source"].isin(sources)]
    n_samples = len(df_cat)
    if n_samples < 10:
        print(f"  [!] Not enough data for category {category} ({n_samples} samples), skipping.")
        continue

    X = np.vstack(df_cat["embedding"].values)
    y_scores = np.array(df_cat["scores"].tolist())
    if y_scores.ndim == 1:
        y_scores = y_scores.reshape(1, -1)
    y_best = np.argmax(y_scores, axis=1)  # Best LLM index for each example

    # Normalize X
    X_norm = normalize(X, norm="l2", axis=1)

    # Split. A stratified split is rejected with ValueError when a class is too
    # rare for the requested split sizes; fall back to a plain random split
    # instead of aborting the whole run.
    try:
        split = train_test_split(
            X_norm, y_scores, y_best, test_size=0.2, random_state=42, stratify=y_best
        )
    except ValueError as err:
        print(f"  [!] Stratified split failed ({err}); using an unstratified split.")
        split = train_test_split(
            X_norm, y_scores, y_best, test_size=0.2, random_state=42
        )
    X_train, X_test, y_train, y_test, yb_train, yb_test = split

    # Ground truth for the ranking metrics depends only on y_test, so compute
    # it once. The runner-up is the arg-max after masking out the winner:
    # taking np.argsort(...)[:, -2] can return the winner's own column when
    # the two highest scores are tied.
    true_best = np.argmax(y_test, axis=1)
    without_best = np.array(y_test, dtype=float)
    without_best[np.arange(len(without_best)), true_best] = -np.inf
    true_2nd = np.argmax(without_best, axis=1)

    print("---- Regression Models ----")
    for name, model in regression_models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Per-LLM MSE
        mse_per_llm = [
            mean_squared_error(y_test[:, i], y_pred[:, i])
            for i in range(y_scores.shape[1])
        ]

        # Top-1 ACC, Top-1-or-2 ACC
        pred_best = np.argmax(y_pred, axis=1)
        top1_acc = np.mean(pred_best == true_best)
        top1or2_acc = np.mean((pred_best == true_best) | (pred_best == true_2nd))

        print(f"  {name:>10}: Per-LLM MSE={np.round(mse_per_llm, 4)}, Top-1 Acc={top1_acc:.3f}, Top-1-or-2 Acc={top1or2_acc:.3f}")

    # ---- Classification Models ----
    print("---- Classifiers for Best LLM ----")
    # Fit on contiguous labels 0..k-1 so that a best-LLM index missing from the
    # training split neither trips estimators that require contiguous labels
    # nor shifts the probability columns; seen_classes maps back to LLM indices.
    seen_classes, yb_train_enc = np.unique(yb_train, return_inverse=True)
    if seen_classes.size < 2:
        print("  [!] Only one best-LLM class in the training split, skipping classifiers.")
        continue

    for name, clf in classifier_models.items():
        clf.fit(X_train, yb_train_enc)
        y_pred = seen_classes[np.asarray(clf.predict(X_test)).astype(int)]

        # zero_division=0 returns the same value as the default but without a
        # warning for every class that is never predicted.
        precision = precision_score(yb_test, y_pred, average='macro', zero_division=0)
        recall = recall_score(yb_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(yb_test, y_pred, average='macro', zero_division=0)
        acc = accuracy_score(yb_test, y_pred)

        # Macro-AUC (One-vs-Rest), if probabilities available. Averaged over
        # the classes that have both positives and negatives in the test
        # split, because AUC is undefined for the others.
        auc_text = "N/A"
        if hasattr(clf, "predict_proba"):
            y_prob = np.zeros((len(X_test), n_llms))
            y_prob[:, seen_classes] = clf.predict_proba(X_test)
            per_class_auc = [
                roc_auc_score(yb_test == k, y_prob[:, k])
                for k in range(n_llms)
                if 0 < np.sum(yb_test == k) < len(yb_test)
            ]
            if per_class_auc:
                auc_text = f"{np.mean(per_class_auc):.3f}"
        print(f"  {name:>18}: Acc={acc:.3f}, Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}, AUC={auc_text}")

print("\n[Done]")


In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

# === 1. Load and prepare the data ===
# NOTE: pickle.load can execute arbitrary code; only load pickles you produced
# yourself or otherwise trust.
with open("df_processed.pkl", "rb") as f:
    df_processed = pickle.load(f)

df_raw = pd.read_csv("ranked_responses_final.csv")
# Lookup Series: Prompt -> Source. De-duplicate on "Prompt" alone (first
# occurrence wins) so the index is guaranteed unique; Series.map cannot look
# values up through a Series whose index contains duplicates.
prompt_to_source = (
    df_raw[["Prompt", "Source"]]
    .drop_duplicates(subset="Prompt")
    .set_index("Prompt")["Source"]
)
# Attach the Source tag and drop prompts for which none was found.
df_processed["Source"] = df_processed["prompt"].map(prompt_to_source)
df_processed = df_processed.dropna(subset=["Source"]).reset_index(drop=True)

# Column order of the score matrix Y.
llm_keys = [
    "openai/gpt-4o",
    "anthropic/claude-3.5-sonnet",
    "deepseek/deepseek-chat",
    "perplexity/sonar",
]

# Features: the per-row embeddings stacked into one matrix, then each row
# scaled to unit L2 norm.
X = np.vstack(df_processed["embedding"].to_numpy())
X_norm = normalize(X, norm="l2", axis=1)

# Targets: one column per entry of llm_keys.
# NOTE(review): unlike a normalizing helper, nothing here fills missing or
# malformed score entries — confirm the pickle already holds clean scores.
y_scores_df = pd.DataFrame(df_processed["scores"].tolist(), columns=llm_keys)
Y = y_scores_df.to_numpy()

# Macro-category mapping: macro-category name -> "Source" tags grouped under it.
category_map = {
    "Summaries": ["summaries"],
    "English": ["english_1", "english_2"],
    "Math": ["math_1", "math_2"],
    "Coding": ["coding_1", "coding_2", "coding_3"],
    "Reasoning": ["reasoning_1", "reasoning_2", "reasoning_3"],
}

# Per-category bookkeeping. macro_labels is filled one category at a time, so
# it is grouped by category and NOT aligned with the rows of df_processed; the
# row-aligned labels used for the stratified split are built further down.
macro_names = list(category_map)
macro_labels = []
for macro_cat in macro_names:
    n_in_cat = int(df_processed["Source"].isin(category_map[macro_cat]).sum())
    macro_labels += [macro_cat] * n_in_cat

# Row-aligned macro-category label; rows whose Source matches no category
# keep the initial None.
df_processed["Macro"] = None
for macro_cat, subcats in category_map.items():
    belongs = df_processed["Source"].isin(subcats)
    df_processed.loc[belongs, "Macro"] = macro_cat
macro_label_array = df_processed["Macro"].values

# Split once for global router/classifier: 80/20, stratified by macro-category.
_router_split = train_test_split(
    X_norm,
    Y,
    macro_label_array,
    test_size=0.20,
    stratify=macro_label_array,
    random_state=42,
)
X_train, X_test, Y_train, Y_test, macro_train, macro_test = _router_split

# Train regression router (global): one random forest per score column.
_router_forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)
rf = MultiOutputRegressor(_router_forest)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)

# === Load or train global MLP binary classifier ===
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize  # re-imported so this section is self-contained

# NOTE: pickle.load can execute arbitrary code; only load trusted pickles.
with open("df_pairwise_v2.pkl", "rb") as f:
    df_pairwise = pickle.load(f)

# L2-normalize the embeddings exactly like the router features: at evaluation
# time the classifier is fed rows of X_test, which come from the normalized
# matrix, so training on raw embeddings would be a train/inference mismatch.
X_embed_cls = normalize(np.vstack(df_pairwise["embedding"].values), norm="l2", axis=1)

# One-hot encode which two LLMs are being compared and append that to the
# embedding to form the classifier input.
encoder_cls = OneHotEncoder(sparse_output=False)
X_cat_cls = encoder_cls.fit_transform(df_pairwise[["llm_A", "llm_B"]])
X_cls = np.hstack([X_embed_cls, X_cat_cls])
y_cls = df_pairwise["label"].values

# NOTE(review): this split is drawn independently of the router's train/test
# split. If df_pairwise is built from the same prompts, the MLP may be trained
# on prompts that are later used for evaluation — confirm and, if so, split by
# prompt instead.
X_train_cls, X_val_cls, y_train_cls, y_val_cls = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)
mlp = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=300, random_state=42)
mlp.fit(X_train_cls, y_train_cls)

# Full model identifier -> short name handed to the pairwise encoder.
key_to_short = dict([
    ("openai/gpt-4o", "gpt-4o"),
    ("anthropic/claude-3.5-sonnet", "claude"),
    ("deepseek/deepseek-chat", "deepseek"),
    ("perplexity/sonar", "perplexity"),
])

# === Per macro-category evaluation ===
# Routing rule for one test prompt and threshold tau:
#   * rank the LLMs by predicted score; gap = top-1 score - top-2 score
#   * gap >= tau -> pick the top-1 LLM
#   * gap <  tau -> route to {top-1, top-2} and let the pairwise MLP choose
# "Coverage" is how often the truly best LLM is inside the routed set;
# "Overall Selection" is how often the final pick is the truly best LLM.
thresholds = np.arange(0.01, 0.21, 0.01)
category_results = {}

# Column index -> short name passed to encoder_cls.
short_names = [key_to_short[key] for key in llm_keys]

for macro_cat, subcats in category_map.items():
    idx = [i for i, c in enumerate(macro_test) if c == macro_cat]
    if not idx:
        print(f"[!] No test samples for macro-category {macro_cat}, skipping.")
        continue
    print(f"\n=== Macro-category: {macro_cat} ({len(idx)} samples) ===")

    # Everything except the comparison `gap < tau` is independent of tau, so
    # it is computed once per category instead of once per (tau, prompt).
    preds = Y_pred[idx]
    true_best = np.argmax(Y_test[idx], axis=1)
    ranked = np.argsort(preds, axis=1)[:, ::-1]  # best predicted first
    top1, top2 = ranked[:, 0], ranked[:, 1]
    rows = np.arange(len(idx))
    gap = preds[rows, top1] - preds[rows, top2]

    # Pairwise-classifier pick, needed only for prompts that get routed to two
    # LLMs under at least one threshold. All of them are scored in one batch.
    clf_pick = top1.copy()
    ambiguous = np.flatnonzero(gap < thresholds.max())
    if ambiguous.size:
        pairs = pd.DataFrame({
            "llm_A": [short_names[i] for i in top1[ambiguous]],
            "llm_B": [short_names[i] for i in top2[ambiguous]],
        })
        mlp_inputs = np.hstack([X_test[idx][ambiguous], encoder_cls.transform(pairs)])
        # Column 1 of predict_proba is read as "llm_A wins", as in the
        # per-sample version of this code.
        prob_a_wins = mlp.predict_proba(mlp_inputs)[:, 1]
        clf_pick[ambiguous] = np.where(prob_a_wins >= 0.5, top1[ambiguous], top2[ambiguous])

    routed_accuracies = []
    selection_accuracies = []
    for tau in thresholds:
        use_classifier = gap < tau
        # The best LLM is covered if it is top-1, or top-2 when top-2 is routed.
        covered = (top1 == true_best) | (use_classifier & (top2 == true_best))
        selected = np.where(use_classifier, clf_pick, top1)

        routed_acc = np.mean(covered)
        overall_acc = np.mean(selected == true_best)
        routed_accuracies.append(routed_acc)
        selection_accuracies.append(overall_acc)
        print(f"  tau={tau:.2f}: Coverage={routed_acc:.3f}, Overall Selection={overall_acc:.3f}")

    # Save and plot per macro-category
    category_results[macro_cat] = pd.DataFrame({
        "Threshold": thresholds,
        "Coverage Accuracy": routed_accuracies,
        "Overall Selection Accuracy": selection_accuracies
    })

    plt.figure(figsize=(7, 4))
    plt.plot(thresholds, routed_accuracies, marker='o', label="Coverage Accuracy")
    plt.plot(thresholds, selection_accuracies, marker='s', label="Overall Selection Accuracy")
    plt.xlabel("Gap Threshold ($\\tau$)")
    plt.ylabel("Accuracy")
    plt.title(f"Macro-Category: {macro_cat} — Confidence-Based Routing")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# category_results["Coding"], etc. now hold each macro-category's table


In [None]:
import matplotlib.pyplot as plt

# Overlay the "Overall Selection Accuracy" curve of every macro-category on a
# single figure, save it as a PDF and display it.
plt.figure(figsize=(9, 6))

curve_style = {"marker": 'o', "linewidth": 2, "markersize": 7}
for category_name, table in category_results.items():
    plt.plot(
        table["Threshold"],
        table["Overall Selection Accuracy"],
        label=category_name,
        **curve_style,
    )

# Vertical line at tau=0.12
plt.axvline(x=0.12, color='gray', linestyle='--', linewidth=2)

# Axis labels and tick labels
plt.xlabel("Gap Threshold ($\\tau$)", fontsize=18)
plt.ylabel("Overall Selection Accuracy", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

plt.legend(title="Category", fontsize=15, title_fontsize=16)
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Write the PDF before show()
plt.savefig("macro_category_selection_accuracy.pdf")
plt.show()


In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

# === 1. Load and prepare the data ===
# NOTE: pickle.load can execute arbitrary code; only load pickles you produced
# yourself or otherwise trust.
with open("df_processed.pkl", "rb") as f:
    df_processed = pickle.load(f)

df_raw = pd.read_csv("ranked_responses_final.csv")
# Lookup Series: Prompt -> Source. De-duplicate on "Prompt" alone (first
# occurrence wins) so the index is guaranteed unique; Series.map cannot look
# values up through a Series whose index contains duplicates.
prompt_to_source = (
    df_raw[["Prompt", "Source"]]
    .drop_duplicates(subset="Prompt")
    .set_index("Prompt")["Source"]
)
# Attach the Source tag and drop prompts for which none was found.
df_processed["Source"] = df_processed["prompt"].map(prompt_to_source)
df_processed = df_processed.dropna(subset=["Source"]).reset_index(drop=True)

# Score-matrix column order.
llm_keys = [
    "openai/gpt-4o",
    "anthropic/claude-3.5-sonnet",
    "deepseek/deepseek-chat",
    "perplexity/sonar",
]

# Stack the per-row embeddings into a matrix and scale each row to unit L2 norm.
X = np.vstack(df_processed["embedding"].to_numpy())
X_norm = normalize(X, norm="l2", axis=1)

# Score targets, one column per entry of llm_keys.
# NOTE(review): score entries are used as stored in the pickle; nothing here
# fills missing values — confirm they are complete.
y_scores_df = pd.DataFrame(df_processed["scores"].tolist(), columns=llm_keys)
Y = y_scores_df.to_numpy()

# Macro-category mapping: macro-category name -> "Source" tags grouped under it.
category_map = dict(
    Summaries=["summaries"],
    English=["english_1", "english_2"],
    Math=["math_1", "math_2"],
    Coding=["coding_1", "coding_2", "coding_3"],
    Reasoning=["reasoning_1", "reasoning_2", "reasoning_3"],
)

# Map Source to macro-category. Rows whose Source matches no category keep
# the initial None.
df_processed["Macro"] = None
for macro_cat, subcats in category_map.items():
    in_category = df_processed["Source"].isin(subcats)
    df_processed.loc[in_category, "Macro"] = macro_cat

# Row-aligned labels, used to stratify the train/test split.
macro_label_array = df_processed["Macro"].values

# Train/test split: 80/20, stratified by macro-category.
_router_split = train_test_split(
    X_norm,
    Y,
    macro_label_array,
    test_size=0.20,
    stratify=macro_label_array,
    random_state=42,
)
X_train, X_test, Y_train, Y_test, macro_train, macro_test = _router_split

# Train regressor: one random forest per score column.
_router_forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=25,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)
rf = MultiOutputRegressor(_router_forest)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)

# Load pairwise classifier
from sklearn.preprocessing import normalize  # re-imported so this section is self-contained

# NOTE: pickle.load can execute arbitrary code; only load trusted pickles.
with open("df_pairwise_v2.pkl", "rb") as f:
    df_pairwise = pickle.load(f)

# L2-normalize the embeddings exactly like the router features: at evaluation
# time the classifier is fed rows of X_test, which come from the normalized
# matrix, so training on raw embeddings would be a train/inference mismatch.
X_embed_cls = normalize(np.vstack(df_pairwise["embedding"].values), norm="l2", axis=1)

# One-hot encode which two LLMs are being compared and append that to the
# embedding to form the classifier input.
encoder_cls = OneHotEncoder(sparse_output=False)
X_cat_cls = encoder_cls.fit_transform(df_pairwise[["llm_A", "llm_B"]])
X_cls = np.hstack([X_embed_cls, X_cat_cls])
y_cls = df_pairwise["label"].values

# NOTE(review): this split is drawn independently of the router's train/test
# split. If df_pairwise is built from the same prompts, the MLP may be trained
# on prompts that are later used for evaluation — confirm and, if so, split by
# prompt instead.
X_train_cls, X_val_cls, y_train_cls, y_val_cls = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)
mlp = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=300, random_state=42)
mlp.fit(X_train_cls, y_train_cls)

# Full model identifier -> short name handed to the pairwise encoder.
key_to_short = dict(zip(
    ("openai/gpt-4o", "anthropic/claude-3.5-sonnet", "deepseek/deepseek-chat", "perplexity/sonar"),
    ("gpt-4o", "claude", "deepseek", "perplexity"),
))

# === Per macro-category evaluation ===
# Routing rule for one test prompt and threshold tau:
#   * rank the LLMs by predicted score; gap = top-1 score - top-2 score
#   * gap >= tau -> pick the top-1 LLM
#   * gap <  tau -> route to {top-1, top-2} and let the pairwise MLP choose
# "Coverage" is how often the truly best LLM is inside the routed set,
# "Overall Selection" is how often the final pick is the truly best LLM, and
# "Classifier Usage" is the fraction of prompts sent to the pairwise MLP.
thresholds = np.arange(0.01, 0.21, 0.01)
category_results = {}

# Column index -> short name passed to encoder_cls.
short_names = [key_to_short[key] for key in llm_keys]

for macro_cat, subcats in category_map.items():
    idx = [i for i, c in enumerate(macro_test) if c == macro_cat]
    if not idx:
        print(f"[!] No test samples for macro-category {macro_cat}, skipping.")
        continue
    print(f"\n=== Macro-category: {macro_cat} ({len(idx)} samples) ===")

    # Everything except the comparison `gap < tau` is independent of tau, so
    # it is computed once per category instead of once per (tau, prompt).
    preds = Y_pred[idx]
    true_best = np.argmax(Y_test[idx], axis=1)
    ranked = np.argsort(preds, axis=1)[:, ::-1]  # best predicted first
    top1, top2 = ranked[:, 0], ranked[:, 1]
    rows = np.arange(len(idx))
    gap = preds[rows, top1] - preds[rows, top2]

    # Pairwise-classifier pick, needed only for prompts that get routed to two
    # LLMs under at least one threshold. All of them are scored in one batch.
    clf_pick = top1.copy()
    ambiguous = np.flatnonzero(gap < thresholds.max())
    if ambiguous.size:
        pairs = pd.DataFrame({
            "llm_A": [short_names[i] for i in top1[ambiguous]],
            "llm_B": [short_names[i] for i in top2[ambiguous]],
        })
        mlp_inputs = np.hstack([X_test[idx][ambiguous], encoder_cls.transform(pairs)])
        # Column 1 of predict_proba is read as "llm_A wins", as in the
        # per-sample version of this code.
        prob_a_wins = mlp.predict_proba(mlp_inputs)[:, 1]
        clf_pick[ambiguous] = np.where(prob_a_wins >= 0.5, top1[ambiguous], top2[ambiguous])

    routed_accuracies = []
    selection_accuracies = []
    classifier_usages = []

    for tau in thresholds:
        use_classifier = gap < tau
        # The best LLM is covered if it is top-1, or top-2 when top-2 is routed.
        covered = (top1 == true_best) | (use_classifier & (top2 == true_best))
        selected = np.where(use_classifier, clf_pick, top1)

        routed_acc = np.mean(covered)
        overall_acc = np.mean(selected == true_best)
        classifier_usage = np.mean(use_classifier)

        routed_accuracies.append(routed_acc)
        selection_accuracies.append(overall_acc)
        classifier_usages.append(classifier_usage)

        print(f"  tau={tau:.2f}: Coverage={routed_acc:.3f}, Overall Selection={overall_acc:.3f}, Classifier Usage={classifier_usage:.3f}")

    category_results[macro_cat] = pd.DataFrame({
        "Threshold": thresholds,
        "Coverage Accuracy": routed_accuracies,
        "Overall Selection Accuracy": selection_accuracies,
        "Classifier Usage": classifier_usages
    })

    # Optional plot
    plt.figure(figsize=(7, 4))
    plt.plot(thresholds, routed_accuracies, marker='o', label="Coverage Accuracy")
    plt.plot(thresholds, selection_accuracies, marker='s', label="Overall Selection Accuracy")
    plt.plot(thresholds, classifier_usages, marker='^', label="Classifier Usage", linestyle='--')
    plt.xlabel("Gap Threshold ($\\tau$)")
    plt.ylabel("Metric Value")
    plt.title(f"Macro-Category: {macro_cat} — Confidence-Based Routing")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# You can now access category_results["Reasoning"], etc. for tables
