In [1]:
import pandas as pd
import re
import openai
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import tabulate
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score, 
                             confusion_matrix, accuracy_score, precision_score, recall_score, f1_score)
from tqdm.auto import tqdm
import joblib
from itertools import product
from sklearn.ensemble import RandomForestRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pickle
import pandas as pd
from itertools import combinations

# === Load processed embedding-based data ===
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load pickles that were produced by this project's own pipeline.
with open("df_processed.pkl", "rb") as f:
    df_processed = pickle.load(f)

# === Load original raw data to map Prompt → Source ===
df_raw = pd.read_csv("ranked_responses_final.csv")

# Series.map() with a Series argument looks values up in that Series' index and
# requires the index to be unique. De-duplicating on the (Prompt, Source) pair
# is not enough: a prompt listed under two different sources would still appear
# twice and make the lookup below raise. Keep exactly one row per Prompt
# (the first occurrence) so the lookup table is always valid.
prompt_to_source = (
    df_raw[["Prompt", "Source"]]
    .drop_duplicates(subset="Prompt")
    .set_index("Prompt")["Source"]
)

# === Add raw Source column to df_processed ===
# Prompts that have no entry in the lookup table get NaN as their Source.
df_processed["Source"] = df_processed["prompt"].map(prompt_to_source)

# === Category grouping: general label -> raw Source values that belong to it ===
category_map = dict(
    Summaries=['summaries'],
    English=['english_1', 'english_2'],
    Math=['math_1', 'math_2'],
    Coding=['coding_1', 'coding_2', 'coding_3'],
    Reasoning=['reasoning_1', 'reasoning_2', 'reasoning_3'],
)

# === Invert the grouping into a flat raw-source -> general-label lookup
source_remap = dict(
    (raw_name, group_name)
    for group_name, raw_names in category_map.items()
    for raw_name in raw_names
)

# === Replace raw sources with their general label; raw values missing from the
# === lookup become NaN and those rows are removed, then the index is rebuilt
df_processed["Source"] = df_processed["Source"].map(source_remap)
df_processed = (
    df_processed
    .loc[df_processed["Source"].notna()]
    .reset_index(drop=True)
)

# === LLM keys and names
# llm_keys are the keys looked up in each row's `scores` mapping; llm_names are
# the short labels written into the pairwise records. The two lists are
# index-aligned: llm_names[i] is the label for llm_keys[i].
llm_keys = [
    "openai/gpt-4o",
    "anthropic/claude-3.5-sonnet",
    "deepseek/deepseek-chat",
    "perplexity/sonar"
]
llm_names = ["gpt-4o", "claude", "deepseek", "perplexity"]

# The pair indices below are used to index both lists, so they must line up.
if len(llm_keys) != len(llm_names):
    raise ValueError("llm_keys and llm_names must have the same length")

pairwise_data = []

# === Create pairwise dataset with score_diff and mapped Source ===
# Iterate the three needed columns directly instead of DataFrame.iterrows(),
# which builds a full Series for every row just to read three fields.
for embedding, scores, source in zip(
    df_processed["embedding"], df_processed["scores"], df_processed["Source"]
):
    scores_ordered = [scores[k] for k in llm_keys]

    # Derive the pair indices from the model list rather than a hard-coded 4,
    # so adding or removing a model only requires editing the lists above.
    for i, j in combinations(range(len(llm_keys)), 2):
        score_i, score_j = scores_ordered[i], scores_ordered[j]

        if score_i == score_j:
            continue  # skip ties

        # Emit both orderings of the pair: first (i, j), then the mirrored
        # (j, i) record, whose label and score_diff are computed the same way
        # with the roles swapped.
        for a, b in ((i, j), (j, i)):
            pairwise_data.append({
                "embedding": embedding,
                "llm_A": llm_names[a],
                "llm_B": llm_names[b],
                "label": int(scores_ordered[a] > scores_ordered[b]),
                "score_diff": scores_ordered[a] - scores_ordered[b],
                "Source": source
            })

# === Create final DataFrame
df_pairwise_v2 = pd.DataFrame(pairwise_data)

# === Show a few samples
# DataFrame.sample(5) raises ValueError when the frame has fewer than 5 rows,
# so cap the sample size at the number of rows available. A fixed random_state
# makes the preview identical on every run.
print("\n=== Sample of Enhanced Pairwise Dataset ===")
print(df_pairwise_v2.sample(n=min(5, len(df_pairwise_v2)), random_state=42))

# === Save to disk
with open("df_pairwise_v2.pkl", "wb") as f:
    pickle.dump(df_pairwise_v2, f)

print("\n✅ Enhanced pairwise dataset saved as 'df_pairwise_v2.pkl'")


In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier

# === Load dataset ===
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load pickles that were produced by this project's own pipeline.
with open("df_pairwise_v2.pkl", "rb") as f:
    df_pairwise = pickle.load(f)

# === Extract prompt embeddings
# Stack the per-row embedding vectors into one 2-D array (one row per sample).
X_embed = np.vstack(df_pairwise["embedding"].values)

# === One-hot encode llm_A and llm_B
encoder = OneHotEncoder(sparse_output=False)
X_cat = encoder.fit_transform(df_pairwise[["llm_A", "llm_B"]])

# === Final feature matrix
X = np.hstack([X_embed, X_cat])
y = df_pairwise["label"].values

# === Train/test split (grouped by prompt embedding)
# Every prompt contributes several rows that share the same embedding,
# including mirrored (A, B) / (B, A) pairs whose labels are exact opposites.
# A plain row-level split scatters those rows across train and test, so the
# model is evaluated on near-copies of what it was trained on. Splitting by
# group keeps all rows with the same embedding on the same side.
from sklearn.model_selection import GroupShuffleSplit

# Rows with byte-identical embeddings get the same integer group id.
groups = pd.factorize(pd.Series([vec.tobytes() for vec in X_embed]))[0]

# With GroupShuffleSplit, test_size is the proportion of *groups* held out.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=groups))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# === Candidate classifiers, keyed by the display name used in the report.
# Insertion order is the order in which the models are later iterated.
classifiers = {}
classifiers["Random Forest"] = RandomForestClassifier(
    n_estimators=200,
    max_depth=25,
    random_state=42,
)
classifiers["Logistic Regression"] = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
classifiers["XGBoost"] = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
classifiers["MLP"] = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    max_iter=300,
    random_state=42,
)
classifiers["Ridge Classifier"] = RidgeClassifier()
classifiers["SVC"] = SVC(
    kernel='rbf',
    probability=True,
    random_state=42,
)

# === Evaluate each model
# Fit every classifier on the training split and print one row of test-set
# metrics per model: accuracy, precision, recall, F1 and ROC AUC.
print(f"{'Model':20s}  Acc   Prec  Recall   F1     AUC")
print("-" * 60)

for name, model in classifiers.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC needs a continuous ranking score, not thresholded labels.
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of the second class (label 1 here).
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # Models without predict_proba (e.g. RidgeClassifier) still expose a
        # signed decision score, which roc_auc_score accepts directly.
        y_prob = model.decision_function(X_test)
    else:
        # Last resort: hard predictions. The resulting AUC reflects a single
        # operating point only and understates ranking quality.
        y_prob = y_pred

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    print(f"{name:20s}  {acc:.3f}  {prec:.3f}  {rec:.3f}   {f1:.3f}  {auc:.3f}")
