# Who is Judging the Judges?

### Binary Classification Judges

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, cohen_kappa_score, mean_absolute_error, mean_squared_error
)

from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the expert verdicts and the two binary-judge outputs from disk.
expert_df, judge_a_df, judge_b_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'expert_eval/expert_eval_20250622_230830.csv',
        'llm_eval/binary_class_judge_output_A.csv',
        'llm_eval/binary_class_judge_output_B.csv',
    )
)

# Report how many rows each source contributed.
print("Data loaded successfully:")
print("\n".join(
    f"- {source_label}: {len(frame)} rows"
    for source_label, frame in (
        ("Expert evaluations", expert_df),
        ("Judge A predictions", judge_a_df),
        ("Judge B predictions", judge_b_df),
    )
))
print()

In [None]:
# Data preprocessing - ensure boolean values are properly handled
def convert_to_bool(df, col_name):
    """Return ``df`` with ``col_name`` converted from textual booleans to real booleans.

    Only object- or string-typed columns are converted; a column of any other
    dtype is returned untouched (the same ``df`` object is returned).

    Matching is case-insensitive and ignores surrounding whitespace, so
    ``'True'``, ``'true'`` and ``' TRUE '`` all become ``True``. Actual
    ``True``/``False`` objects mixed into the column are accepted as well.
    Missing values are left as missing.

    Args:
        df: DataFrame containing the column to convert.
        col_name: Name of the column to convert.

    Returns:
        ``df`` itself when no conversion is needed, otherwise a copy of ``df``
        with the converted column. The input frame is never modified, so
        callers must use the return value.

    Raises:
        ValueError: If the column holds a non-missing value that is neither a
            spelling of "true" nor of "false". Failing here avoids silently
            turning such values into NaN and corrupting later metrics.
    """
    column = df[col_name]
    # Cover both the classic ``object`` dtype and pandas' dedicated string dtypes.
    is_textual = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if not is_textual:
        return df

    # str() also turns real True/False objects into 'True'/'False', so mixed
    # columns go through the same lookup as pure string columns.
    normalized = column.astype(str).str.strip().str.lower()
    converted = normalized.map({'true': True, 'false': False})

    # A value that was present in the input but has no mapping is a data error.
    unrecognized = converted.isna() & column.notna()
    if unrecognized.any():
        examples = column[unrecognized].unique()[:5].tolist()
        raise ValueError(
            f"Column '{col_name}' contains values that are not booleans: {examples}"
        )

    # With nothing missing the column can be a proper bool column.
    if not converted.isna().any():
        converted = converted.astype(bool)

    result = df.copy()
    result[col_name] = converted
    return result

# Normalise the verdict column of each frame to real booleans; the expert file
# stores its verdict under 'evaluation', the judge files under 'label'.
expert_df, judge_a_df, judge_b_df = (
    convert_to_bool(frame, verdict_column)
    for frame, verdict_column in (
        (expert_df, 'evaluation'),
        (judge_a_df, 'label'),
        (judge_b_df, 'label'),
    )
)

In [None]:
# Verify data alignment by turn_id
print("Verifying data alignment...")

# Compare the sorted id *values* rather than two sorted Series: sort_values()
# keeps the original row labels, and `Series == Series` raises
# "Can only compare identically-labeled Series objects" as soon as the files
# are in a different row order or have different lengths.
expert_turn_ids = np.sort(expert_df['turn_id'].to_numpy())
assert np.array_equal(expert_turn_ids, np.sort(judge_a_df['turn_id'].to_numpy())), "Turn IDs don't match between expert and Judge A"
assert np.array_equal(expert_turn_ids, np.sort(judge_b_df['turn_id'].to_numpy())), "Turn IDs don't match between expert and Judge B"
print("✓ All datasets have matching turn_ids")
print()

# Sort all dataframes by turn_id so that row i refers to the same turn in each
expert_df = expert_df.sort_values('turn_id').reset_index(drop=True)
judge_a_df = judge_a_df.sort_values('turn_id').reset_index(drop=True)
judge_b_df = judge_b_df.sort_values('turn_id').reset_index(drop=True)


In [None]:
# Put every frame in turn_id order so that position i is the same turn everywhere.
expert_df, judge_a_df, judge_b_df = (
    frame.sort_values('turn_id').reset_index(drop=True)
    for frame in (expert_df, judge_a_df, judge_b_df)
)

# Expert verdicts are the ground truth; each judge's labels are its predictions.
y_true = expert_df['evaluation'].values
y_pred_a = judge_a_df['label'].values
y_pred_b = judge_b_df['label'].values

# Summarise how balanced the ground-truth classes are.
n_expert_total = len(y_true)
n_expert_positive = sum(y_true)
n_expert_negative = n_expert_total - n_expert_positive

print("Class Distribution in Ground Truth:")
print(f"True (Positive): {n_expert_positive} ({n_expert_positive/n_expert_total*100:.1f}%)")
print(f"False (Negative): {n_expert_negative} ({n_expert_negative/n_expert_total*100:.1f}%)")
print()

In [None]:
def calculate_metrics(y_true, y_pred, judge_name):
    """Compute binary-classification agreement metrics for one judge.

    Args:
        y_true: Ground-truth boolean labels.
        y_pred: The judge's boolean labels, in the same order as ``y_true``.
        judge_name: Label for the judge. Not used in the computation; kept so
            existing call sites remain valid.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``cohen_kappa``, ``agreement_rate``, ``true_positives``,
        ``false_positives``, ``true_negatives``, ``false_negatives`` and
        ``specificity``.
    """
    metrics = {}

    # Basic metrics; zero_division=0 reports 0 instead of warning when a
    # denominator is empty (e.g. the judge never predicts the positive class).
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_true, y_pred, zero_division=0)

    # Inter-rater agreement
    metrics['cohen_kappa'] = cohen_kappa_score(y_true, y_pred)

    # Simple agreement rate (fraction of positions where the labels coincide)
    metrics['agreement_rate'] = np.mean(y_true == y_pred)

    # Confusion matrix components. The label set is pinned so the matrix is
    # always 2x2; without it, data in which only one class occurs yields a 1x1
    # matrix and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[False, True]).ravel()
    metrics['true_positives'] = tp
    metrics['false_positives'] = fp
    metrics['true_negatives'] = tn
    metrics['false_negatives'] = fn

    # Specificity (True Negative Rate); reported as 0 when there are no
    # ground-truth negatives at all.
    metrics['specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0

    return metrics

# Score each judge against the expert labels.
metrics_a, metrics_b = (
    calculate_metrics(y_true, judge_predictions, judge_label)
    for judge_predictions, judge_label in (
        (y_pred_a, "Judge A"),
        (y_pred_b, "Judge B"),
    )
)

In [None]:
print("EVALUATION RESULTS")
print("=" * 50)

# Side-by-side table: one row per headline metric, one column per judge.
# Each entry pairs the display label with its key in the metrics dicts.
headline_metrics = (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1 Score', 'f1'),
    ("Cohen's Kappa", 'cohen_kappa'),
    ('Specificity', 'specificity'),
)
comparison_df = pd.DataFrame({
    'Metric': [display_name for display_name, metric_key in headline_metrics],
    'Judge A': [f"{metrics_a[metric_key]:.3f}" for display_name, metric_key in headline_metrics],
    'Judge B': [f"{metrics_b[metric_key]:.3f}" for display_name, metric_key in headline_metrics],
})

print("Performance Comparison:")
print(comparison_df.to_string(index=False))
print()

# Per-judge raw agreement and confusion-matrix counts.
print("DETAILED BREAKDOWN")
print("=" * 30)

for judge_name, metrics in (("Judge A", metrics_a), ("Judge B", metrics_b)):
    breakdown_lines = (
        f"\n{judge_name}:",
        f"  Agreement Rate: {metrics['agreement_rate']:.1%}",
        f"  True Positives:  {metrics['true_positives']}",
        f"  False Positives: {metrics['false_positives']}",
        f"  True Negatives:  {metrics['true_negatives']}",
        f"  False Negatives: {metrics['false_negatives']}",
    )
    print("\n".join(breakdown_lines))

### Continuous Scale Judges

In [None]:
# Load the data
expert_df = pd.read_csv('expert_eval/likert_eval_20250623_081631.csv')
judge_a_df = pd.read_csv('llm_eval/continuous_judge_output_A.csv')
judge_b_df = pd.read_csv('llm_eval/continuous_judge_output_B.csv')

print(f"Data loaded successfully:")
print(f"- Expert Likert evaluations: {len(expert_df)} rows")
print(f"- Judge A predictions: {len(judge_a_df)} rows")
print(f"- Judge B predictions: {len(judge_b_df)} rows")
print()

# Verify data alignment
print("Verifying data alignment...")

# Compare the sorted id *values* rather than two sorted Series: sort_values()
# keeps the original row labels, and `Series == Series` raises
# "Can only compare identically-labeled Series objects" as soon as the files
# are in a different row order or have different lengths.
expert_turn_ids = np.sort(expert_df['turn_id'].to_numpy())
assert np.array_equal(expert_turn_ids, np.sort(judge_a_df['turn_id'].to_numpy())), "Turn IDs don't match between expert and Judge A"
assert np.array_equal(expert_turn_ids, np.sort(judge_b_df['turn_id'].to_numpy())), "Turn IDs don't match between expert and Judge B"
print("✓ All datasets have matching turn_ids")
print()

In [None]:
# Put every frame in turn_id order so that position i is the same turn everywhere.
expert_df, judge_a_df, judge_b_df = (
    frame.sort_values('turn_id').reset_index(drop=True)
    for frame in (expert_df, judge_a_df, judge_b_df)
)

# Pull the rating column out of each frame as a plain array.
expert_ratings = expert_df['likert_rating'].values
judge_a_ratings = judge_a_df['label'].values
judge_b_ratings = judge_b_df['label'].values

# Show the observed min/max per rater.
print("Rating Scale Validation:")
print("\n".join(
    f"{rater} ratings range: {scores.min()} - {scores.max()}"
    for rater, scores in (
        ("Expert", expert_ratings),
        ("Judge A", judge_a_ratings),
        ("Judge B", judge_b_ratings),
    )
))

# Every rating from every rater must lie on the 1-5 scale.
all_ratings = np.concatenate([expert_ratings, judge_a_ratings, judge_b_ratings])
ratings_on_scale = ((all_ratings >= 1) & (all_ratings <= 5)).all()
if ratings_on_scale:
    print("✓ All ratings are within the expected 1-5 Likert scale")
else:
    print("⚠ WARNING: Some ratings are outside the 1-5 Likert scale range!")
print()

In [None]:
# === RATING DISTRIBUTION ANALYSIS ===

# Section banner: title followed by a 40-character rule.
print("RATING DISTRIBUTION ANALYSIS", "=" * 40, sep="\n")

def analyze_distribution(ratings, name):
    """Print a per-level count table and summary statistics for ``ratings``.

    The table lists the levels 1 through 5 with their count and share of all
    ratings, followed by the mean, median and standard deviation.

    Args:
        ratings: Sequence of ratings to summarise.
        name: Rater label used in the printed heading.

    Returns:
        pandas Series of value counts, indexed by rating in ascending order.
    """
    counts_by_level = pd.Series(ratings).value_counts().sort_index()
    n_ratings = len(ratings)

    print(f"\n{name} Distribution:")
    for level in range(1, 6):
        # Levels that never occur are reported with a count of zero.
        n_at_level = counts_by_level.get(level, 0)
        share = (n_at_level / n_ratings) * 100
        print(f"  {level}: {n_at_level} ({share:.1f}%)")

    mean_rating = np.mean(ratings)
    median_rating = np.median(ratings)
    spread = np.std(ratings)
    print(f"  Mean: {mean_rating:.2f}")
    print(f"  Median: {median_rating:.1f}")
    print(f"  Std Dev: {spread:.2f}")
    return counts_by_level

# Summarise the expert first, then each judge, keeping the returned counts.
expert_dist, judge_a_dist, judge_b_dist = (
    analyze_distribution(rater_ratings, rater_name)
    for rater_ratings, rater_name in (
        (expert_ratings, "Expert"),
        (judge_a_ratings, "Judge A"),
        (judge_b_ratings, "Judge B"),
    )
)

In [None]:
# === CORRELATION ANALYSIS ===

# Section banner: two blank lines, the title, then a 30-character rule.
print("\n\nCORRELATION ANALYSIS", "=" * 30, sep="\n")

def calculate_correlations(ground_truth, predictions, judge_name):
    """Correlate a judge's ratings with the reference ratings.

    Runs Pearson, Spearman and Kendall tests on the two sequences.

    Args:
        ground_truth: Reference ratings.
        predictions: The judge's ratings, in the same order.
        judge_name: Label for the judge; not used in the computation.

    Returns:
        dict with the coefficient and p-value of each test under the keys
        ``pearson_r``/``pearson_p``, ``spearman_r``/``spearman_p`` and
        ``kendall_tau``/``kendall_p``.
    """
    # (coefficient key, p-value key, test function), in output order:
    # Pearson = linear, Spearman = monotonic, Kendall = rank-based.
    tests = (
        ('pearson_r', 'pearson_p', pearsonr),
        ('spearman_r', 'spearman_p', spearmanr),
        ('kendall_tau', 'kendall_p', kendalltau),
    )

    results = {}
    for coefficient_key, p_value_key, run_test in tests:
        coefficient, p_value = run_test(ground_truth, predictions)
        results[coefficient_key] = coefficient
        results[p_value_key] = p_value
    return results

# Correlate each judge with the expert ratings.
corr_a, corr_b = (
    calculate_correlations(expert_ratings, judge_ratings, judge_label)
    for judge_ratings, judge_label in (
        (judge_a_ratings, "Judge A"),
        (judge_b_ratings, "Judge B"),
    )
)

# One table row per test: display label, coefficient key, p-value key.
correlation_rows = (
    ('Pearson r', 'pearson_r', 'pearson_p'),
    ('Spearman ρ', 'spearman_r', 'spearman_p'),
    ('Kendall τ', 'kendall_tau', 'kendall_p'),
)
correlation_df = pd.DataFrame({
    'Metric': [display_name for display_name, stat_key, p_key in correlation_rows],
    'Judge A': [
        f"{corr_a[stat_key]:.3f} (p={corr_a[p_key]:.3f})"
        for display_name, stat_key, p_key in correlation_rows
    ],
    'Judge B': [
        f"{corr_b[stat_key]:.3f} (p={corr_b[p_key]:.3f})"
        for display_name, stat_key, p_key in correlation_rows
    ],
})

print("Correlation with Expert Ratings:")
print(correlation_df.to_string(index=False))

In [None]:
# === ERROR METRICS ===

# Section banner: two blank lines, the title, then a 20-character rule.
print("\n\nERROR METRICS", "=" * 20, sep="\n")

def calculate_error_metrics(ground_truth, predictions):
    """Compute error-based agreement metrics between two rating sequences.

    Both inputs are coerced to float arrays first, so lists, tuples, pandas
    Series and NumPy arrays are all handled the same way, and every metric is
    derived from the same error vector ``predictions - ground_truth``.

    Args:
        ground_truth: Reference ratings (any numeric array-like).
        predictions: Ratings to evaluate, same length and order.

    Returns:
        dict with:
            ``mae``         mean absolute error,
            ``rmse``        root mean square error,
            ``mean_error``  mean signed error (positive means the predictions
                            are higher than the reference on average),
            ``exact_match`` percentage of positions with identical ratings,
            ``within_1``    percentage of positions differing by at most 1.

    Raises:
        ValueError: If the inputs differ in shape or are empty.
    """
    truth = np.asarray(ground_truth, dtype=float)
    preds = np.asarray(predictions, dtype=float)

    # Reject mismatched inputs explicitly; otherwise NumPy could broadcast a
    # length-1 input against the other and return a meaningless score.
    if truth.shape != preds.shape:
        raise ValueError(
            f"ground_truth and predictions must have the same shape, "
            f"got {truth.shape} and {preds.shape}"
        )
    if truth.size == 0:
        raise ValueError("ground_truth and predictions must not be empty")

    signed_error = preds - truth
    abs_error = np.abs(signed_error)

    errors = {}

    # Mean Absolute Error
    errors['mae'] = float(np.mean(abs_error))

    # Root Mean Square Error
    errors['rmse'] = float(np.sqrt(np.mean(signed_error ** 2)))

    # Mean error (bias)
    errors['mean_error'] = float(np.mean(signed_error))

    # Percentage of exact matches
    errors['exact_match'] = float(np.mean(truth == preds) * 100)

    # Percentage within 1 point
    errors['within_1'] = float(np.mean(abs_error <= 1) * 100)

    return errors

# Score each judge's ratings against the expert ratings.
errors_a, errors_b = (
    calculate_error_metrics(expert_ratings, judge_ratings)
    for judge_ratings in (judge_a_ratings, judge_b_ratings)
)

# One table row per metric: display label, key in the error dicts, and the
# format template (bias is shown with an explicit sign, rates with a % suffix).
error_rows = (
    ('Mean Absolute Error', 'mae', '{:.3f}'),
    ('Root Mean Square Error', 'rmse', '{:.3f}'),
    ('Mean Error (Bias)', 'mean_error', '{:+.3f}'),
    ('Exact Match %', 'exact_match', '{:.1f}%'),
    ('Within ±1 Point %', 'within_1', '{:.1f}%'),
)
error_df = pd.DataFrame({
    'Metric': [display_name for display_name, error_key, template in error_rows],
    'Judge A': [template.format(errors_a[error_key]) for display_name, error_key, template in error_rows],
    'Judge B': [template.format(errors_b[error_key]) for display_name, error_key, template in error_rows],
})

print("Error Analysis:")
print(error_df.to_string(index=False))