# Reproduction and Benchmarking

**Purpose**: Validate the system on COMPAS (known bias) and Medicare (real-world) data

**Outputs**: Pareto frontier, statistical tests, safety assessment

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

## Load Benchmark Results

In [None]:
# Read the COMPAS benchmark outputs from the results/ directory:
#   table - per-method summary (one row per method)
#   stats - pairwise method comparisons (method_a vs. method_b)
table = pd.read_csv('results/benchmark_compas_table.csv')
stats = pd.read_csv('results/benchmark_compas_stats.csv')

# Show the full summary table without the DataFrame index column.
print('Benchmark Results:', table.to_string(index=False), sep='\n')

## Clinical Safety Assessment

In [None]:
# Count the methods whose 'Clinical Safety' column is marked 'SAFE' and print
# a verdict that reflects the data. The conclusion must not be hard-coded:
# claiming that every method is safe when some are not would be misleading.
# NOTE(review): 'SAFE' is assumed to encode "FNR disparity < 5%", as the
# summary message states -- confirm against the code that writes the CSV.
is_safe = table['Clinical Safety'] == 'SAFE'
safe_methods = table[is_safe]
print(f"Methods achieving SAFE status: {len(safe_methods)}/{len(table)}")
if len(table) == 0:
    # An empty table supports no conclusion at all.
    print('\nNo methods to assess: benchmark table is empty')
elif len(safe_methods) == len(table):
    print('\nAll methods meet clinical safety threshold (FNR disparity < 5%)')
else:
    # Name the offending methods so the reader does not have to scan the table.
    unsafe_names = ', '.join(table.loc[~is_safe, 'Method'].astype(str))
    print('\nMethods NOT meeting clinical safety threshold '
          f'(FNR disparity < 5%): {unsafe_names}')

## Pareto Frontier

In [None]:
# Scatter each method's mean accuracy against its mean FNR disparity, label
# every point with the method name, and draw a dashed vertical line at 0.05
# to mark the safety threshold.
fnr_disparity = table['FNR Disparity (mean)']
accuracy = table['Accuracy (mean)']

plt.figure(figsize=(10, 6))
plt.scatter(fnr_disparity, accuracy, s=100)
for method_name, x_pos, y_pos in zip(table['Method'], fnr_disparity, accuracy):
    plt.annotate(method_name, (x_pos, y_pos))
plt.axvline(0.05, color='r', linestyle='--', label='Safety Threshold')
plt.xlabel('FNR Disparity')
plt.ylabel('Accuracy')
plt.title('Accuracy-Fairness Tradeoff')
plt.legend()
plt.show()

## Statistical Significance

In [None]:
# Select the pairwise comparisons whose 'significant' value equals True and
# report how many there are, then list every comparison (significant or not)
# with its metric, p-value and winner.
report_columns = ['method_a', 'method_b', 'metric', 'p_value', 'winner']
sig_results = stats.loc[stats['significant'].eq(True)]
print(f'Statistically significant differences: {len(sig_results)}')
print(stats[report_columns].to_string(index=False))