# Fine-tuned GPT-4o vs Self-Consistency Models

Compares a fine-tuned model (K=1) against self-consistency models (K=5, K=7) on accuracy, latency, and cost, where K is the number of runs per sample.

**Models**: GPT-4o K=5/K=7, GPT-3.5 K=5/K=7, Llama K=5/K=7, Fine-tuned K=1

**Dataset**: 250 interviews with human ground truth

In [None]:
!pip install pandas numpy scipy matplotlib seaborn pingouin -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu
import pingouin as pg
import warnings
# Suppress every warning category for the rest of the session. This keeps cell
# output tidy but also hides runtime/deprecation warnings from the libraries
# imported above.
warnings.filterwarnings('ignore')

# Plot defaults applied to the figures created further down.
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100

print("✓ Setup complete")

## Configuration

Update paths to match your local file structure.

In [None]:
# UPDATE THESE PATHS!
# Root directory that contains every results CSV referenced in FILE_PATHS.
BASE_PATH = '/Users/shreya_sudan/Desktop/ServiceAgent/CleanAgent/metaPromptOpt/data'

# Dataset key -> CSV path. Every key except 'human' also appears in
# MODELS_CONFIG; 'human' is the reference the models are scored against.
FILE_PATHS = {
    'gpt4o_k5': f'{BASE_PATH}/k5RunsResults/final_gpt-4o_20251113-222901.csv',
    'gpt4o_k7': f'{BASE_PATH}/k7RunsResults/final_gpt-4o_20251120-163127.csv',
    'gpt35_k5': f'{BASE_PATH}/k5RunsResults/final_gpt-3.5-turbo_20251113-211454.csv',
    'gpt35_k7': f'{BASE_PATH}/k7RunsResults/final_gpt-3.5-turbo_20251120-151235.csv',
    'llama_k5': f'{BASE_PATH}/k5RunsResults/final_llama_k5.csv',  # Update with actual filename
    'llama_k7': f'{BASE_PATH}/k7RunsResults/final_llama_k7.csv',  # Update with actual filename
    'finetuned': f'{BASE_PATH}/finetuning_output/full_dataset_predictions.csv',
    'human': f'{BASE_PATH}/humanScores/hiring_evaluations.csv'
}

# Per-model settings:
#   label         - display name used in printed tables and plots
#   k             - number of runs per sample
#   cost_in/out   - token prices used for estimated costs
#   training_cost - one-time fine-tuning cost in USD (fine-tuned model only)
#   color         - hex colour used for this model in every plot
# NOTE(review): the prices are written as "<price> / 1000"; the numerators look
# like USD-per-1M-token list prices, which would make these values USD per 1K
# tokens - confirm the unit before multiplying by raw token counts.
MODELS_CONFIG = {
    'gpt4o_k5': {'label': 'GPT-4o (K=5)', 'k': 5, 'cost_in': 2.50/1000, 'cost_out': 10.00/1000, 'color': '#3498db'},
    'gpt4o_k7': {'label': 'GPT-4o (K=7)', 'k': 7, 'cost_in': 2.50/1000, 'cost_out': 10.00/1000, 'color': '#2980b9'},
    'finetuned': {'label': 'GPT-4o FT (K=1)', 'k': 1, 'cost_in': 2.50/1000, 'cost_out': 10.00/1000, 'training_cost': 26.12, 'color': '#e74c3c'},
    'gpt35_k5': {'label': 'GPT-3.5 (K=5)', 'k': 5, 'cost_in': 0.50/1000, 'cost_out': 1.50/1000, 'color': '#95a5a6'},
    'gpt35_k7': {'label': 'GPT-3.5 (K=7)', 'k': 7, 'cost_in': 0.50/1000, 'cost_out': 1.50/1000, 'color': '#7f8c8d'},
    'llama_k5': {'label': 'Llama (K=5)', 'k': 5, 'cost_in': 0.10/1000, 'cost_out': 0.20/1000, 'color': '#34495e'},
    'llama_k7': {'label': 'Llama (K=7)', 'k': 7, 'cost_in': 0.10/1000, 'cost_out': 0.20/1000, 'color': '#2c3e50'},
}

# Scored dimensions; after loading, each one is exposed as a `{metric}_score` column.
METRICS = ['cognitive_ability', 'experience', 'problem_solving', 'reliability', 'professionalism', 'communication']
# Full metric name -> short prefix used by the K-run CSVs (e.g. `ca_score`).
ABBREV = {'cognitive_ability': 'ca', 'experience': 'exp', 'problem_solving': 'ps', 
          'reliability': 'rel', 'professionalism': 'prof', 'communication': 'comm'}

print("✓ Config loaded")

## Load & Standardize Data

In [None]:
def load_and_standardize(path, model_key):
    """Read one results CSV and add a ``{metric}_score`` column per metric.

    The source column for each metric depends on the file layout:

    * files containing ``ca_score`` use the abbreviated ``{abbrev}_score`` names;
    * otherwise ``model_key == 'finetuned'`` reads ``predicted_{metric}``;
    * otherwise ``model_key == 'human'`` reads the bare ``{metric}`` column.

    Source columns that are absent are skipped silently, and every original
    column is kept. Any other ``model_key`` returns an unmodified copy.

    Args:
        path: Anything accepted by ``pd.read_csv``.
        model_key: Dataset key, used to pick the layout when ``ca_score`` is absent.

    Returns:
        A new DataFrame: the CSV contents plus the standardized score columns.
    """
    raw = pd.read_csv(path)
    standardized = raw.copy()

    # Decide, per metric, which raw column should feed `{metric}_score`.
    if 'ca_score' in raw.columns:
        source_columns = {metric: f'{abbrev}_score' for metric, abbrev in ABBREV.items()}
    elif model_key == 'finetuned':
        source_columns = {metric: f'predicted_{metric}' for metric in METRICS}
    elif model_key == 'human':
        source_columns = {metric: metric for metric in METRICS}
    else:
        source_columns = {}

    for metric, source in source_columns.items():
        if source in raw.columns:
            standardized[f'{metric}_score'] = raw[source]

    return standardized

# Load every configured dataset. Loading is best-effort: a file that cannot be
# read is reported and skipped so the remaining datasets are still available.
print("Loading datasets...")
data = {}
for key, path in FILE_PATHS.items():
    try:
        data[key] = load_and_standardize(path, key)
    except Exception as e:
        # Show the exception type and the full message; truncating it tends to
        # cut off the file path, which is the part needed to fix the problem.
        print(f"  ⚠️  {key}: {type(e).__name__}: {e}")
    else:
        print(f"  ✓ {key}: {len(data[key])} samples")

print(f"\n✓ Loaded {len(data)} datasets")
if 'human' not in data:
    # Without the reference scores the metrics step produces no results at all.
    print("  ⚠️  'human' dataset missing - no model can be scored against ground truth")

## Calculate Metrics vs Human

In [None]:
def calc_metrics(df_model, df_human):
    """Compare model scores with human scores on the interviews both share.

    The two frames are inner-joined on ``interview_id``. Error statistics are
    pooled over every metric in ``METRICS``; ICC is computed per metric
    (pingouin's ``ICC2`` row, model and human as the two raters) and averaged.

    Args:
        df_model: Frame with ``interview_id`` and one ``{metric}_score`` column
            per entry of ``METRICS``.
        df_human: Frame with the same columns holding the human scores.

    Returns:
        dict with ``mae``, ``rmse``, ``icc`` (NaN if it could not be computed),
        ``within_1`` / ``within_2`` (percentage of scores whose absolute error
        is at most 1 / 2) and ``n`` (number of merged rows).

    Raises:
        KeyError: if either frame lacks ``interview_id`` or a score column.
    """
    score_cols = [f'{m}_score' for m in METRICS]
    merged = pd.merge(df_model[['interview_id'] + score_cols],
                      df_human[['interview_id'] + score_cols],
                      on='interview_id', suffixes=('_m', '_h'))

    # Pool all metrics into one long vector per rater.
    all_m = np.concatenate([merged[f'{m}_score_m'].values for m in METRICS])
    all_h = np.concatenate([merged[f'{m}_score_h'].values for m in METRICS])
    diff = all_m - all_h
    abs_err = np.abs(diff)

    n = len(merged)
    targets = list(merged['interview_id'].astype(str)) * 2
    raters = ['model'] * n + ['human'] * n

    # ICC is all-or-nothing: if any metric fails, the average is reported as NaN.
    # Catch Exception rather than using a bare except so Ctrl-C still interrupts,
    # and say why the value is missing.
    try:
        icc_vals = []
        for metric in METRICS:
            icc_df = pd.DataFrame({
                'target': targets,
                'rater': raters,
                'score': list(merged[f'{metric}_score_m']) + list(merged[f'{metric}_score_h'])
            })
            icc_result = pg.intraclass_corr(data=icc_df, targets='target', raters='rater', ratings='score')
            icc_vals.append(icc_result.loc[icc_result['Type'] == 'ICC2', 'ICC'].values[0])
        avg_icc = np.mean(icc_vals)
    except Exception as exc:
        print(f"  ⚠️  ICC unavailable ({type(exc).__name__}: {exc})")
        avg_icc = np.nan

    return {
        'mae': np.mean(abs_err),
        'rmse': np.sqrt(np.mean(diff**2)),
        'icc': avg_icc,
        'within_1': np.mean(abs_err <= 1) * 100,
        'within_2': np.mean(abs_err <= 2) * 100,
        'n': n
    }

# Score each configured model that was loaded; nothing is scored unless the
# human reference dataset is available.
print("\nCalculating metrics vs human...")
results = {}
human_loaded = 'human' in data
for key, cfg in MODELS_CONFIG.items():
    if not (human_loaded and key in data):
        continue
    scored = calc_metrics(data[key], data['human'])
    results[key] = scored
    print(f"  {cfg['label']}: MAE={scored['mae']:.3f}, ICC={scored['icc']:.3f}")

print("\n✓ Metrics calculated")

## Extract Latency & Calculate Costs

In [None]:
# Extract latency and costs from actual data
print("Extracting latency and costs from data...")
print("="*70)

# Token counts assumed per call when a results file has no `total_cost` column.
AVG_IN_TOKENS, AVG_OUT_TOKENS = 1000, 100
# MODELS_CONFIG writes cost_in / cost_out as "<price> / 1000". The numerators
# match USD-per-1M-token list prices, so the stored values are USD per 1,000
# tokens and token counts must be scaled to thousands before multiplying.
# NOTE(review): confirm the intended unit; multiplying raw token counts by these
# values would put estimates ~1000x above the per-candidate costs in the data.
TOKENS_PER_PRICE_UNIT = 1000
# Cohort size behind the "Cost 250" column and the training-cost amortisation.
N_CANDIDATES = 250

for key in results:
    df = data[key]
    cfg = MODELS_CONFIG[key]
    res = results[key]

    # LATENCY: median of the first column whose name contains both "latency"
    # and "ms". For K-run files this is assumed to already cover all K runs,
    # so the same value is reported as the total.
    lat_cols = [c for c in df.columns if 'latency' in c.lower() and 'ms' in c.lower()]
    if lat_cols:
        res['latency_ms'] = df[lat_cols[0]].median()
        res['total_latency_ms'] = res['latency_ms']
    else:
        res['latency_ms'] = np.nan
        res['total_latency_ms'] = np.nan

    # COST
    # Estimated price of ONE model call at the assumed token counts.
    est_call_cost = (AVG_IN_TOKENS * cfg['cost_in']
                     + AVG_OUT_TOKENS * cfg['cost_out']) / TOKENS_PER_PRICE_UNIT

    if 'total_cost' in df.columns:
        # Prefer the cost recorded in the results file.
        # NOTE(review): presumably one row per candidate, covering all K runs -
        # verify against the code that writes `total_cost`.
        cost_per_cand = df['total_cost'].mean()
        res['cost_per_cand'] = cost_per_cand
        res['cost_250'] = cost_per_cand * N_CANDIDATES
        res['cost_source'] = 'actual'

    elif key == 'finetuned':
        # Single call per candidate, plus the one-time training cost spread
        # over the cohort.
        train_cost = cfg.get('training_cost', 0)

        res['cost_per_cand'] = est_call_cost + (train_cost / N_CANDIDATES)
        res['cost_250'] = est_call_cost * N_CANDIDATES + train_cost
        res['training_cost'] = train_cost
        res['cost_source'] = 'estimated+training'

    else:
        # Fallback estimation: K calls per candidate.
        inf_cost = est_call_cost * cfg['k']
        res['cost_per_cand'] = inf_cost
        res['cost_250'] = inf_cost * N_CANDIDATES
        res['cost_source'] = 'estimated'

    print(f"  {cfg['label']}:")
    print(f"    Latency: {res.get('latency_ms', 0):.0f}ms (total for K={cfg['k']})")
    print(f"    Cost: ${res['cost_per_cand']:.4f}/cand (source: {res.get('cost_source', 'unknown')})")
    if key == 'finetuned':
        print(f"    Training cost: ${cfg.get('training_cost', 0):.2f} (one-time)")

print("\n" + "="*70)
print("✓ Latency and cost data extracted")


## Results Table

In [None]:
# Build table: one row per scored model, most accurate (lowest MAE) first.
# `_key` keeps the MODELS_CONFIG key so later cells can look up plot colours.
rows = [
    {
        'Model': MODELS_CONFIG[key]['label'],
        'K': MODELS_CONFIG[key]['k'],
        'MAE': res['mae'],
        'ICC': res['icc'],
        'Within ±1%': res['within_1'],
        'Total Latency (ms)': res['total_latency_ms'],
        'Cost/Cand ($)': res['cost_per_cand'],
        'Cost 250 ($)': res['cost_250'],
        '_key': key
    }
    for key, res in results.items()
]

df = pd.DataFrame(rows).sort_values('MAE')

banner = "=" * 80
shown_columns = ['Model', 'K', 'MAE', 'ICC', 'Within ±1%', 'Total Latency (ms)', 'Cost/Cand ($)']

print("\n" + banner)
print("COMPREHENSIVE RESULTS")
print(banner)
print(df[shown_columns].to_string(index=False))

# Winner per criterion = first row after sorting ascending on that column.
most_accurate = df.iloc[0]
cheapest = df.sort_values('Cost/Cand ($)').iloc[0]
fastest = df.sort_values('Total Latency (ms)').iloc[0]

print("\n" + banner)
print("BEST PERFORMERS")
print(banner)
print(f"🎯 Best Accuracy: {most_accurate['Model']} (MAE={most_accurate['MAE']:.3f})")
print(f"💰 Lowest Cost: {cheapest['Model']} (${cheapest['Cost/Cand ($)']:.4f}/cand)")
print(f"⚡ Fastest: {fastest['Model']} ({fastest['Total Latency (ms)']:.0f}ms total)")

## Visualizations

In [None]:
# Six-panel comparison figure built from the results table `df`.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Fine-tuned vs Self-Consistency: Comprehensive Comparison', fontsize=16, fontweight='bold')
(ax_cost, ax_mae, ax_latency), (ax_agree, ax_k, ax_eff) = axes

# One colour per table row, in table order.
colors = [MODELS_CONFIG[model_key]['color'] for model_key in df['_key']]


def _hbar_panel(ax, values, xlabel, title, invert=False):
    """Draw one horizontal bar per table row on *ax*, labelled by model name."""
    positions = range(len(df))
    ax.barh(positions, values, color=colors, edgecolor='black')
    ax.set_yticks(positions)
    ax.set_yticklabels(df['Model'], fontsize=9)
    ax.set_xlabel(xlabel, fontweight='bold')
    ax.set_title(title)
    if invert:
        ax.invert_xaxis()
    ax.grid(axis='x', alpha=0.3)


# 1. Cost vs Accuracy
for model_key, model, cost_250, icc in zip(df['_key'], df['Model'], df['Cost 250 ($)'], df['ICC']):
    ax_cost.scatter(cost_250, icc, s=400, alpha=0.7,
                    color=MODELS_CONFIG[model_key]['color'], edgecolors='black', linewidths=2)
    ax_cost.annotate(model.split('(')[0], (cost_250, icc),
                     fontsize=8, xytext=(3, 3), textcoords='offset points')
ax_cost.set_xlabel('Total Cost 250 ($)', fontweight='bold')
ax_cost.set_ylabel('ICC', fontweight='bold')
ax_cost.set_title('Cost vs Accuracy')
ax_cost.grid(alpha=0.3)
ax_cost.axhline(0.75, color='green', linestyle='--', alpha=0.5)  # ICC reference line

# 2. MAE
_hbar_panel(ax_mae, df['MAE'], 'MAE (Lower Better)', 'Accuracy', invert=True)

# 3. Latency
_hbar_panel(ax_latency, df['Total Latency (ms)'], 'Total Latency (ms)', 'Speed', invert=True)

# 4. Agreement
x = np.arange(len(df))
short_names = [model.split('(')[0] for model in df['Model']]
ax_agree.bar(x, df['Within ±1%'], color=colors, edgecolor='black', alpha=0.8)
ax_agree.set_xticks(x)
ax_agree.set_xticklabels(short_names, rotation=45, ha='right', fontsize=8)
ax_agree.set_ylabel('Within ±1 (%)', fontweight='bold')
ax_agree.set_title('Agreement Rate')
ax_agree.axhline(80, color='green', linestyle='--', alpha=0.5)  # 80% reference line
ax_agree.grid(axis='y', alpha=0.3)

# 5. K vs Accuracy
for model_key, k_runs, mae in zip(df['_key'], df['K'], df['MAE']):
    ax_k.scatter(k_runs, mae, s=300, color=MODELS_CONFIG[model_key]['color'],
                 edgecolors='black', linewidths=2, alpha=0.7)
ax_k.set_xlabel('K Runs', fontweight='bold')
ax_k.set_ylabel('MAE', fontweight='bold')
ax_k.set_title('K vs Accuracy')
ax_k.set_xticks([1, 3, 5, 7])
ax_k.grid(alpha=0.3)
ax_k.invert_yaxis()

# 6. Efficiency Score: each component is scaled by the column maximum; cost and
# latency are flipped so that cheaper/faster scores higher.
icc_col = df['ICC']
cost_col = df['Cost/Cand ($)']
latency_col = df['Total Latency (ms)']
norm_icc = icc_col / icc_col.max()
norm_cost = 1 - (cost_col / cost_col.max())
norm_speed = 1 - (latency_col / latency_col.max())
eff = (norm_icc * 0.5 + norm_cost * 0.25 + norm_speed * 0.25) * 100
_hbar_panel(ax_eff, eff, 'Efficiency Score', 'Overall Efficiency\n(50% Acc, 25% Cost, 25% Speed)')

plt.tight_layout()
plt.show()

## ROI Analysis

In [None]:
# ROI of the fine-tuned single-run model versus GPT-4o self-consistency (K=5).
if 'finetuned' in results and 'gpt4o_k5' in results:
    ft = results['finetuned']
    k5 = results['gpt4o_k5']

    # One-time fine-tuning cost, taken from the model config (26.12 if absent).
    training_cost = MODELS_CONFIG['finetuned'].get('training_cost', 26.12)

    # When the fine-tuned cost was estimated, `cost_per_cand` already contains
    # the training cost spread over 250 candidates (recorded as
    # ft['training_cost']). Remove that share so break-even divides the training
    # cost by the pure inference savings instead of counting training twice.
    AMORTISED_OVER = 250  # must match the divisor used when cost_per_cand was built
    ft_inference_cost = ft['cost_per_cand'] - ft.get('training_cost', 0.0) / AMORTISED_OVER

    savings_per_cand = k5['cost_per_cand'] - ft_inference_cost
    # No positive saving per candidate means training never pays for itself.
    breakeven = training_cost / savings_per_cand if savings_per_cand > 0 else np.inf
    net_savings_1000 = savings_per_cand * 1000 - training_cost

    latency_improvement = ((k5['total_latency_ms'] - ft['total_latency_ms']) / k5['total_latency_ms']) * 100
    # Positive = fine-tuned MAE is higher (worse) than K=5.
    mae_change = ((ft['mae'] - k5['mae']) / k5['mae']) * 100

    print("\n" + "="*80)
    print("ROI ANALYSIS: Fine-tuned (K=1) vs GPT-4o (K=5)")
    print("="*80)
    print(f"\n💰 COST:")
    print(f"   Training (one-time): ${training_cost:.2f}")
    print(f"   Savings per candidate: ${savings_per_cand:.4f}")
    print(f"   Break-even: {breakeven:.0f} candidates")
    print(f"   Savings @ 1000 candidates: ${net_savings_1000:.2f} (net of training)")

    print(f"\n⚡ SPEED:")
    print(f"   K=5: {k5['total_latency_ms']:.0f}ms total")
    print(f"   K=1: {ft['total_latency_ms']:.0f}ms total")
    print(f"   Improvement: {latency_improvement:.1f}% faster")

    print(f"\n🎯 ACCURACY:")
    print(f"   K=5 MAE: {k5['mae']:.3f}")
    print(f"   K=1 MAE: {ft['mae']:.3f}")
    print(f"   Change: {mae_change:+.1f}% {'(worse)' if mae_change > 0 else '(better)'}")

    print(f"\n📊 RECOMMENDATION:")
    if breakeven <= 500 and abs(mae_change) < 5:
        print(f"   ✅ STRONG ROI: Break-even at {breakeven:.0f} candidates with maintained accuracy")
    elif breakeven <= 1000:
        print(f"   ✅ Good ROI: Break-even at {breakeven:.0f} candidates")
    else:
        print(f"   ⚠️  High volume needed: Break-even at {breakeven:.0f} candidates")

    print("\n" + "="*80)