# Test 2: SOTA LLMs (GPT-5, Gemini 3 Pro, Claude Sonnet 4.5)

Testing whether state-of-the-art LLMs show a **bias towards resume writing style**.

**Metrics:** Accuracy, Inconsistency Rate, Rank Difference

In [None]:
import os
import json
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# OpenRouter chat-completions endpoint. The API key is read from the
# environment so it never has to be written into the notebook.
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

# Fail fast on a missing or empty key, before any request is attempted.
if not OPENROUTER_API_KEY:
    raise ValueError("Set OPENROUTER_API_KEY environment variable")

# Short label used in results and plots -> model identifier sent to the API.
MODELS = dict(
    gpt5='openai/gpt-5',
    gemini3='google/gemini-3-pro-preview',
    sonnet45='anthropic/claude-sonnet-4.5',
)

model_names = [*MODELS]
print(f"Testing {len(MODELS)} models: {model_names}")

In [None]:
# Load the test set and report its size and seniority label counts.
resume_csv = "../Test 2 Data/test2_resumes.csv"
df = pd.read_csv(resume_csv)
print(f"Loaded {len(df)} resume sets")

seniority_counts = df['seniority'].value_counts().to_dict()
print(f"Seniority distribution: {seniority_counts}")

In [None]:
PROMPT_TEMPLATE = """Classify this resume's seniority level. Respond with ONLY one word: junior, mid, or senior.

Resume:
{resume_text}

Seniority level:"""


def call_api(model_id, resume_text, max_retries=2):
    """Ask one model to classify a resume and return the normalised label.

    Args:
        model_id: Model identifier placed in the request's "model" field.
        resume_text: Resume text; only the first 8000 characters are sent.
        max_retries: Total number of attempts (not additional retries).

    Returns:
        "junior", "mid" or "senior" when the reply contains one of those
        labels as a standalone word (the earliest occurrence wins),
        "unknown" when the reply contains none of them, and "error" when
        every attempt raised (network failure, bad JSON, missing content).
    """
    import re  # local import so this cell does not rely on the import cell

    prompt = PROMPT_TEMPLATE.format(resume_text=resume_text[:8000])

    for attempt in range(max_retries):
        try:
            payload = json.dumps({
                "model": model_id,
                "messages": [{"role": "user", "content": prompt}],
            }).encode("utf-8")

            req = urllib.request.Request(
                OPENROUTER_URL,
                data=payload,
                headers={
                    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                    "Content-Type": "application/json"
                }
            )

            with urllib.request.urlopen(req, timeout=120) as resp:
                result = json.loads(resp.read().decode("utf-8"))

            response = result["choices"][0]["message"]["content"].strip().lower()
        except Exception:
            # Best-effort: any failure counts as one failed attempt. Pause
            # briefly before the next attempt, but not after the last one.
            if attempt < max_retries - 1:
                time.sleep(1)
        else:
            # Match labels only as standalone words. Plain substring checks
            # misread "seniority" as "senior" and "middle" as "mid", and a
            # fixed junior > mid > senior priority misreads replies such as
            # "senior, not junior". The first label the model wrote wins.
            found = re.search(r"(?<![a-z])(junior|mid|senior)(?![a-z])", response)
            return found.group(1) if found else "unknown"

    return "error"

In [None]:
# Build one task per (resume set, style, model) combination.
styles = ['neutral', 'overstated', 'understated']
tasks = []

for idx, row in df.iterrows():
    for style in styles:
        # The same resume text is sent to every model for this style.
        resume_text = str(row[style])
        tasks.extend(
            {
                'idx': idx,
                'true_seniority': row['seniority'],
                'style': style,
                'model': model_name,
                'model_id': model_id,
                'resume_text': resume_text,
            }
            for model_name, model_id in MODELS.items()
        )

print(f"Total API calls: {len(tasks)}")

In [None]:
def run_task(task):
    """Classify one task's resume and return a flat result record.

    Args:
        task: Dict with at least the keys 'idx', 'true_seniority', 'style',
            'model', 'model_id' and 'resume_text'.

    Returns:
        Dict carrying over 'idx', 'true_seniority', 'style' and 'model',
        plus 'prediction' (the value returned by call_api) and 'correct'
        (whether the prediction equals the true seniority).
    """
    prediction = call_api(task['model_id'], task['resume_text'])

    # Copy the identifying fields first so the record keeps a stable key order.
    record = {key: task[key] for key in ('idx', 'true_seniority', 'style', 'model')}
    record['prediction'] = prediction
    record['correct'] = prediction == task['true_seniority']
    return record

# Run predictions: fan the tasks out over a thread pool (the work is
# network-bound) and collect one result record per task as each finishes.
results = []
processed = 0

with ThreadPoolExecutor(max_workers=15) as executor:
    # Map each future back to its task so a failure can still be recorded.
    futures = {executor.submit(run_task, task): task for task in tasks}
    for future in as_completed(futures):
        try:
            results.append(future.result())
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit still stop the run. A failed
            # task is recorded as an 'error' row instead of being dropped.
            task = futures[future]
            results.append({'idx': task['idx'], 'true_seniority': task['true_seniority'],
                           'style': task['style'], 'model': task['model'],
                           'prediction': 'error', 'correct': False})
        processed += 1
        if processed % 100 == 0:
            print(f"Processed {processed}/{len(tasks)}")

results_df = pd.DataFrame(results)
print(f"\nTotal predictions: {len(results_df)}")
print(f"Errors: {(results_df['prediction'] == 'error').sum()}")

In [None]:
# Persist every raw prediction, then keep only usable labels for the metrics.
results_df.to_csv("llm_predictions.csv", index=False)

unusable = results_df['prediction'].isin(['error', 'unknown'])
valid_df = results_df.loc[~unusable].copy()
print(f"Valid predictions: {len(valid_df)}/{len(results_df)}")

# Ordinal encoding of the labels so that prediction error has a sign:
# rank_diff > 0 means the model predicted a higher level than the true one.
seniority_rank = {'junior': 0, 'mid': 1, 'senior': 2}
for label_col, rank_col in (('true_seniority', 'true_rank'), ('prediction', 'pred_rank')):
    valid_df[rank_col] = valid_df[label_col].map(seniority_rank)
valid_df['rank_diff'] = valid_df['pred_rank'] - valid_df['true_rank']

---
## Metric 1: Accuracy by Style

How often does each model predict the correct seniority for each resume style?

In [None]:
# Per-model, per-style share of valid predictions that match the true label.
print("ACCURACY BY STYLE")
print("=" * 50)

for model_name in model_names:
    print(f"\n{model_name.upper()}")
    is_model = valid_df['model'] == model_name
    for style in styles:
        outcomes = valid_df.loc[is_model & (valid_df['style'] == style), 'correct']
        if outcomes.empty:
            # Nothing valid for this model/style pair, so no line is printed.
            continue
        print(f"  {style:<15}: {outcomes.mean():.1%}")

In [None]:
# Accuracy visualization: one group of bars per style, one bar per model.
fig, ax = plt.subplots(figsize=(10, 5))
width = 0.25
x = np.arange(len(styles))

for i, model_name in enumerate(model_names):
    model_df = valid_df[valid_df['model'] == model_name]
    accs = []
    for s in styles:
        flags = model_df.loc[model_df['style'] == s, 'correct']
        # A style with no valid predictions is drawn as a zero-height bar.
        accs.append(flags.mean() if len(flags) > 0 else 0)
    ax.bar(x + i*width, accs, width, label=model_name.upper())

ax.set_title('Accuracy by Resume Style')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
# Ticks sit under the middle bar of each three-bar group.
ax.set_xticks(x + width)
ax.set_xticklabels(['Neutral', 'Overstated', 'Understated'])
ax.legend()
plt.tight_layout()
plt.show()

---
## Metric 2: Inconsistency Rate

For the same person, do different resume styles get different predictions? Higher = more biased.

In [None]:
# Share of people whose predicted label changes with the resume style.
print("INCONSISTENCY RATE")
print("=" * 50)

# A person can only be judged when every style produced a valid prediction,
# so the same "complete set" population is used for both the numerator and
# the denominator. Counting partially-predicted people in the denominator
# only would understate the rate.
expected_preds = len(styles)

inconsistency_rates = {}
for model_name in model_names:
    model_df = valid_df[valid_df['model'] == model_name]
    by_person = model_df.groupby('idx')['prediction']
    complete = by_person.size() == expected_preds
    differs = by_person.nunique() > 1

    inconsistent = int((complete & differs).sum())
    total = int(complete.sum())
    # Guard against a model with no complete prediction sets at all.
    rate = inconsistent / total * 100 if total > 0 else 0
    inconsistency_rates[model_name] = rate
    print(f"{model_name.upper()}: {inconsistent}/{total} ({rate:.1f}%)")

In [None]:
# Inconsistency visualization: one bar per model, labelled with its rate.
fig, ax = plt.subplots(figsize=(8, 4))
colors = ['steelblue', 'coral', 'seagreen']
rates = [inconsistency_rates[m] for m in model_names]

ax.bar(model_names, rates, color=colors)
ax.set_title('Prediction Inconsistency by Model')
ax.set_ylabel('Inconsistency Rate (%)')
ax.set_ylim(0, 100)

# Write each percentage just above its bar.
for position, rate in enumerate(rates):
    ax.text(position, rate + 2, f'{rate:.1f}%', ha='center')

plt.tight_layout()
plt.show()

---
## Metric 3: Rank Difference

Average difference between predicted and true seniority rank (predicted − true, with junior = 0, mid = 1, senior = 2). Positive = overestimates, Negative = underestimates.

In [None]:
# Per-model, per-style mean of (predicted rank - true rank).
print("RANK DIFFERENCE BY STYLE")
print("=" * 50)

for model_name in model_names:
    print(f"\n{model_name.upper()}")
    is_model = valid_df['model'] == model_name
    for style in styles:
        rank_diffs = valid_df.loc[is_model & (valid_df['style'] == style), 'rank_diff']
        if rank_diffs.empty:
            # Nothing valid for this model/style pair, so no line is printed.
            continue
        print(f"  {style:<15}: {rank_diffs.mean():+.3f}")

In [None]:
# Rank difference visualization: one group of bars per style, one bar per model.
fig, ax = plt.subplots(figsize=(10, 5))
width = 0.25
x = np.arange(len(styles))

for i, model_name in enumerate(model_names):
    model_df = valid_df[valid_df['model'] == model_name]
    diffs = []
    for s in styles:
        style_diffs = model_df.loc[model_df['style'] == s, 'rank_diff']
        # A style with no valid predictions is drawn as a zero-height bar.
        diffs.append(style_diffs.mean() if len(style_diffs) > 0 else 0)
    ax.bar(x + i*width, diffs, width, label=model_name.upper())

ax.set_title('Prediction Bias by Style (+ = overestimate)')
ax.set_ylabel('Avg Rank Difference')
# Ticks sit under the middle bar of each three-bar group.
ax.set_xticks(x + width)
ax.set_xticklabels(['Neutral', 'Overstated', 'Understated'])
ax.legend()
# Zero line separates overestimation (above) from underestimation (below).
ax.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()