# Comparison of human vs. LLM performance

## A. Human diagnostic accuracy

In [1]:
# Load human diagnoses from spreadsheet
import pandas as pd
human_diagnoses_path = (
    "../../results/human_to_llm_comparison/"
    "Clinical Annotation_ Task 3 - Produce Human Diagnoses.xlsx"
)
# Open the workbook once and pull out one worksheet per diagnostician
with pd.ExcelFile(human_diagnoses_path) as workbook:
    juliana_zhang = workbook.parse(sheet_name="Juliana Zhang")
    yasna_rostam_abadi = workbook.parse(sheet_name="Yasna Rostam Abadi")

In [2]:
# Stack the per-diagnostician sheets into one long DataFrame, tagging each row with its author
sheets_by_diagnostician = {
    "Juliana Zhang": juliana_zhang,
    "Yasna Rostam Abadi": yasna_rostam_abadi,
}
human_diagnoses = pd.concat(
    [sheet.assign(diagnostician=name) for name, sheet in sheets_by_diagnostician.items()],
    ignore_index=True,
)

In [3]:
# Rename the columns (by position) to short snake_case names for easier analysis
human_diagnoses = human_diagnoses.set_axis(
    ["case_id", "vignette", "human_diagnosis", "reasoning", "diagnostician"],
    axis="columns",
)

In [4]:
# Attach true diagnoses from the reasoning evaluation sheet back to the dataframe
annotations = "../../results/evaluate_diagnostic_reasoning/clinician_annotations/Clinical Annotation_ Task 2B - Judge LLM Reasoning (Blinded).xlsx"
true_diagnoses = (
    pd.read_excel(annotations, sheet_name="Carolyn Rodriguez", usecols=["Case ID", "True Diagnosis"], nrows=30)
    # Rename by header rather than by position: read_excel returns the selected columns
    # in sheet order, not in the order they are listed in `usecols`
    .rename(columns={"Case ID": "case_id", "True Diagnosis": "true_diagnosis"})
)
human_diagnoses = (
    human_diagnoses
    # Drop any copy left by a previous run of this cell; otherwise the merge would
    # produce true_diagnosis_x / true_diagnosis_y instead of true_diagnosis
    .drop(columns="true_diagnosis", errors="ignore")
    .merge(true_diagnoses, on="case_id", how="left")
)

human_diagnoses.head()

Unnamed: 0,case_id,vignette,human_diagnosis,reasoning,diagnostician,true_diagnosis
0,181,"Terry Najarian, a 65-year-old salesman for a l...",1. Paraphilia (F 65)\n2. Adjustment disorder w...,Pt demonstrates a paraphilia (persistent sexua...,Juliana Zhang,Fetishistic disorder
1,62,A 27-year-old man was admitted to this hospita...,1. Functional neurologic disorder (F 44.5)\n2....,Pt has history of whole convulsions that were ...,Juliana Zhang,Functional seizures.
2,169,Only upon the repeated and fervent insistence ...,1. Post-concussion syndrome (F 07.81)\n2. Alco...,Parents describe an abrupt change in the patie...,Juliana Zhang,1. Mild neurocognitive disorder due to traumat...
3,155,"Lucas Sandahl, a 32-year-old landscape archite...",1. Intermittent explosive disorder (F 63.81)\n...,Pt has four episodes of verbal aggression week...,Juliana Zhang,Intermittent explosive disorder
4,32,J is a mid-20s military service member living ...,1. PTSD (F 43.1)\n2. Autism spectrum disorder ...,Pt has significant childhood adverse events an...,Juliana Zhang,"Autism Spectrum Disorder, Major Depressive Dis..."


In [10]:
# Helper functions: Parse ground truth diagnoses and model-predicted diagnoses strings from DataFrame into lists
import re

def parse_ground_truth_diagnoses(diagnosis_str) -> list:
    r"""
    Converts '1. Diagnosis A\n2. Diagnosis B' (or 'Diagnosis A; Diagnosis B') into
    ['Diagnosis A', 'Diagnosis B'].

    Args:
        diagnosis_str: Raw ground-truth cell value. Anything that is not a string
            (e.g. NaN from an empty spreadsheet cell) yields an empty list.

    Returns:
        List of diagnosis strings with numbering and surrounding whitespace removed.
        Blank entries are never returned.
    """
    # Handle empty or non-string inputs
    if not isinstance(diagnosis_str, str):
        return []

    text = diagnosis_str.strip()

    # Numbered list (starts with "1." or similar): one diagnosis per line
    if re.search(r'^\d+\.', text):
        diagnoses = []
        for line in text.split('\n'):
            entry = line.strip()  # Also removes indentation and a trailing '\r'
            if not entry:
                continue  # Blank lines are separators, not diagnoses
            # Require whitespace after the dot so a leading code such as "296.32 ..." is left intact
            match = re.match(r'\d+\.\s+(.*)', entry)
            diagnoses.append(match.group(1).strip() if match else entry)
        return diagnoses

    # Otherwise treat the string as a semicolon-separated list
    return [diag.strip() for diag in text.split(';') if diag.strip()]

def parse_model_predicted_diagnoses(model_diagnoses_str) -> list:
    r"""
    Converts '1. Diagnosis A\n2. Diagnosis B' into ['Diagnosis A', 'Diagnosis B'].

    The position of each entry in the returned list is used as its rank, so blank
    lines are dropped rather than returned as empty predictions.

    Args:
        model_diagnoses_str: Raw predicted-diagnoses text, one diagnosis per line.
            Anything that is not a string yields an empty list.

    Returns:
        List of diagnosis strings in their original order, with numbering and
        surrounding whitespace removed.
    """
    # Handle empty or non-string inputs
    if not isinstance(model_diagnoses_str, str):
        return []

    diagnoses = []
    for line in model_diagnoses_str.split('\n'):
        entry = line.strip()  # Also removes indentation and a trailing '\r'
        if not entry:
            continue  # A blank line is not a prediction and must not occupy a rank
        match = re.match(r'\d+\.\s+(.*)', entry)
        # If not numbered, keep the line as-is
        diagnoses.append(match.group(1).strip() if match else entry)
    return diagnoses

In [11]:
# Compare ground truth and predicted diagnoses using hybrid fuzzy + LLM approach for one case
from rapidfuzz import fuzz
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel
import json
import os

# Populate the process environment via python-dotenv before reading the API key
load_dotenv()

# Create the OpenAI client used by the LLM judge, authenticated with OPENAI_API_KEY
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Define hybrid evaluator class for one case
class HybridEvaluator:
    """
    Decides whether a predicted diagnosis matches a true diagnosis.

    Tier 1 is a fuzzy string comparison; pairs scoring below `fuzzy_threshold`
    are sent to an LLM judge (tier 2). Judge verdicts are cached per normalized
    pair so the same comparison is not paid for twice.

    Attributes:
        fuzzy_threshold: Minimum token_set_ratio score (0-100) accepted as a match.
        llm_model: Model name passed to the OpenAI Responses API for the judge.
        cache: Maps "true term || pred term" (normalized) to the judge's verdict.
        llm_calls: Number of judge requests attempted so far.
    """

    def __init__(self, fuzzy_threshold=90, llm_model="gpt-5-mini"):
        self.fuzzy_threshold = fuzzy_threshold
        self.llm_model = llm_model
        # The cache prevents paying for the same comparison twice
        # Structure: {"True Term || Pred Term": True/False}
        self.cache = {}
        self.llm_calls = 0

    def check_match(self, true_diag, pred_diag):
        """
        Returns True if match, False if not.
        Uses Fuzzy first, then falls back to LLM.

        A failed judge request counts as "no match" for this call but is not
        cached, so the same pair is retried the next time it is seen.
        """
        # 1. Normalize strings
        t = true_diag.lower().strip()
        p = pred_diag.lower().strip()

        # 2. TIER 1: Fuzzy String Matching (Free & Fast)
        # token_set_ratio handles reordering (e.g. "Type 2 Diabetes" == "Diabetes Type 2")
        fuzzy_score = fuzz.token_set_ratio(t, p)
        if fuzzy_score >= self.fuzzy_threshold:
            return True

        # 3. TIER 2: LLM Judge (Semantic)
        # Only runs if the fuzzy score is below self.fuzzy_threshold
        # Check cache first
        cache_key = f"{t} || {p}"
        if cache_key in self.cache:
            return self.cache[cache_key]

        # Call LLM
        verdict = self._ask_llm(t, p)
        self.llm_calls += 1

        if verdict is None:
            # The request failed: report no match, but leave the cache untouched so
            # a transient API error does not become a permanent verdict
            return False

        # Update Cache
        self.cache[cache_key] = verdict
        return verdict

    def _ask_llm(self, t, p):
        """
        Asks the LLM judge whether prediction `p` is equivalent to true diagnosis `t`.

        Returns:
            True or False as judged by the model, or None if the request failed or
            returned no parsed output.
        """
        # Define prompt for LLM-as-a-judge
        prompt = f"""

        Your task is to act as a strict medical adjudicator specializing in psychiatry and identify whether the predicted diagnosis is clinically equivalent to (or a valid subclass of) the true diagnosis. Your standards are exacting, and you must consider the nuances of each diagnosis carefully. As much as possible, adhere to the diagnostic language laid out in the DSM-5-TR, and utilize the included ICD-10 F-codes to aid your determination.
        
        True Diagnosis: "{t}"
        Predicted Diagnosis: "{p}"
        
        Return JSON ONLY: {{ "match": <true/false> }}
        """

        # Define response schema
        class DiagnosisMatch(BaseModel):
            match: bool  # True if match, False if not

        try:
            response = client.responses.parse(
                model=self.llm_model,  # Honor the model chosen at construction time
                input=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                text_format=DiagnosisMatch,
            )
            # output_parsed is the schema instance regardless of how many reasoning
            # items come before the message in response.output
            parsed = response.output_parsed
            if parsed is None:
                print("LLM Error: no parsed output returned")
                return None
            return bool(parsed.match)
        except Exception as e:
            print(f"LLM Error: {e}")
            return None

In [7]:
# List every distinct diagnostician once, in order of first appearance
humans = human_diagnoses['diagnostician'].drop_duplicates().tolist()
results_path = "../../results/human_to_llm_comparison"

In [8]:
# Calculate accuracy metrics for humans (n=30)
from tqdm import tqdm

COL_TRUE = 'true_diagnosis'
COL_PRED = 'human_diagnosis'

# Initialize Evaluator (shared across diagnosticians so cached judge verdicts are reused)
evaluator = HybridEvaluator(fuzzy_threshold=90, llm_model="gpt-5-mini")

results = []
scored_rows = []  # Index labels of the human_diagnoses rows that produced a result

# Iterate through DataFrame
for human in humans:
    print(f"Starting evaluation for {human}...")

    # Isolate to current human
    diagnostician_df = human_diagnoses[human_diagnoses['diagnostician'] == human]

    # Iterate through cases for this diagnostician
    for index, row in tqdm(diagnostician_df.iterrows(), total=len(diagnostician_df)):

        # 1. Parse Data
        y_true = parse_ground_truth_diagnoses(row[COL_TRUE])
        y_pred = parse_model_predicted_diagnoses(row[COL_PRED])

        # If no ground truth, skip
        if not y_true:
            continue

        # 2. Analyze Matches
        # We map which TRUE diagnoses were found in the PRED list
        found_indices = set()
        first_match_rank = None  # For MRR

        # Iterate through predictions (Order matters for Rank!)
        for rank_idx, pred_item in enumerate(y_pred):
            current_rank = rank_idx + 1  # 1-based rank

            # Check against ALL true items
            is_this_pred_correct = False

            for true_idx, true_item in enumerate(y_true):
                # THE HYBRID CHECK
                if evaluator.check_match(true_item, pred_item):
                    is_this_pred_correct = True
                    found_indices.add(true_idx)

            # If this prediction was a match, and it's the first one we've seen...
            if is_this_pred_correct and first_match_rank is None:
                first_match_rank = current_rank

        # 3. Calculate Metrics

        # Hybrid Recall@5: % of true diagnoses found
        recall_score = len(found_indices) / len(y_true)

        # Hybrid Hit Rate: Did we find at least one?
        hit_rate = 1.0 if len(found_indices) > 0 else 0.0

        # Hybrid MRR: 1 / Rank of first match
        mrr_score = (1 / first_match_rank) if first_match_rank else 0.0

        # Hybrid Top-1: Did the very first prediction match *any* truth?
        # We can check if Rank 1 was the first match
        top1_score = 1.0 if first_match_rank == 1 else 0.0

        results.append({
            "case_id": row['case_id'],
            "y_true": y_true,
            "y_pred": y_pred,
            "hybrid_top1": top1_score,
            "hybrid_hit_rate": hit_rate,
            "hybrid_recall": recall_score,
            "hybrid_mrr": mrr_score
        })
        scored_rows.append(index)

    # evaluator.llm_calls is cumulative across diagnosticians
    print(f"Done! Made {evaluator.llm_calls} calls to LLM.")

# Build the combined table once, after every diagnostician has been scored.
# Results are aligned to their source rows by index label, so a case skipped for
# missing ground truth gets NaN metrics instead of shifting later rows onto the wrong case.
results_df = pd.DataFrame(results, index=scored_rows)
final_df = pd.concat([human_diagnoses, results_df], axis=1)

Starting evaluation for Juliana Zhang...


100%|██████████| 30/30 [17:41<00:00, 35.37s/it]


Done! Made 232 calls to LLM.
Starting evaluation for Yasna Rostam Abadi...


100%|██████████| 30/30 [18:29<00:00, 36.97s/it]

Done! Made 446 calls to LLM.





In [None]:
for human in humans:
    print(f"\n=== RESULTS FOR {human} ===")
    # Isolate the scored cases of the current human. A loop-local name is used so
    # the evaluation cell's results_df is not overwritten.
    human_results = final_df[final_df['diagnostician'] == human]

    # 1. Aggregate Statistics
    stats = {
        "Metric": ["Top-1 Accuracy", "Top-5 Accuracy", "Recall@5", "MRR"],
        "Score": [
            human_results['hybrid_top1'].mean(),
            human_results['hybrid_hit_rate'].mean(),
            human_results['hybrid_recall'].mean(),
            human_results['hybrid_mrr'].mean()
        ]
    }
    stats_df = pd.DataFrame(stats)

    # Display nicely formatted percentages.
    # A Styler created inside a loop is never rendered automatically, so it is passed
    # to display(), which the IPython kernel provides.
    print("\n=== FINAL DIAGNOSTIC PERFORMANCE (Mean Scores) ===")
    display(stats_df.style.format({"Score": "{:.2%}"}))

    # Save stats to CSV
    stats_df.to_csv(f"{human}_diagnostic_evaluation_summary.csv", index=False)
    print(f"\nSaved summary statistics to '{human}_diagnostic_evaluation_summary.csv'")


=== RESULTS FOR Juliana Zhang ===

=== FINAL DIAGNOSTIC PERFORMANCE (Mean Scores) ===

Saved summary statistics to 'Juliana Zhang_diagnostic_evaluation_summary.csv'

=== RESULTS FOR Yasna Rostam Abadi ===

=== FINAL DIAGNOSTIC PERFORMANCE (Mean Scores) ===

Saved summary statistics to 'Yasna Rostam Abadi_diagnostic_evaluation_summary.csv'


In [13]:
# Calculate the mean of each metric over all scored rows of final_df (all humans pooled)
overall_stats = {
    "Metric": ["Top-1 Accuracy", "Top-5 Accuracy", "Recall@5", "MRR"],
    "Score": [
        final_df['hybrid_top1'].mean(),
        final_df['hybrid_hit_rate'].mean(),
        final_df['hybrid_recall'].mean(),
        final_df['hybrid_mrr'].mean()
    ]
}
overall_stats_df = pd.DataFrame(overall_stats)

# The Styler is not the last expression of the cell, so it must be passed to
# display() (provided by the IPython kernel) to be rendered
display(overall_stats_df.style.format({"Score": "{:.2%}"}))

# Save overall stats to CSV
overall_stats_df.to_csv("overall_diagnostic_evaluation_summary.csv", index=False)
print("\nSaved overall summary statistics to 'overall_diagnostic_evaluation_summary.csv'")


Saved overall summary statistics to 'overall_diagnostic_evaluation_summary.csv'


## B. LLM diagnostic accuracy (Claude Opus 4.5)

In [3]:
import pandas as pd

# Use the vignettes from the human diagnoses workbook as input for the LLM; remove the human answer columns
human_diagnoses_path = "../../results/human_to_llm_comparison/Clinical Annotation_ Task 3 - Produce Human Diagnoses.xlsx"
# Read into a separate name so the combined `human_diagnoses` frame used by the
# human-accuracy section is not replaced by this single raw sheet
vignette_sheet = pd.read_excel(human_diagnoses_path, sheet_name="Juliana Zhang")

model_diagnoses = (
    vignette_sheet
    .drop(columns=["Diagnosis", "Reasoning"])
    .set_axis(["case_id", "vignette"], axis="columns")
)
dataset_name = "model_diagnoses"
model_diagnoses.head()

Unnamed: 0,case_id,vignette
0,181,"Terry Najarian, a 65-year-old salesman for a l..."
1,62,A 27-year-old man was admitted to this hospita...
2,169,Only upon the repeated and fervent insistence ...
3,155,"Lucas Sandahl, a 32-year-old landscape archite..."
4,32,J is a mid-20s military service member living ...


In [14]:
# Attach true diagnoses from the reasoning evaluation sheet back to the dataframe
annotations = "../../results/evaluate_diagnostic_reasoning/clinician_annotations/Clinical Annotation_ Task 2B - Judge LLM Reasoning (Blinded).xlsx"
true_diagnoses = (
    pd.read_excel(annotations, sheet_name="Carolyn Rodriguez", usecols=["Case ID", "True Diagnosis"], nrows=30)
    # Rename by header rather than by position: read_excel returns the selected columns
    # in sheet order, not in the order they are listed in `usecols`
    .rename(columns={"Case ID": "case_id", "True Diagnosis": "true_diagnosis"})
)
model_diagnoses = (
    model_diagnoses
    # Drop any copy left by a previous run of this cell; otherwise the merge would
    # produce true_diagnosis_x / true_diagnosis_y instead of true_diagnosis
    .drop(columns="true_diagnosis", errors="ignore")
    .merge(true_diagnoses, on="case_id", how="left")
)

In [4]:
# Define system instructions and user prompt.
# The encoding is given explicitly so the prompts are read identically on every
# platform instead of depending on the locale's default encoding
# (assumes the prompt files are UTF-8 - TODO confirm).
with open("../../code/prompts/top_5_accuracy/system_prompt.txt", encoding="utf-8") as f:
    system_prompt = f.read()

with open("../../code/prompts/top_5_accuracy/user_prompt.txt", encoding="utf-8") as f:
    user_prompt = f.read()

In [5]:
# Generate differential diagnosis for one case using Claude Opus 4.5
def generate_top5_diagnoses(client, model, system_prompt, user_prompt, vignette):
    """
    Requests a differential diagnosis for one vignette from an Anthropic Claude model
    with extended thinking enabled.

    Args:
        client: Anthropic client exposing `messages.create`.
        model: Model identifier; must start with "claude".
        system_prompt: System instructions sent with the request.
        user_prompt: User prompt; the vignette is appended inside <vignette> tags.
        vignette: Case text to diagnose.

    Returns:
        Tuple `(reasoning, answer)`. `reasoning` is the text of the thinking blocks
        ("N/A" if there are none or the model refused); `answer` is the text of the
        response's text blocks.

    Raises:
        ValueError: If `model` is not a Claude model (no other provider is implemented).
    """
    if not model.startswith("claude"):
        # Previously this fell through to the return and failed with an unbound-variable error
        raise ValueError(f"Unsupported model '{model}': only Anthropic Claude models are supported.")

    # Anthropic Claude: Make API call and create response object
    response = client.messages.create(
        model=model,
        max_tokens=20000,  # Max output for Claude Opus 4.5 is 64k but >20k requires streaming
        system=system_prompt,
        # Extended thinking mode is not compatible with temperature, top_p, or top_k sampling
        thinking={
            "type": "enabled",
            "budget_tokens": 19000  # Allocate tokens for thinking - model may not use entire budget
        },
        messages=[
            {
                "role": "user",
                "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"
            }
        ]
    )

    # Handle model refusal to answer
    if response.stop_reason == "refusal":
        return "N/A", "Model refused to answer the prompt."

    # Extract the response content
    thinking_parts = []
    answer_parts = []
    for block in response.content:
        if block.type == "text":  # Extract differential diagnosis block
            answer_parts.append(block.text)
        elif block.type == "thinking":  # Extract summarized thinking block
            thinking_parts.append(block.thinking)
        elif block.type == "redacted_thinking":  # Handle redacted thinking block
            # Redacted blocks carry only encrypted `data`, not a `thinking` attribute,
            # so a placeholder is recorded instead
            print(f"Redacted thinking detected for \"{vignette[:30]}...\"")
            thinking_parts.append("[redacted thinking]")

    # With a single block of each kind this equals that block's text
    reasoning = "\n".join(thinking_parts) if thinking_parts else "N/A"
    answer = "\n".join(answer_parts)
    return reasoning, answer

In [6]:
# Initialize Anthropic client
import anthropic
from tqdm import tqdm

anthropic_client = anthropic.Anthropic()
model = "claude-opus-4-5-20251101"

print("***********************************************")
print(f"Processing model {model} on dataset {dataset_name}...")

# Number of cases; only columns are added to model_diagnoses below, never rows
n_cases = model_diagnoses.shape[0]

# Process each case sequentially, with a progress bar for tracking
pbar = tqdm(model_diagnoses.iterrows(), total=n_cases)

for index, row in pbar:
    position = index + 1
    case_id = row['case_id']
    pbar.set_description(f"Generating differential diagnoses for case {position} out of {n_cases} (case {case_id})")

    # Temperature is not passed: it is not compatible with extended thinking mode
    reasoning, answer = generate_top5_diagnoses(
        anthropic_client,
        model,
        system_prompt,
        user_prompt,
        row["vignette"],
    )

    # Store the model's thinking and its differential diagnosis on the case's row
    model_diagnoses.loc[index, "model_thoughts"] = reasoning
    model_diagnoses.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {position} out of {n_cases} (case {case_id}).")

***********************************************
Processing model claude-opus-4-5-20251101 on dataset model_diagnoses...


Generating differential diagnoses for case 2 out of 30 (case 62):   3%|▎         | 1/30 [00:55<26:52, 55.60s/it] 

Completed case 1 out of 30 (case 181).


Generating differential diagnoses for case 3 out of 30 (case 169):   7%|▋         | 2/30 [01:41<23:17, 49.91s/it]

Completed case 2 out of 30 (case 62).


Generating differential diagnoses for case 4 out of 30 (case 155):  10%|█         | 3/30 [02:31<22:23, 49.76s/it]

Completed case 3 out of 30 (case 169).


Generating differential diagnoses for case 5 out of 30 (case 32):  13%|█▎        | 4/30 [03:14<20:31, 47.38s/it] 

Completed case 4 out of 30 (case 155).


Generating differential diagnoses for case 6 out of 30 (case 1013):  17%|█▋        | 5/30 [03:57<18:59, 45.59s/it]

Completed case 5 out of 30 (case 32).


Generating differential diagnoses for case 7 out of 30 (case 94):  20%|██        | 6/30 [04:48<18:58, 47.46s/it]  

Completed case 6 out of 30 (case 1013).


Generating differential diagnoses for case 8 out of 30 (case 1032):  23%|██▎       | 7/30 [05:56<20:44, 54.13s/it]

Completed case 7 out of 30 (case 94).


Generating differential diagnoses for case 9 out of 30 (case 1003):  27%|██▋       | 8/30 [06:39<18:37, 50.80s/it]

Completed case 8 out of 30 (case 1032).


Generating differential diagnoses for case 10 out of 30 (case 80):  30%|███       | 9/30 [07:39<18:46, 53.65s/it] 

Completed case 9 out of 30 (case 1003).


Generating differential diagnoses for case 11 out of 30 (case 109):  33%|███▎      | 10/30 [08:53<19:56, 59.80s/it]

Completed case 10 out of 30 (case 80).


Generating differential diagnoses for case 12 out of 30 (case 118):  37%|███▋      | 11/30 [10:15<21:05, 66.63s/it]

Completed case 11 out of 30 (case 109).


Generating differential diagnoses for case 13 out of 30 (case 159):  40%|████      | 12/30 [11:34<21:06, 70.36s/it]

Completed case 12 out of 30 (case 118).


Generating differential diagnoses for case 14 out of 30 (case 1021):  43%|████▎     | 13/30 [12:30<18:43, 66.11s/it]

Completed case 13 out of 30 (case 159).


Generating differential diagnoses for case 15 out of 30 (case 104):  47%|████▋     | 14/30 [14:09<20:16, 76.01s/it] 

Completed case 14 out of 30 (case 1021).


Generating differential diagnoses for case 16 out of 30 (case 1049):  50%|█████     | 15/30 [14:40<15:35, 62.38s/it]

Completed case 15 out of 30 (case 104).


Generating differential diagnoses for case 17 out of 30 (case 115):  53%|█████▎    | 16/30 [15:15<12:37, 54.12s/it] 

Completed case 16 out of 30 (case 1049).


Generating differential diagnoses for case 18 out of 30 (case 1009):  57%|█████▋    | 17/30 [16:06<11:32, 53.23s/it]

Completed case 17 out of 30 (case 115).


Generating differential diagnoses for case 19 out of 30 (case 99):  60%|██████    | 18/30 [16:36<09:16, 46.39s/it]  

Completed case 18 out of 30 (case 1009).


Generating differential diagnoses for case 20 out of 30 (case 23):  63%|██████▎   | 19/30 [17:13<07:57, 43.44s/it]

Completed case 19 out of 30 (case 99).


Generating differential diagnoses for case 21 out of 30 (case 1020):  67%|██████▋   | 20/30 [17:57<07:17, 43.73s/it]

Completed case 20 out of 30 (case 23).


Generating differential diagnoses for case 22 out of 30 (case 85):  70%|███████   | 21/30 [18:56<07:13, 48.13s/it]  

Completed case 21 out of 30 (case 1020).


Generating differential diagnoses for case 23 out of 30 (case 1012):  73%|███████▎  | 22/30 [19:38<06:10, 46.32s/it]

Completed case 22 out of 30 (case 85).


Generating differential diagnoses for case 24 out of 30 (case 178):  77%|███████▋  | 23/30 [20:29<05:33, 47.70s/it] 

Completed case 23 out of 30 (case 1012).


Generating differential diagnoses for case 25 out of 30 (case 1047):  80%|████████  | 24/30 [21:09<04:33, 45.57s/it]

Completed case 24 out of 30 (case 178).


Generating differential diagnoses for case 26 out of 30 (case 17):  83%|████████▎ | 25/30 [21:58<03:51, 46.33s/it]  

Completed case 25 out of 30 (case 1047).


Generating differential diagnoses for case 27 out of 30 (case 1033):  87%|████████▋ | 26/30 [22:30<02:49, 42.29s/it]

Completed case 26 out of 30 (case 17).


Generating differential diagnoses for case 28 out of 30 (case 1056):  90%|█████████ | 27/30 [22:55<01:51, 37.01s/it]

Completed case 27 out of 30 (case 1033).


Generating differential diagnoses for case 29 out of 30 (case 108):  93%|█████████▎| 28/30 [24:05<01:33, 46.91s/it] 

Completed case 28 out of 30 (case 1056).


Generating differential diagnoses for case 30 out of 30 (case 97):  97%|█████████▋| 29/30 [25:15<00:53, 53.89s/it] 

Completed case 29 out of 30 (case 108).


Generating differential diagnoses for case 30 out of 30 (case 97): 100%|██████████| 30/30 [26:34<00:00, 53.13s/it]

Completed case 30 out of 30 (case 97).





In [15]:
# Write the cases together with the model's thoughts and diagnoses to a per-model CSV
predictions_csv = f"../../results/human_to_llm_comparison/predicted_diagnoses_{model}.csv"
model_diagnoses.to_csv(predictions_csv, index=False)

In [16]:
# Calculate accuracy metrics for the model (n=30)
from tqdm import tqdm

COL_TRUE = 'true_diagnosis'
COL_PRED = 'model_diagnosis'

# Initialize Evaluator
evaluator = HybridEvaluator(fuzzy_threshold=90, llm_model="gpt-5-mini")

results = []
scored_rows = []  # Index labels of the model_diagnoses rows that produced a result

print(f"Starting evaluation for {model}...")

# Iterate through cases for this model
for index, row in tqdm(model_diagnoses.iterrows(), total=len(model_diagnoses)):

    # 1. Parse Data
    y_true = parse_ground_truth_diagnoses(row[COL_TRUE])
    y_pred = parse_model_predicted_diagnoses(row[COL_PRED])

    # If no ground truth, skip
    if not y_true:
        continue

    # 2. Analyze Matches
    # We map which TRUE diagnoses were found in the PRED list
    found_indices = set()
    first_match_rank = None  # For MRR

    # Iterate through predictions (Order matters for Rank!)
    for rank_idx, pred_item in enumerate(y_pred):
        current_rank = rank_idx + 1  # 1-based rank

        # Check against ALL true items
        is_this_pred_correct = False

        for true_idx, true_item in enumerate(y_true):
            # THE HYBRID CHECK
            if evaluator.check_match(true_item, pred_item):
                is_this_pred_correct = True
                found_indices.add(true_idx)

        # If this prediction was a match, and it's the first one we've seen...
        if is_this_pred_correct and first_match_rank is None:
            first_match_rank = current_rank

    # 3. Calculate Metrics

    # Hybrid Recall@5: % of true diagnoses found
    recall_score = len(found_indices) / len(y_true)

    # Hybrid Hit Rate: Did we find at least one?
    hit_rate = 1.0 if len(found_indices) > 0 else 0.0

    # Hybrid MRR: 1 / Rank of first match
    mrr_score = (1 / first_match_rank) if first_match_rank else 0.0

    # Hybrid Top-1: Did the very first prediction match *any* truth?
    # We can check if Rank 1 was the first match
    top1_score = 1.0 if first_match_rank == 1 else 0.0

    results.append({
        "case_id": row['case_id'],
        "y_true": y_true,
        "y_pred": y_pred,
        "hybrid_top1": top1_score,
        "hybrid_hit_rate": hit_rate,
        "hybrid_recall": recall_score,
        "hybrid_mrr": mrr_score
    })
    scored_rows.append(index)

# Results are aligned to their source rows by index label, so a case skipped for
# missing ground truth gets NaN metrics instead of shifting later rows onto the wrong case.
results_df = pd.DataFrame(results, index=scored_rows)
final_df = pd.concat([model_diagnoses, results_df], axis=1)

print(f"Done! Made {evaluator.llm_calls} calls to LLM.")

Starting evaluation for claude-opus-4-5-20251101...


100%|██████████| 30/30 [15:19<00:00, 30.65s/it]

Done! Made 228 calls to LLM.





In [17]:
# 1. Aggregate Statistics: mean of each per-case metric over the model's scored cases
stats = {
    "Metric": ["Top-1 Accuracy", "Top-5 Accuracy", "Recall@5", "MRR"],
    "Score": [
        results_df['hybrid_top1'].mean(),
        results_df['hybrid_hit_rate'].mean(),
        results_df['hybrid_recall'].mean(),
        results_df['hybrid_mrr'].mean()
    ]
}
stats_df = pd.DataFrame(stats)

# Display nicely formatted percentages.
# The Styler is not the last expression of the cell, so it must be passed to
# display() (provided by the IPython kernel) to be rendered
print("\n=== FINAL DIAGNOSTIC PERFORMANCE (Mean Scores) ===")
display(stats_df.style.format({"Score": "{:.2%}"}))

# Save stats to CSV
stats_df.to_csv(f"{model}_diagnostic_evaluation_summary.csv", index=False)
print(f"\nSaved summary statistics to '{model}_diagnostic_evaluation_summary.csv'")


=== FINAL DIAGNOSTIC PERFORMANCE (Mean Scores) ===

Saved summary statistics to 'claude-opus-4-5-20251101_diagnostic_evaluation_summary.csv'
