# 1. Generate model thought traces for evaluation of diagnostic reasoning of 4 frontier LLMs (n = 30)

## Part A. Load in combined dataset

In [3]:
# Load the combined (literature + synthetic) dataset
import json
import pandas as pd

# Location of the combined (literature + synthetic) case file, relative to this notebook
dataset_path = "../../datasets/combined/combined_jama.json"

# Decode as UTF-8 explicitly: without `encoding`, open() uses the platform's locale encoding,
# which can mis-decode non-ASCII characters (e.g. typographic quotation marks) in the cases
with open(dataset_path, "r", encoding="utf-8") as f:
    combined = json.load(f)  # List of case records; sampled from directly in Part B

# Tabular view of the same records for a quick visual check
dataset = pd.DataFrame(combined)
dataset.head()

Unnamed: 0,case_id,vignette,diagnosis,source,difficulty,diagnosis_dsm,diagnosis_icd,reasoning
0,2,A black male in his early 20s with a remote hi...,Catatonia,case_reports_in_psychiatry,,,,
1,4,A 12-year-old male with no specific birth or m...,Anorexia nervosa with focal cortical dysplasia,case_reports_in_psychiatry,,,,
2,13,We report on a 35-year-old female with a histo...,Pseudocyesis,case_reports_in_psychiatry,,,,
3,14,The patient is a 58-year-old woman with a psyc...,Illness anxiety disorder (IAD),case_reports_in_psychiatry,,,,
4,15,The patient is a 20-year-old single man. He is...,Social anxiety disorder (SAD) and major depres...,case_reports_in_psychiatry,,,,


## Part B. Create dataset subset for diagnostic reasoning analysis

In [4]:
# Randomly pick 30 cases from the combined dataset for the diagnostic reasoning analysis
# and the comparison against human raters
import random

# Case IDs to leave out of the draw (already seen by clinicians in the previous round of evaluation)
excluded_case_ids = [
    48, 81, 90, 142, 166, 168, 176, 177, 44, 43,
    113, 114, 131, 132, 152, 156, 21, 27, 50, 89,
    110, 123, 153, 175, 1001, 1002, 1010, 1011, 1014, 1019,
]

# Seed the global RNG so the same 30 cases are drawn on every run
random.seed(1000)

# Keep only the cases that were not part of the earlier evaluation round
filtered_combined = []
for case in combined:
    if case['case_id'] in excluded_case_ids:
        continue
    filtered_combined.append(case)

# Sample without replacement from the remaining cases
reasoning_subset = random.sample(filtered_combined, 30)

In [5]:
# Write the sampled cases to an Excel workbook; Excel is used so that Unicode characters
# such as quotation marks are preserved
reasoning_output_path = "../../datasets/reasoning/jama_reasoning_subset.xlsx"

# From here on `reasoning_subset` is a DataFrame (it was a list of case dicts until now)
reasoning_subset = pd.DataFrame(reasoning_subset)
reasoning_subset.to_excel(reasoning_output_path, index=False)

print(f"Exported reasoning subset to {reasoning_output_path}.")

Exported reasoning subset to ../../datasets/reasoning/jama_reasoning_subset.xlsx.


## Part C. Set up LLM APIs

### 0. Set up prompts

In [10]:
# Load the system instructions and the user prompt that are passed to every model call below.
# UTF-8 is requested explicitly so the prompt text is decoded the same way on every platform
# (open() otherwise falls back to the locale's default encoding)
with open("../prompts/system_prompt.txt", encoding="utf-8") as f:
    system_prompt = f.read()

with open("../prompts/user_prompt.txt", encoding="utf-8") as f:
    user_prompt = f.read()

### 1. Google Gemini 3 Pro

In [5]:
from google import genai
from google.genai import types
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
load_dotenv()

# Initialize the GenAI client. No key is passed explicitly, so the SDK presumably reads its
# credentials from the environment populated above - TODO confirm the expected variable name
google_client = genai.Client()

True

In [None]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str, temperature: float) -> tuple:
    """Query a Gemini model for a differential diagnosis and collect its thought summary.

    Note: this notebook defines a function with this name once per provider; running this
    cell replaces whichever definition was active before.

    Args:
        client: Google GenAI client exposing `models.generate_content`.
        model: Model identifier; must start with "gemini".
        system_prompt: System instructions sent with the request.
        user_prompt: User prompt; the vignette is appended inside <vignette> tags.
        vignette: Clinical case text to diagnose.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        A `(reasoning, answer)` tuple. `reasoning` is the text of all thought parts joined by
        blank lines; `answer` is the text of all non-thought parts concatenated. Either
        element is None when the response contains no non-empty part of that kind.

    Raises:
        ValueError: If `model` is not a Gemini model.
    """
    # Fail early and clearly instead of hitting an unbound local at the return statement
    if not model.startswith("gemini"):
        raise ValueError(f"Unsupported model for the Gemini code path: {model!r}")

    # Google Gemini: Make API call and create response object
    response = client.models.generate_content(
        model=model,
        contents=user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>",  # User prompt with inserted vignette
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                thinking_level="high",  # Use thinking_level for Gemini 3, not thinking_budget since it may result in subpar performance
                include_thoughts=True  # Include thought summaries in parts/thought within `response` parameters
            ),
            system_instruction=system_prompt,  # System prompt
            temperature=temperature  # Model temperature
        ),
    )

    # Collect every part rather than keeping only the last one of each kind
    thought_chunks = []
    answer_chunks = []
    for part in response.parts or []:  # `parts` can be empty/None when nothing was generated
        if not part.text:
            continue
        if part.thought:
            thought_chunks.append(part.text)  # Thought summary
        else:
            answer_chunks.append(part.text)  # Differential diagnosis list

    # None (not an unbound name) marks a missing piece so the missing-value check can flag it
    reasoning = "\n\n".join(thought_chunks) if thought_chunks else None
    answer = "".join(answer_chunks) if answer_chunks else None

    return reasoning, answer

In [7]:
# Run Gemini over every sampled case and store the thought summary and diagnosis it returns
from tqdm import tqdm

model = "gemini-3-pro-preview"

# Progress bar over the rows of the reasoning subset
pbar = tqdm(reasoning_subset.iterrows(), total=reasoning_subset.shape[0])

for index, row in pbar:
    # Show which case is currently being processed
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']})")

    # One API call per vignette
    reasoning, answer = generate_diagnostic_reasoning(
        client=google_client,
        model=model,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        vignette=row["vignette"],
        temperature=1,  # Google advises keeping temperature at 1 for Gemini 3 to avoid messing with reasoning behavior
    )

    # Write the outputs back onto the matching row (columns are created on first assignment)
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']}).")

Generating diagnostic reasoning trace 2 out of 30 (case 62):   3%|▎         | 1/30 [00:47<22:58, 47.54s/it] 

Completed case 0 out of 30 (case 181).


Generating diagnostic reasoning trace 3 out of 30 (case 169):   7%|▋         | 2/30 [01:39<23:19, 49.98s/it]

Completed case 1 out of 30 (case 62).


Generating diagnostic reasoning trace 4 out of 30 (case 155):  10%|█         | 3/30 [02:43<25:24, 56.48s/it]

Completed case 2 out of 30 (case 169).


Generating diagnostic reasoning trace 5 out of 30 (case 32):  13%|█▎        | 4/30 [04:05<28:49, 66.52s/it] 

Completed case 3 out of 30 (case 155).


Generating diagnostic reasoning trace 6 out of 30 (case 1013):  17%|█▋        | 5/30 [04:50<24:29, 58.78s/it]

Completed case 4 out of 30 (case 32).


Generating diagnostic reasoning trace 7 out of 30 (case 94):  20%|██        | 6/30 [06:01<25:11, 62.99s/it]  

Completed case 5 out of 30 (case 1013).


Generating diagnostic reasoning trace 8 out of 30 (case 1032):  23%|██▎       | 7/30 [07:58<30:52, 80.53s/it]

Completed case 6 out of 30 (case 94).


Generating diagnostic reasoning trace 9 out of 30 (case 1003):  27%|██▋       | 8/30 [09:31<30:58, 84.49s/it]

Completed case 7 out of 30 (case 1032).


Generating diagnostic reasoning trace 10 out of 30 (case 80):  30%|███       | 9/30 [11:01<30:12, 86.31s/it] 

Completed case 8 out of 30 (case 1003).


Generating diagnostic reasoning trace 11 out of 30 (case 109):  33%|███▎      | 10/30 [12:14<27:25, 82.25s/it]

Completed case 9 out of 30 (case 80).


Generating diagnostic reasoning trace 12 out of 30 (case 118):  37%|███▋      | 11/30 [13:23<24:46, 78.23s/it]

Completed case 10 out of 30 (case 109).


Generating diagnostic reasoning trace 13 out of 30 (case 159):  40%|████      | 12/30 [14:27<22:11, 73.96s/it]

Completed case 11 out of 30 (case 118).


Generating diagnostic reasoning trace 14 out of 30 (case 1021):  43%|████▎     | 13/30 [15:52<21:51, 77.14s/it]

Completed case 12 out of 30 (case 159).


Generating diagnostic reasoning trace 15 out of 30 (case 104):  47%|████▋     | 14/30 [17:01<19:56, 74.78s/it] 

Completed case 13 out of 30 (case 1021).


Generating diagnostic reasoning trace 16 out of 30 (case 1049):  50%|█████     | 15/30 [18:02<17:37, 70.49s/it]

Completed case 14 out of 30 (case 104).


Generating diagnostic reasoning trace 17 out of 30 (case 115):  53%|█████▎    | 16/30 [19:09<16:11, 69.37s/it] 

Completed case 15 out of 30 (case 1049).


Generating diagnostic reasoning trace 18 out of 30 (case 1009):  57%|█████▋    | 17/30 [19:53<13:23, 61.83s/it]

Completed case 16 out of 30 (case 115).


Generating diagnostic reasoning trace 19 out of 30 (case 99):  60%|██████    | 18/30 [20:55<12:22, 61.90s/it]  

Completed case 17 out of 30 (case 1009).


Generating diagnostic reasoning trace 20 out of 30 (case 23):  63%|██████▎   | 19/30 [21:38<10:18, 56.27s/it]

Completed case 18 out of 30 (case 99).


Generating diagnostic reasoning trace 21 out of 30 (case 1020):  67%|██████▋   | 20/30 [22:50<10:09, 60.95s/it]

Completed case 19 out of 30 (case 23).


Generating diagnostic reasoning trace 22 out of 30 (case 85):  70%|███████   | 21/30 [24:46<11:37, 77.51s/it]  

Completed case 20 out of 30 (case 1020).


Generating diagnostic reasoning trace 23 out of 30 (case 1012):  73%|███████▎  | 22/30 [26:58<12:31, 93.93s/it]

Completed case 21 out of 30 (case 85).


Generating diagnostic reasoning trace 24 out of 30 (case 178):  77%|███████▋  | 23/30 [27:46<09:21, 80.18s/it] 

Completed case 22 out of 30 (case 1012).


Generating diagnostic reasoning trace 25 out of 30 (case 1047):  80%|████████  | 24/30 [28:30<06:55, 69.32s/it]

Completed case 23 out of 30 (case 178).


Generating diagnostic reasoning trace 26 out of 30 (case 17):  83%|████████▎ | 25/30 [29:30<05:31, 66.32s/it]  

Completed case 24 out of 30 (case 1047).


Generating diagnostic reasoning trace 27 out of 30 (case 1033):  87%|████████▋ | 26/30 [30:30<04:17, 64.46s/it]

Completed case 25 out of 30 (case 17).


Generating diagnostic reasoning trace 28 out of 30 (case 1056):  90%|█████████ | 27/30 [30:58<02:41, 53.72s/it]

Completed case 26 out of 30 (case 1033).


Generating diagnostic reasoning trace 29 out of 30 (case 108):  93%|█████████▎| 28/30 [31:59<01:51, 55.70s/it] 

Completed case 27 out of 30 (case 1056).


Generating diagnostic reasoning trace 30 out of 30 (case 97):  97%|█████████▋| 29/30 [33:03<00:58, 58.39s/it] 

Completed case 28 out of 30 (case 108).


Generating diagnostic reasoning trace 30 out of 30 (case 97): 100%|██████████| 30/30 [34:18<00:00, 68.63s/it]

Completed case 29 out of 30 (case 97).





In [None]:
# Flag cases whose thought summary or diagnosis is null or an empty string, since a model
# response can come back incomplete
thoughts_missing = reasoning_subset["model_thoughts"].isnull() | (reasoning_subset["model_thoughts"] == "")
diagnosis_missing = reasoning_subset["model_diagnosis"].isnull() | (reasoning_subset["model_diagnosis"] == "")
missing_values = reasoning_subset[thoughts_missing | diagnosis_missing]

if missing_values.empty:
    print("No missing or empty values found in the results.")
else:
    print("Missing or empty values found in the following cases:")
    print(missing_values[["case_id", "model_thoughts", "model_diagnosis"]])


In [8]:
# Timestamp (to the second) that goes into the output file name, so results from separate
# runs are written to separate files
import datetime

run_timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Write the annotated subset to JSON, one record per case
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{run_timestamp}.json"
reasoning_subset.to_json(results_path, orient="records", indent=2)

print(f"Reasoning samples for {model} saved to {results_path}.")

Reasoning samples for gemini-3-pro-preview saved to ../../results/evaluate_diagnostic_reasoning/reasoning_samples_gemini-3-pro-preview_20251211_001432.json.


### 2. OpenAI GPT 5.2

In [21]:
# Reload the sampled cases from the exported spreadsheet so this section starts from a frame
# that does not carry the previous model's `model_thoughts` / `model_diagnosis` values
reasoning_subset = pd.read_excel(reasoning_output_path)

In [22]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Pull variables from the local .env file into the process environment
load_dotenv()

# Build the OpenAI client from the key stored under OPENAI_API_KEY
openai_api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_api_key)

In [46]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str) -> tuple:
    """Query an OpenAI reasoning model for a differential diagnosis and its reasoning summary.

    Note: this notebook defines a function with this name once per provider; running this
    cell replaces whichever definition was active before.

    Args:
        client: OpenAI client exposing `responses.create`.
        model: Model identifier; must start with "gpt".
        system_prompt: System instructions, sent as the request's `instructions`.
        user_prompt: User prompt; the vignette is appended inside <vignette> tags.
        vignette: Clinical case text to diagnose.

    Returns:
        A `(reasoning, answer)` tuple. `reasoning` is the text of all reasoning-summary blocks
        joined by blank lines; `answer` is the concatenated text of the message output.
        Either element is None when the response contains no text of that kind.

    Raises:
        ValueError: If `model` is not a GPT model.
    """
    # Fail early and clearly instead of hitting an unbound `response` further down
    if not model.startswith("gpt"):
        raise ValueError(f"Unsupported model for the OpenAI code path: {model!r}")

    # OpenAI: Make API call and create response object
    response = client.responses.create(
        model=model,  # gpt-5.2-pro is way too expensive; use gpt-5.2
        instructions=system_prompt,  # System prompt, so this model gets the same instructions as the others
        reasoning={
            "effort": "xhigh",  # Favors even more complete reasoning
            "summary": "detailed"  # Give as much detail as possible in thinking block
        },
        text={
            "verbosity": "low"  # To keep the model on task for diagnosis
        },
        input=[
            {
                "role": "user",
                "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"
            }
        ]
    )

    # Classify output items by their type instead of by position in the output array, so a
    # response without a reasoning item (or with extra items) is still parsed correctly
    summary_chunks = []
    answer_chunks = []
    for item in response.output or []:
        item_type = getattr(item, "type", None)
        if item_type == "reasoning":
            # Thought summary: one or more summary blocks per reasoning item
            summary_chunks.extend(block.text for block in (item.summary or []) if block.text)
        elif item_type == "message":
            # Differential diagnosis list; parts without text (e.g. refusals) are skipped
            answer_chunks.extend(
                part.text for part in (item.content or []) if getattr(part, "text", None)
            )

    # None marks a missing piece so the missing-value check can flag the case for a rerun
    reasoning = "\n\n".join(summary_chunks) if summary_chunks else None
    answer = "".join(answer_chunks) if answer_chunks else None

    return reasoning, answer

In [32]:
# Run GPT over the sampled cases and store the reasoning summary and diagnosis it returns
from tqdm import tqdm

model = "gpt-5.2"

# Row offset to resume from after an interrupted run; 0 processes every case.
# Note from an earlier run: rows 13 and 24 were temporarily skipped because the model was not
# returning reasoning properly for them; that skip logic is no longer active.
start_row_index = 0

# Progress bar over the rows that still need to be processed
pbar = tqdm(reasoning_subset.iloc[start_row_index:].iterrows(), total=len(reasoning_subset) - start_row_index)

for index, row in pbar:
    # Show which case is currently being processed
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']})")

    # One API call per vignette. No temperature is passed: per the original note it is not
    # supported with reasoning effort set to high
    reasoning, answer = generate_diagnostic_reasoning(
        client=openai_client,
        model=model,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        vignette=row["vignette"],
    )

    # Write the outputs back onto the matching row
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']}).")

Generating diagnostic reasoning trace 27 out of 30 (case 1033):  20%|██        | 1/5 [02:28<09:53, 148.46s/it]

Completed case 26 out of 30 (case 17).


Generating diagnostic reasoning trace 28 out of 30 (case 1056):  40%|████      | 2/5 [03:02<04:03, 81.21s/it] 

Completed case 27 out of 30 (case 1033).


Generating diagnostic reasoning trace 29 out of 30 (case 108):  60%|██████    | 3/5 [05:48<03:59, 119.92s/it] 

Completed case 28 out of 30 (case 1056).


Generating diagnostic reasoning trace 30 out of 30 (case 97):  80%|████████  | 4/5 [10:16<02:58, 178.48s/it] 

Completed case 29 out of 30 (case 108).


Generating diagnostic reasoning trace 30 out of 30 (case 97): 100%|██████████| 5/5 [17:03<00:00, 204.64s/it]

Completed case 30 out of 30 (case 97).





In [71]:
# Check for missing or empty values and rerun until none remain
max_iterations = 10  # Prevent infinite loop
iteration = 0


def _find_incomplete_rows(results):
    """Return the rows of `results` whose thoughts or diagnosis are null or an empty string."""
    return results[
        (results["model_thoughts"].isnull()) |
        (results["model_thoughts"] == "") |
        (results["model_diagnosis"].isnull()) |
        (results["model_diagnosis"] == "")
    ]


while iteration < max_iterations:
    # Check for missing or empty values
    missing_values = _find_incomplete_rows(reasoning_subset)

    if missing_values.empty:
        print("No missing or empty values found in the results.")
        break

    iteration += 1
    print(f"\n=== Iteration {iteration} ===")
    print(f"Missing or empty values found in {len(missing_values)} cases:")
    print(missing_values[["case_id", "model_thoughts", "model_diagnosis"]])

    # Get all indices with missing values (these are index LABELS, not positions)
    missing_indices = missing_values.index.tolist()
    print(f"Rerunning {len(missing_indices)} cases with missing data: {missing_indices}")

    for index_to_rerun in missing_indices:
        # Look rows up with .loc, the label-based accessor, to match both the labels collected
        # above and the .loc writes below; .iloc would only agree for a default RangeIndex
        case_row = reasoning_subset.loc[index_to_rerun]
        print(f"\nRerunning case {index_to_rerun} (case_id: {case_row['case_id']})")

        try:
            reasoning, answer = generate_diagnostic_reasoning(openai_client,
                                                            model,
                                                            system_prompt,
                                                            user_prompt,
                                                            case_row["vignette"],
                                                        )

            # Both fields are replaced together so thoughts and diagnosis come from the same call
            reasoning_subset.loc[index_to_rerun, "model_thoughts"] = reasoning
            reasoning_subset.loc[index_to_rerun, "model_diagnosis"] = answer
            print(f"Successfully completed case {index_to_rerun}")

        except Exception as e:
            # Best effort: report the failure and move on; the case is retried next iteration
            print(f"Error processing case {index_to_rerun}: {str(e)}")
            continue
else:
    # Reached only when the iteration budget ran out without a clean pass: report what is
    # still incomplete instead of ending on a bare "Completed" message
    still_missing = _find_incomplete_rows(reasoning_subset)
    if not still_missing.empty:
        print(f"\nStopped after {max_iterations} iterations with {len(still_missing)} case(s) still incomplete: {still_missing['case_id'].tolist()}")

print(f"\nCompleted after {iteration} iteration(s).")



=== Iteration 1 ===
Missing or empty values found in 1 cases:
    case_id model_thoughts                                    model_diagnosis
23      178           None  1. Sedative-, Hypnotic-, or Anxiolytic Use Dis...
Rerunning 1 cases with missing data: [23]

Rerunning case 23 (case_id: 178)
Successfully completed case 23

=== Iteration 2 ===
Missing or empty values found in 1 cases:
    case_id model_thoughts                                    model_diagnosis
23      178           None  1. Sedative-, hypnotic-, or anxiolytic use dis...
Rerunning 1 cases with missing data: [23]

Rerunning case 23 (case_id: 178)
Successfully completed case 23

=== Iteration 3 ===
Missing or empty values found in 1 cases:
    case_id model_thoughts                                    model_diagnosis
23      178           None  1. Sedative-, hypnotic-, or anxiolytic use dis...
Rerunning 1 cases with missing data: [23]

Rerunning case 23 (case_id: 178)
Successfully completed case 23

=== Iteration 4 ===
M

In [65]:
# Timestamp (to the second) that goes into the output file name, so results from separate
# runs are written to separate files
import datetime

run_timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Write the annotated subset to JSON, one record per case
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{run_timestamp}.json"
reasoning_subset.to_json(results_path, orient="records", indent=2)

print(f"Reasoning samples for {model} saved to {results_path}.")

Reasoning samples for gpt-5.2 saved to ../../results/evaluate_diagnostic_reasoning/reasoning_samples_gpt-5.2_20251212_035538.json.


### 3. Anthropic Claude Opus 4.5

In [16]:
# Reload the sampled cases from the exported spreadsheet so this section starts from a frame
# that does not carry the previous model's `model_thoughts` / `model_diagnosis` values
reasoning_subset = pd.read_excel(reasoning_output_path)

In [17]:
import anthropic
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
load_dotenv()

# Initialize the Anthropic client. No key is passed explicitly, so the SDK presumably reads
# its credentials from the environment populated above - TODO confirm the expected variable name
anthropic_client = anthropic.Anthropic()

In [18]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str) -> tuple:
    """Query a Claude model with extended thinking for a differential diagnosis and its thoughts.

    Note: this notebook defines a function with this name once per provider; running this
    cell replaces whichever definition was active before.

    Args:
        client: Anthropic client exposing `messages.create`.
        model: Model identifier; must start with "claude".
        system_prompt: System instructions sent with the request.
        user_prompt: User prompt; the vignette is appended inside <vignette> tags.
        vignette: Clinical case text to diagnose.

    Returns:
        A `(reasoning, answer)` tuple. `reasoning` is the text of all thinking blocks joined
        by blank lines; `answer` is the concatenated text of all text blocks. Either element
        is None when the response has no block of that kind. On a refusal the tuple is
        `("N/A", "Model refused to answer the prompt.")`.

    Raises:
        ValueError: If `model` is not a Claude model.
    """
    # Fail early and clearly instead of hitting an unbound local at the return statement
    if not model.startswith("claude"):
        raise ValueError(f"Unsupported model for the Anthropic code path: {model!r}")

    # Anthropic Claude: Make API call and create response object
    # NOTE(review): budget_tokens=19000 out of max_tokens=20000 may leave as little as ~1000
    # tokens for the visible answer - confirm that diagnoses are not being truncated
    response = client.messages.create(
        model=model,
        max_tokens=20000,  # Max output for Claude Opus 4.5 is 64k but >20k requires streaming
        system=system_prompt,
        # Extended thinking mode is not compatible with temperature, top_p, or top_k sampling
        thinking={
            "type": "enabled",
            "budget_tokens": 19000  # Allocate tokens for thinking - model may not use entire budget
        },
        messages=[
            {
                "role": "user",
                "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"
            }
        ]
    )

    # Handle model refusal to answer
    if response.stop_reason == "refusal":
        return "N/A", "Model refused to answer the prompt."

    # Collect every block rather than keeping only the last one of each kind
    thinking_chunks = []
    answer_chunks = []
    for block in response.content:
        if block.type == "text":  # Differential diagnosis block
            if block.text:
                answer_chunks.append(block.text)
        elif block.type == "thinking":  # Summarized thinking block
            if block.thinking:
                thinking_chunks.append(block.thinking)
        elif block.type == "redacted_thinking":
            # Redacted blocks have no readable `thinking` text to extract, so only report them
            print(f"Redacted thinking detected for \"{vignette[:30]}...\"")

    # None marks a missing piece so the missing-value check can flag the case
    reasoning = "\n\n".join(thinking_chunks) if thinking_chunks else None
    answer = "".join(answer_chunks) if answer_chunks else None

    return reasoning, answer

In [19]:
# Run Claude over every sampled case and store the thinking text and diagnosis it returns
from tqdm import tqdm

model = "claude-opus-4-5-20251101"

# Progress bar over the rows of the reasoning subset
pbar = tqdm(reasoning_subset.iterrows(), total=reasoning_subset.shape[0])

for index, row in pbar:
    # Show which case is currently being processed
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']})")

    # One API call per vignette. No temperature is passed: it is not compatible with
    # extended thinking mode
    reasoning, answer = generate_diagnostic_reasoning(
        client=anthropic_client,
        model=model,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        vignette=row["vignette"],
    )

    # Write the outputs back onto the matching row
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']}).")

Generating diagnostic reasoning trace 2 out of 30 (case 62):   3%|▎         | 1/30 [00:53<25:57, 53.71s/it] 

Completed case 1 out of 30 (case 181).


Generating diagnostic reasoning trace 3 out of 30 (case 169):   7%|▋         | 2/30 [01:18<17:07, 36.69s/it]

Completed case 2 out of 30 (case 62).


Generating diagnostic reasoning trace 4 out of 30 (case 155):  10%|█         | 3/30 [02:09<19:30, 43.36s/it]

Completed case 3 out of 30 (case 169).


Generating diagnostic reasoning trace 5 out of 30 (case 32):  13%|█▎        | 4/30 [02:44<17:17, 39.91s/it] 

Completed case 4 out of 30 (case 155).


Generating diagnostic reasoning trace 6 out of 30 (case 1013):  17%|█▋        | 5/30 [03:19<15:54, 38.18s/it]

Completed case 5 out of 30 (case 32).


Generating diagnostic reasoning trace 7 out of 30 (case 94):  20%|██        | 6/30 [04:08<16:40, 41.70s/it]  

Completed case 6 out of 30 (case 1013).


Generating diagnostic reasoning trace 8 out of 30 (case 1032):  23%|██▎       | 7/30 [04:47<15:45, 41.09s/it]

Completed case 7 out of 30 (case 94).


Generating diagnostic reasoning trace 9 out of 30 (case 1003):  27%|██▋       | 8/30 [05:40<16:21, 44.61s/it]

Completed case 8 out of 30 (case 1032).


Generating diagnostic reasoning trace 10 out of 30 (case 80):  30%|███       | 9/30 [06:33<16:35, 47.39s/it] 

Completed case 9 out of 30 (case 1003).


Generating diagnostic reasoning trace 11 out of 30 (case 109):  33%|███▎      | 10/30 [07:58<19:42, 59.13s/it]

Completed case 10 out of 30 (case 80).


Generating diagnostic reasoning trace 12 out of 30 (case 118):  37%|███▋      | 11/30 [09:11<20:03, 63.35s/it]

Completed case 11 out of 30 (case 109).


Generating diagnostic reasoning trace 13 out of 30 (case 159):  40%|████      | 12/30 [10:02<17:48, 59.36s/it]

Completed case 12 out of 30 (case 118).


Generating diagnostic reasoning trace 14 out of 30 (case 1021):  43%|████▎     | 13/30 [11:24<18:46, 66.28s/it]

Completed case 13 out of 30 (case 159).


Generating diagnostic reasoning trace 15 out of 30 (case 104):  47%|████▋     | 14/30 [13:03<20:19, 76.22s/it] 

Completed case 14 out of 30 (case 1021).


Generating diagnostic reasoning trace 16 out of 30 (case 1049):  50%|█████     | 15/30 [13:35<15:41, 62.75s/it]

Completed case 15 out of 30 (case 104).


Generating diagnostic reasoning trace 17 out of 30 (case 115):  53%|█████▎    | 16/30 [14:17<13:11, 56.56s/it] 

Completed case 16 out of 30 (case 1049).


Generating diagnostic reasoning trace 18 out of 30 (case 1009):  57%|█████▋    | 17/30 [15:06<11:44, 54.22s/it]

Completed case 17 out of 30 (case 115).


Generating diagnostic reasoning trace 19 out of 30 (case 99):  60%|██████    | 18/30 [15:34<09:19, 46.62s/it]  

Completed case 18 out of 30 (case 1009).


Generating diagnostic reasoning trace 20 out of 30 (case 23):  63%|██████▎   | 19/30 [16:22<08:34, 46.81s/it]

Completed case 19 out of 30 (case 99).


Generating diagnostic reasoning trace 21 out of 30 (case 1020):  67%|██████▋   | 20/30 [16:59<07:19, 44.00s/it]

Completed case 20 out of 30 (case 23).


Generating diagnostic reasoning trace 22 out of 30 (case 85):  70%|███████   | 21/30 [17:38<06:23, 42.59s/it]  

Completed case 21 out of 30 (case 1020).


Generating diagnostic reasoning trace 23 out of 30 (case 1012):  73%|███████▎  | 22/30 [18:16<05:29, 41.18s/it]

Completed case 22 out of 30 (case 85).


Generating diagnostic reasoning trace 24 out of 30 (case 178):  77%|███████▋  | 23/30 [19:01<04:56, 42.29s/it] 

Completed case 23 out of 30 (case 1012).


Generating diagnostic reasoning trace 25 out of 30 (case 1047):  80%|████████  | 24/30 [19:39<04:05, 40.99s/it]

Completed case 24 out of 30 (case 178).


Generating diagnostic reasoning trace 26 out of 30 (case 17):  83%|████████▎ | 25/30 [20:43<03:58, 47.75s/it]  

Completed case 25 out of 30 (case 1047).


Generating diagnostic reasoning trace 27 out of 30 (case 1033):  87%|████████▋ | 26/30 [21:22<03:00, 45.22s/it]

Completed case 26 out of 30 (case 17).


Generating diagnostic reasoning trace 28 out of 30 (case 1056):  90%|█████████ | 27/30 [21:50<02:00, 40.18s/it]

Completed case 27 out of 30 (case 1033).


Generating diagnostic reasoning trace 29 out of 30 (case 108):  93%|█████████▎| 28/30 [22:25<01:17, 38.52s/it] 

Completed case 28 out of 30 (case 1056).


Generating diagnostic reasoning trace 30 out of 30 (case 97):  97%|█████████▋| 29/30 [24:09<00:58, 58.09s/it] 

Completed case 29 out of 30 (case 108).


Generating diagnostic reasoning trace 30 out of 30 (case 97): 100%|██████████| 30/30 [25:03<00:00, 50.12s/it]

Completed case 30 out of 30 (case 97).





In [None]:
# Flag cases whose thinking text or diagnosis is null or an empty string, since a model
# response can come back incomplete
thoughts_missing = reasoning_subset["model_thoughts"].isnull() | (reasoning_subset["model_thoughts"] == "")
diagnosis_missing = reasoning_subset["model_diagnosis"].isnull() | (reasoning_subset["model_diagnosis"] == "")
missing_values = reasoning_subset[thoughts_missing | diagnosis_missing]

if missing_values.empty:
    print("No missing or empty values found in the results.")
else:
    print("Missing or empty values found in the following cases:")
    print(missing_values[["case_id", "model_thoughts", "model_diagnosis"]])


In [20]:
# Timestamp (to the second) that goes into the output file name, so results from separate
# runs are written to separate files
import datetime

run_timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Write the annotated subset to JSON, one record per case
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{run_timestamp}.json"
reasoning_subset.to_json(results_path, orient="records", indent=2)

print(f"Reasoning samples for {model} saved to {results_path}.")

Reasoning samples for claude-opus-4-5-20251101 saved to ../../results/evaluate_diagnostic_reasoning/reasoning_samples_claude-opus-4-5-20251101_20251212_010234.json.


### 4. DeepSeek-V3.2

In [11]:
# Reload the sampled cases from the exported spreadsheet so this section starts from a frame
# that does not carry the previous model's `model_thoughts` / `model_diagnosis` values
reasoning_subset = pd.read_excel(reasoning_output_path)

In [12]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Pull variables from the local .env file into the process environment
load_dotenv()

# The OpenAI SDK client is reused here, pointed at DeepSeek's base URL and authenticated
# with the key stored under DEEPSEEK_API_KEY
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
deepseek_client = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")

In [13]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str, temperature: float) -> tuple:
    """Query a DeepSeek reasoning model for a differential diagnosis and its reasoning trace.

    Note: this notebook defines a function with this name once per provider; running this
    cell replaces whichever definition was active before.

    Args:
        client: OpenAI-SDK client (pointed at DeepSeek) exposing `chat.completions.create`.
        model: Model identifier; must start with "deepseek".
        system_prompt: System instructions, sent as the system message.
        user_prompt: User prompt; the vignette is appended inside <vignette> tags.
        vignette: Clinical case text to diagnose.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        A `(reasoning, answer)` tuple: the message's `reasoning_content` (None if the field
        is absent) and the message `content`. `(None, None)` if the response has no choices.

    Raises:
        ValueError: If `model` is not a DeepSeek model.
    """
    # Fail early and clearly instead of hitting an unbound local at the return statement
    if not model.startswith("deepseek"):
        raise ValueError(f"Unsupported model for the DeepSeek code path: {model!r}")

    # DeepSeek: Make API call and create response object
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"},
        ],
        temperature=temperature,
        stream=False
    )

    # Nothing generated: report both pieces as missing so the case can be flagged
    if not response.choices:
        return None, None

    message = response.choices[0].message

    # Extract the response content
    answer = message.content

    # Extract the thinking block; the field is a provider extension, so tolerate its absence
    reasoning = getattr(message, "reasoning_content", None)

    return reasoning, answer

In [14]:
# Run DeepSeek over every sampled case and store the reasoning trace and diagnosis it returns
from tqdm import tqdm

model = "deepseek-reasoner"  # Select latest reasoning model; in this case, DeepSeek-V3.2

# Progress bar over the rows of the reasoning subset
pbar = tqdm(reasoning_subset.iterrows(), total=reasoning_subset.shape[0])

for index, row in pbar:
    # Show which case is currently being processed
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']})")

    # One API call per vignette
    # NOTE(review): confirm that the reasoning model actually honours the temperature setting
    reasoning, answer = generate_diagnostic_reasoning(
        client=deepseek_client,
        model=model,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        vignette=row["vignette"],
        temperature=0,  # DeepSeek recommends temperature 0 for coding/math tasks where there is a correct answer
    )

    # Write the outputs back onto the matching row
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']}).")

Generating diagnostic reasoning trace 2 out of 30 (case 62):   3%|▎         | 1/30 [00:14<06:54, 14.28s/it] 

Completed case 1 out of 30 (case 181).


Generating diagnostic reasoning trace 3 out of 30 (case 169):   7%|▋         | 2/30 [00:37<09:07, 19.54s/it]

Completed case 2 out of 30 (case 62).


Generating diagnostic reasoning trace 4 out of 30 (case 155):  10%|█         | 3/30 [00:50<07:24, 16.46s/it]

Completed case 3 out of 30 (case 169).


Generating diagnostic reasoning trace 5 out of 30 (case 32):  13%|█▎        | 4/30 [03:29<31:36, 72.96s/it] 

Completed case 4 out of 30 (case 155).


Generating diagnostic reasoning trace 6 out of 30 (case 1013):  17%|█▋        | 5/30 [03:42<21:18, 51.16s/it]

Completed case 5 out of 30 (case 32).


Generating diagnostic reasoning trace 7 out of 30 (case 94):  20%|██        | 6/30 [04:21<18:51, 47.15s/it]  

Completed case 6 out of 30 (case 1013).


Generating diagnostic reasoning trace 8 out of 30 (case 1032):  23%|██▎       | 7/30 [04:34<13:46, 35.95s/it]

Completed case 7 out of 30 (case 94).


Generating diagnostic reasoning trace 9 out of 30 (case 1003):  27%|██▋       | 8/30 [04:49<10:44, 29.30s/it]

Completed case 8 out of 30 (case 1032).


Generating diagnostic reasoning trace 10 out of 30 (case 80):  30%|███       | 9/30 [06:42<19:25, 55.52s/it] 

Completed case 9 out of 30 (case 1003).


Generating diagnostic reasoning trace 11 out of 30 (case 109):  33%|███▎      | 10/30 [06:54<14:00, 42.02s/it]

Completed case 10 out of 30 (case 80).


Generating diagnostic reasoning trace 12 out of 30 (case 118):  37%|███▋      | 11/30 [11:07<33:45, 106.59s/it]

Completed case 11 out of 30 (case 109).


Generating diagnostic reasoning trace 13 out of 30 (case 159):  40%|████      | 12/30 [11:21<23:28, 78.26s/it] 

Completed case 12 out of 30 (case 118).


Generating diagnostic reasoning trace 14 out of 30 (case 1021):  43%|████▎     | 13/30 [11:32<16:24, 57.93s/it]

Completed case 13 out of 30 (case 159).


Generating diagnostic reasoning trace 15 out of 30 (case 104):  47%|████▋     | 14/30 [11:58<12:53, 48.35s/it] 

Completed case 14 out of 30 (case 1021).


Generating diagnostic reasoning trace 16 out of 30 (case 1049):  50%|█████     | 15/30 [12:32<10:59, 44.00s/it]

Completed case 15 out of 30 (case 104).


Generating diagnostic reasoning trace 17 out of 30 (case 115):  53%|█████▎    | 16/30 [12:44<08:00, 34.32s/it] 

Completed case 16 out of 30 (case 1049).


Generating diagnostic reasoning trace 18 out of 30 (case 1009):  57%|█████▋    | 17/30 [12:55<05:57, 27.51s/it]

Completed case 17 out of 30 (case 115).


Generating diagnostic reasoning trace 19 out of 30 (case 99):  60%|██████    | 18/30 [18:08<22:39, 113.29s/it]  

Completed case 18 out of 30 (case 1009).


Generating diagnostic reasoning trace 20 out of 30 (case 23):  63%|██████▎   | 19/30 [18:19<15:06, 82.42s/it] 

Completed case 19 out of 30 (case 99).


Generating diagnostic reasoning trace 21 out of 30 (case 1020):  67%|██████▋   | 20/30 [18:34<10:23, 62.35s/it]

Completed case 20 out of 30 (case 23).


Generating diagnostic reasoning trace 22 out of 30 (case 85):  70%|███████   | 21/30 [18:46<07:02, 46.95s/it]  

Completed case 21 out of 30 (case 1020).


Generating diagnostic reasoning trace 23 out of 30 (case 1012):  73%|███████▎  | 22/30 [18:59<04:54, 36.77s/it]

Completed case 22 out of 30 (case 85).


Generating diagnostic reasoning trace 24 out of 30 (case 178):  77%|███████▋  | 23/30 [19:13<03:30, 30.09s/it] 

Completed case 23 out of 30 (case 1012).


Generating diagnostic reasoning trace 25 out of 30 (case 1047):  80%|████████  | 24/30 [19:25<02:27, 24.63s/it]

Completed case 24 out of 30 (case 178).


Generating diagnostic reasoning trace 26 out of 30 (case 17):  83%|████████▎ | 25/30 [19:38<01:46, 21.30s/it]  

Completed case 25 out of 30 (case 1047).


Generating diagnostic reasoning trace 27 out of 30 (case 1033):  87%|████████▋ | 26/30 [19:50<01:13, 18.33s/it]

Completed case 26 out of 30 (case 17).


Generating diagnostic reasoning trace 28 out of 30 (case 1056):  90%|█████████ | 27/30 [20:01<00:48, 16.16s/it]

Completed case 27 out of 30 (case 1033).


Generating diagnostic reasoning trace 29 out of 30 (case 108):  93%|█████████▎| 28/30 [20:13<00:29, 14.92s/it] 

Completed case 28 out of 30 (case 1056).


Generating diagnostic reasoning trace 30 out of 30 (case 97):  97%|█████████▋| 29/30 [20:26<00:14, 14.26s/it] 

Completed case 29 out of 30 (case 108).


Generating diagnostic reasoning trace 30 out of 30 (case 97): 100%|██████████| 30/30 [20:39<00:00, 41.31s/it]

Completed case 30 out of 30 (case 97).





In [None]:
# Flag every case whose thought trace or diagnosis is null or an empty string,
# since the model can return incomplete outputs
output_columns = ["model_thoughts", "model_diagnosis"]
model_outputs = reasoning_subset[output_columns]
is_incomplete = (model_outputs.isna() | model_outputs.eq("")).any(axis=1)
missing_values = reasoning_subset[is_incomplete]

if missing_values.empty:
    print("No missing or empty values found in the results.")
else:
    print("Missing or empty values found in the following cases:")
    print(missing_values[["case_id", "model_thoughts", "model_diagnosis"]])


In [15]:
# Build a timestamp (YYYYMMDD_HHMMSS) for the output file name
import datetime

run_timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# Write the results as a list of JSON records, one per case, under a model- and time-specific name
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{run_timestamp}.json"
reasoning_subset.to_json(path_or_buf=results_path, orient="records", indent=2)

print(f"Reasoning samples for {model} saved to {results_path}.")

Reasoning samples for deepseek-reasoner saved to ../../results/evaluate_diagnostic_reasoning/reasoning_samples_deepseek-reasoner_20251212_003550.json.
