# 1. Generate model thought traces for evaluation of diagnostic reasoning of 4 frontier LLMs (n = 30)

## Part A. Load in combined dataset

In [1]:
# Load the combined (literature + synthetic) dataset
import json
import pandas as pd

dataset_path = "../../datasets/combined/combined_jama.json"

# Decode explicitly as UTF-8 instead of relying on the platform default encoding,
# which is not UTF-8 everywhere and can corrupt (or fail on) non-ASCII characters
# such as typographic quotation marks.
with open(dataset_path, "r", encoding="utf-8") as f:
    combined = json.load(f)  # Raw case records; later cells filter them by their 'case_id' key

# Tabular view of the same records, used here only for a quick preview
dataset = pd.DataFrame(combined)
dataset.head()

Unnamed: 0,case_id,vignette,diagnosis,source,difficulty,diagnosis_dsm,diagnosis_icd,reasoning
0,2,A black male in his early 20s with a remote hi...,Catatonia,case_reports_in_psychiatry,,,,
1,4,A 12-year-old male with no specific birth or m...,Anorexia nervosa with focal cortical dysplasia,case_reports_in_psychiatry,,,,
2,13,We report on a 35-year-old female with a histo...,Pseudocyesis,case_reports_in_psychiatry,,,,
3,14,The patient is a 58-year-old woman with a psyc...,Illness anxiety disorder (IAD),case_reports_in_psychiatry,,,,
4,15,The patient is a 20-year-old single man. He is...,Social anxiety disorder (SAD) and major depres...,case_reports_in_psychiatry,,,,


## Part B. Create dataset subset for diagnostic reasoning analysis

In [2]:
# Select the 30 cases used for the diagnostic reasoning analysis and the human comparison
import random

# Case IDs that clinicians already saw in the previous evaluation round; they must not be drawn again
excluded_case_ids = [48, 81, 90, 142, 166, 168, 176, 177, 44, 43,
            113, 114, 131, 132, 152, 156, 21, 27, 50, 89,
            110, 123, 153, 175, 1001, 1002, 1010, 1011, 1014, 1019]

# Drop the excluded cases while keeping the original record order, so the seeded draw below is stable
filtered_combined = list(
    filter(lambda case: case['case_id'] not in excluded_case_ids, combined)
)

random.seed(1000)  # Fixed seed: the same 30 cases are drawn on every run
reasoning_subset = random.sample(filtered_combined, k=30)

In [3]:
# Write the sampled cases to an Excel workbook, which preserves Unicode characters like quotation marks
reasoning_output_path = "../../datasets/reasoning/jama_reasoning_subset.xlsx"

# Convert the sampled case records into a DataFrame (one row per case) under the same name,
# because the later cells operate on `reasoning_subset` as a DataFrame
reasoning_subset = pd.DataFrame(reasoning_subset)
reasoning_subset.to_excel(reasoning_output_path, index=False)  # Leave the row index out of the sheet

print(f"Exported reasoning subset to {reasoning_output_path}.")

Exported reasoning subset to ../../datasets/reasoning/jama_reasoning_subset.xlsx.


## Part C. Set up LLM APIs

In [4]:
# Define system instructions and user prompt
# Both files are read explicitly as UTF-8 so the prompt text sent to the models does not
# depend on the platform's default encoding.
with open("../prompts/system_prompt.txt", encoding="utf-8") as f:
    system_prompt = f.read()

with open("../prompts/user_prompt.txt", encoding="utf-8") as f:
    user_prompt = f.read()

### 1. Google Gemini 3 Pro

In [None]:
from google import genai
from google.genai import types
from dotenv import load_dotenv

# Load environment variables (including the API key) from a local .env file, if one is present
load_dotenv()

# Initialize the GenAI client
# No key is passed explicitly here, so the client is expected to pick it up from the environment
google_client = genai.Client()

True

In [None]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str, temperature: float) -> tuple:
    """Query a Gemini model for one vignette and return its thought summary and answer.

    Args:
        client: Gemini client exposing `models.generate_content`.
        model: Model identifier; must start with "gemini".
        system_prompt: Text passed as the system instruction.
        user_prompt: Task prompt; the vignette is appended inside <vignette> tags.
        vignette: Case description to diagnose.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        A `(reasoning, answer)` tuple of strings. `reasoning` joins all thought-summary
        parts with blank lines; `answer` concatenates all non-thought text parts.
        Either string is empty when the response contains no part of that kind.

    Raises:
        ValueError: If `model` is not a Gemini model.
    """
    # Fail loudly instead of hitting an UnboundLocalError at the return statement
    if not model.startswith("gemini"):
        raise ValueError(f"Unsupported model for the Gemini API: {model!r}")

    # Google Gemini: Make API call and create response object
    response = client.models.generate_content(
        model=model,
        contents=user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>",  # User prompt with inserted vignette
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                thinking_level="high",  # Use thinking_level for Gemini 3, not thinking_budget since it may result in subpar performance
                include_thoughts=True  # Include thought summaries in parts/thought within `response` parameters
                ),
            system_instruction=system_prompt,  # System prompt
            temperature=temperature  # Model temperature
        ),
    )

    # Collect every part rather than keeping only the last one of each kind
    thought_chunks = []
    answer_chunks = []
    for part in response.parts or []:  # Tolerate a response without any parts
        if not part.text:
            continue
        if part.thought:
            thought_chunks.append(part.text)  # Thought summary
        else:
            answer_chunks.append(part.text)  # Differential diagnosis list

    return "\n\n".join(thought_chunks), "".join(answer_chunks)

In [None]:
# Run the Gemini model over every sampled case and store its thought summary and diagnosis
from tqdm import tqdm

model = "gemini-3-pro-preview"
n_cases = reasoning_subset.shape[0]
pbar = tqdm(reasoning_subset.iterrows(), total=n_cases)  # Progress bar for tracking

for index, row in pbar:
    case_id = row['case_id']

    # Show which case is currently being processed in the progress bar
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {n_cases} (case {case_id})")

    # One API call per case
    reasoning, answer = generate_diagnostic_reasoning(
        google_client,
        model,
        system_prompt,
        user_prompt,
        row["vignette"],
        1,  # Google advises keeping temperature at 1 for Gemini 3 to avoid messing with reasoning behavior
    )

    # Write the outputs back onto the row of the case they belong to
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {n_cases} (case {case_id}).")

Generating diagnostic reasoning trace 2 out of 30 (case 62):   3%|▎         | 1/30 [00:47<22:58, 47.54s/it] 

Completed case 0 out of 30 (case 181).


Generating diagnostic reasoning trace 3 out of 30 (case 169):   7%|▋         | 2/30 [01:39<23:19, 49.98s/it]

Completed case 1 out of 30 (case 62).


Generating diagnostic reasoning trace 4 out of 30 (case 155):  10%|█         | 3/30 [02:43<25:24, 56.48s/it]

Completed case 2 out of 30 (case 169).


Generating diagnostic reasoning trace 5 out of 30 (case 32):  13%|█▎        | 4/30 [04:05<28:49, 66.52s/it] 

Completed case 3 out of 30 (case 155).


Generating diagnostic reasoning trace 6 out of 30 (case 1013):  17%|█▋        | 5/30 [04:50<24:29, 58.78s/it]

Completed case 4 out of 30 (case 32).


Generating diagnostic reasoning trace 7 out of 30 (case 94):  20%|██        | 6/30 [06:01<25:11, 62.99s/it]  

Completed case 5 out of 30 (case 1013).


Generating diagnostic reasoning trace 8 out of 30 (case 1032):  23%|██▎       | 7/30 [07:58<30:52, 80.53s/it]

Completed case 6 out of 30 (case 94).


Generating diagnostic reasoning trace 9 out of 30 (case 1003):  27%|██▋       | 8/30 [09:31<30:58, 84.49s/it]

Completed case 7 out of 30 (case 1032).


Generating diagnostic reasoning trace 10 out of 30 (case 80):  30%|███       | 9/30 [11:01<30:12, 86.31s/it] 

Completed case 8 out of 30 (case 1003).


Generating diagnostic reasoning trace 11 out of 30 (case 109):  33%|███▎      | 10/30 [12:14<27:25, 82.25s/it]

Completed case 9 out of 30 (case 80).


Generating diagnostic reasoning trace 12 out of 30 (case 118):  37%|███▋      | 11/30 [13:23<24:46, 78.23s/it]

Completed case 10 out of 30 (case 109).


Generating diagnostic reasoning trace 13 out of 30 (case 159):  40%|████      | 12/30 [14:27<22:11, 73.96s/it]

Completed case 11 out of 30 (case 118).


Generating diagnostic reasoning trace 14 out of 30 (case 1021):  43%|████▎     | 13/30 [15:52<21:51, 77.14s/it]

Completed case 12 out of 30 (case 159).


Generating diagnostic reasoning trace 15 out of 30 (case 104):  47%|████▋     | 14/30 [17:01<19:56, 74.78s/it] 

Completed case 13 out of 30 (case 1021).


Generating diagnostic reasoning trace 16 out of 30 (case 1049):  50%|█████     | 15/30 [18:02<17:37, 70.49s/it]

Completed case 14 out of 30 (case 104).


Generating diagnostic reasoning trace 17 out of 30 (case 115):  53%|█████▎    | 16/30 [19:09<16:11, 69.37s/it] 

Completed case 15 out of 30 (case 1049).


Generating diagnostic reasoning trace 18 out of 30 (case 1009):  57%|█████▋    | 17/30 [19:53<13:23, 61.83s/it]

Completed case 16 out of 30 (case 115).


Generating diagnostic reasoning trace 19 out of 30 (case 99):  60%|██████    | 18/30 [20:55<12:22, 61.90s/it]  

Completed case 17 out of 30 (case 1009).


Generating diagnostic reasoning trace 20 out of 30 (case 23):  63%|██████▎   | 19/30 [21:38<10:18, 56.27s/it]

Completed case 18 out of 30 (case 99).


Generating diagnostic reasoning trace 21 out of 30 (case 1020):  67%|██████▋   | 20/30 [22:50<10:09, 60.95s/it]

Completed case 19 out of 30 (case 23).


Generating diagnostic reasoning trace 22 out of 30 (case 85):  70%|███████   | 21/30 [24:46<11:37, 77.51s/it]  

Completed case 20 out of 30 (case 1020).


Generating diagnostic reasoning trace 23 out of 30 (case 1012):  73%|███████▎  | 22/30 [26:58<12:31, 93.93s/it]

Completed case 21 out of 30 (case 85).


Generating diagnostic reasoning trace 24 out of 30 (case 178):  77%|███████▋  | 23/30 [27:46<09:21, 80.18s/it] 

Completed case 22 out of 30 (case 1012).


Generating diagnostic reasoning trace 25 out of 30 (case 1047):  80%|████████  | 24/30 [28:30<06:55, 69.32s/it]

Completed case 23 out of 30 (case 178).


Generating diagnostic reasoning trace 26 out of 30 (case 17):  83%|████████▎ | 25/30 [29:30<05:31, 66.32s/it]  

Completed case 24 out of 30 (case 1047).


Generating diagnostic reasoning trace 27 out of 30 (case 1033):  87%|████████▋ | 26/30 [30:30<04:17, 64.46s/it]

Completed case 25 out of 30 (case 17).


Generating diagnostic reasoning trace 28 out of 30 (case 1056):  90%|█████████ | 27/30 [30:58<02:41, 53.72s/it]

Completed case 26 out of 30 (case 1033).


Generating diagnostic reasoning trace 29 out of 30 (case 108):  93%|█████████▎| 28/30 [31:59<01:51, 55.70s/it] 

Completed case 27 out of 30 (case 1056).


Generating diagnostic reasoning trace 30 out of 30 (case 97):  97%|█████████▋| 29/30 [33:03<00:58, 58.39s/it] 

Completed case 28 out of 30 (case 108).


Generating diagnostic reasoning trace 30 out of 30 (case 97): 100%|██████████| 30/30 [34:18<00:00, 68.63s/it]

Completed case 29 out of 30 (case 97).





In [8]:
# Stamp the output file with the current time so results of separate runs are kept apart
import datetime

timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{timestamp}.json"

# Save to a JSON file: a list with one object per case
reasoning_subset.to_json(results_path, orient="records", indent=2)
print(f"Reasoning samples for {model} saved to {results_path}.")

Reasoning samples for gemini-3-pro-preview saved to ../../results/evaluate_diagnostic_reasoning/reasoning_samples_gemini-3-pro-preview_20251211_001432.json.


### 2. OpenAI GPT 5.2

In [None]:
# Refresh the dataset to avoid overwriting previous results
# Re-reading the exported 30-case spreadsheet replaces the in-memory DataFrame, so the
# model_thoughts / model_diagnosis columns written for the previous model do not carry over
reasoning_subset = pd.read_excel(reasoning_output_path)

In [15]:
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables (including the API key) from a local .env file, if one is present
load_dotenv()

# Initialize the OpenAI client
# No key is passed explicitly here, so the client is expected to pick it up from the environment
openai_client = OpenAI()

In [None]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str) -> tuple:
    """Query an OpenAI reasoning model for one vignette and return its reasoning summary and answer.

    Args:
        client: OpenAI client exposing `responses.create`.
        model: Model identifier; must start with "gpt".
        system_prompt: System instructions, sent through the `instructions` field.
        user_prompt: Task prompt; the vignette is appended inside <vignette> tags.
        vignette: Case description to diagnose.

    Returns:
        A `(reasoning, answer)` tuple of strings. `reasoning` joins all reasoning-summary
        blocks with blank lines; `answer` concatenates the text of all message content parts.
        Either string is empty when the response contains no item of that kind.

    Raises:
        ValueError: If `model` is not a "gpt" model.
    """
    # Fail loudly instead of hitting an UnboundLocalError further down
    if not model.startswith("gpt"):
        raise ValueError(f"Unsupported model for the OpenAI API: {model!r}")

    # OpenAI: Make API call and create response object
    response = client.responses.create(
        model=model,  # gpt-5.2-pro is way too expensive; use gpt-5.2
        # The system prompt was previously accepted but never sent; pass it along so this
        # model receives the same instructions as the other providers
        instructions=system_prompt,
        reasoning={
            "effort": "xhigh",  # Favors even more complete reasoning
            "summary": "detailed"  # Give as much detail as possible in thinking block
        },
        text={
            "verbosity": "low"  # To keep the model on task for diagnosis
        },
        input=[
            {
                "role": "user",
                "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"
            }
        ]
    )

    # Select output items by their type instead of assuming fixed positions in `response.output`
    summary_chunks = []
    answer_chunks = []
    for item in response.output:
        item_type = getattr(item, "type", None)
        if item_type == "reasoning":
            # Thought summary blocks
            summary_chunks.extend(block.text for block in (item.summary or []))
        elif item_type == "message":
            # Differential diagnosis list; skip content parts that carry no text
            for part in item.content or []:
                text = getattr(part, "text", None)
                if text:
                    answer_chunks.append(text)

    return "\n\n".join(summary_chunks), "".join(answer_chunks)

In [21]:
# Run the OpenAI model over every sampled case and store its reasoning summary and diagnosis
from tqdm import tqdm

model = "gpt-5.2"
n_cases = reasoning_subset.shape[0]
pbar = tqdm(reasoning_subset.iterrows(), total=n_cases)  # Progress bar for tracking

for index, row in pbar:
    case_id = row['case_id']

    # Show which case is currently being processed in the progress bar
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {n_cases} (case {case_id})")

    # One API call per case
    # Temperature not supported with reasoning effort set to high, so none is passed
    reasoning, answer = generate_diagnostic_reasoning(
        openai_client,
        model,
        system_prompt,
        user_prompt,
        row["vignette"],
    )

    # Write the outputs back onto the row of the case they belong to
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {n_cases} (case {case_id}).")

Generating diagnostic reasoning trace 2 out of 30 (case 62):   3%|▎         | 1/30 [00:02<01:20,  2.79s/it] 

Completed case 0 out of 30 (case 181).


Generating diagnostic reasoning trace 3 out of 30 (case 169):   7%|▋         | 2/30 [00:19<05:07, 10.98s/it]

Completed case 1 out of 30 (case 62).


Generating diagnostic reasoning trace 4 out of 30 (case 155):  10%|█         | 3/30 [00:41<07:10, 15.95s/it]

Completed case 2 out of 30 (case 169).


Generating diagnostic reasoning trace 4 out of 30 (case 155):  10%|█         | 3/30 [00:48<07:17, 16.22s/it]


KeyboardInterrupt: 

### 3. Anthropic Claude Opus 4.5

In [None]:
# Refresh the dataset to avoid overwriting previous results
# Re-reading the exported 30-case spreadsheet replaces the in-memory DataFrame, so the
# model_thoughts / model_diagnosis columns written for the previous model do not carry over
reasoning_subset = pd.read_excel(reasoning_output_path)

In [None]:
import os  # Needed for os.getenv below; without it this cell raises NameError on a fresh kernel

import anthropic
from dotenv import load_dotenv

# Load API key from environment variable
load_dotenv()

# Initialize the Anthropic client
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

In [None]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str, temperature: float) -> tuple:
    """Query a Claude model with extended thinking and return its thinking text and answer.

    Args:
        client: Anthropic client exposing `messages.create`.
        model: Model identifier; must start with "claude".
        system_prompt: Text passed as the system prompt.
        user_prompt: Task prompt; the vignette is appended inside <vignette> tags.
        vignette: Case description to diagnose.
        temperature: Accepted for signature parity with the other providers but not
            forwarded, because extended thinking is not compatible with temperature.

    Returns:
        A `(reasoning, answer)` tuple of strings. `reasoning` joins all thinking blocks with
        blank lines, with the marker "[REDACTED THINKING]" standing in for each redacted block;
        `answer` concatenates all text blocks. On a refusal the tuple is
        `("N/A", "Model refused to answer the prompt.")`.

    Raises:
        ValueError: If `model` is not a Claude model.
    """
    # Fail loudly instead of hitting an UnboundLocalError at the return statement
    if not model.startswith("claude"):
        raise ValueError(f"Unsupported model for the Anthropic API: {model!r}")

    # Anthropic Claude: Make API call and create response object
    response = client.messages.create(
        model=model,
        max_tokens=20000,  # Max output for Claude Opus 4.5 is 64k but >20k requires streaming
        system=system_prompt,
        # Extended thinking mode is not compatible with temperature, top_p, or top_k sampling
        thinking={
            "type": "enabled",
            # Allocate tokens for thinking - model may not use entire budget
            # NOTE(review): max_tokens - budget_tokens leaves 1,000 tokens for the visible answer
            # if the whole budget is consumed - confirm that is enough for the differential
            "budget_tokens": 19000
        },
        messages=[
            {
                "role": "user",
                "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"
            }
        ]
    )

    # Handle model refusal to answer
    if response.stop_reason == "refusal":
        return "N/A", "Model refused to answer the prompt."

    # Collect every block rather than keeping only the last one of each kind
    thinking_chunks = []
    answer_chunks = []
    for block in response.content:
        if block.type == "text":  # Differential diagnosis block
            answer_chunks.append(block.text)
        elif block.type == "thinking":  # Summarized thinking block
            thinking_chunks.append(block.thinking)
        elif block.type == "redacted_thinking":
            # Redacted blocks carry only encrypted `data` and no readable `thinking` text,
            # so record a marker instead of reading a missing attribute
            print(f"Redacted thinking detected for \"{vignette[:30]}...\"")
            thinking_chunks.append("[REDACTED THINKING]")

    return "\n\n".join(thinking_chunks), "".join(answer_chunks)

In [None]:
# Model identifier for the Anthropic section; reassigns the notebook-level `model` variable
model = "claude-opus-4-5-20251101"

### 4. DeepSeek-V3.2

In [None]:
# Refresh the dataset to avoid overwriting previous results
# Re-reading the exported 30-case spreadsheet replaces the in-memory DataFrame, so the
# model_thoughts / model_diagnosis columns written for the previous model do not carry over
reasoning_subset = pd.read_excel(reasoning_output_path)

In [None]:
# Model identifier for the DeepSeek section; reassigns the notebook-level `model` variable
model = "deepseek-reasoner"  # Select latest reasoning model

In [None]:
import os

from openai import OpenAI
from dotenv import load_dotenv

# Load API keys from a local .env file, if one is present
load_dotenv()

# Pass the DeepSeek key explicitly. Without `api_key`, the OpenAI SDK reads OPENAI_API_KEY,
# which would send the OpenAI credential to DeepSeek's endpoint.
# If DEEPSEEK_API_KEY is not set, `api_key` is None and the SDK falls back to its previous
# behavior (OPENAI_API_KEY), so existing setups keep working.
client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")

In [None]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str, temperature: float) -> tuple:
    """Query a DeepSeek model for one vignette and return its reasoning trace and answer.

    Args:
        client: OpenAI-compatible client exposing `chat.completions.create`.
        model: Model identifier; must start with "deepseek".
        system_prompt: Text sent as the system message.
        user_prompt: Task prompt; the vignette is appended inside <vignette> tags.
        vignette: Case description to diagnose.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        A `(reasoning, answer)` tuple. `answer` is the message content of the first choice;
        `reasoning` is that message's `reasoning_content`, or None when the message
        carries no such attribute.

    Raises:
        ValueError: If `model` is not a DeepSeek model.
    """
    # Fail loudly instead of hitting an UnboundLocalError at the return statement
    if not model.startswith("deepseek"):
        raise ValueError(f"Unsupported model for the DeepSeek API: {model!r}")

    # DeepSeek: Make API call and create response object
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"},
        ],
        temperature=temperature,
        stream=False
    )
    message = response.choices[0].message

    # Extract the response content
    answer = message.content

    # Extract the thinking block; tolerate messages that do not expose one
    reasoning = getattr(message, "reasoning_content", None)

    return reasoning, answer