# 1. Generate model thought traces for evaluation of diagnostic reasoning of 4 frontier LLMs (n = 30)

## Part A. Load in combined dataset

In [18]:
# Load the combined (literature + synthetic) dataset
import json
import pandas as pd

dataset_path = "../../datasets/combined/combined_jama.json"
with open(dataset_path, "r") as f:
    combined = json.load(f)

dataset = pd.DataFrame(combined)
dataset.head()

Unnamed: 0,case_id,vignette,diagnosis,source,difficulty,diagnosis_dsm,diagnosis_icd,reasoning
0,2,A black male in his early 20s with a remote hi...,Catatonia,case_reports_in_psychiatry,,,,
1,4,A 12-year-old male with no specific birth or m...,Anorexia nervosa with focal cortical dysplasia,case_reports_in_psychiatry,,,,
2,13,We report on a 35-year-old female with a histo...,Pseudocyesis,case_reports_in_psychiatry,,,,
3,14,The patient is a 58-year-old woman with a psyc...,Illness anxiety disorder (IAD),case_reports_in_psychiatry,,,,
4,15,The patient is a 20-year-old single man. He is...,Social anxiety disorder (SAD) and major depres...,case_reports_in_psychiatry,,,,


## Part B. Create dataset subset for diagnostic reasoning analysis

In [19]:
# Draw 30 random cases from the combined dataset for diagnostic reasoning analysis and human comparison
import random
random.seed(1000)  # For reproducibility

# Exclude a set of case IDs if needed (seen by clinicians in previous round of evaluation)
excluded_case_ids = [48, 81, 90, 142, 166, 168, 176, 177, 44, 43,
            113, 114, 131, 132, 152, 156, 21, 27, 50, 89,
            110, 123, 153, 175, 1001, 1002, 1010, 1011, 1014, 1019]

filtered_combined = [case for case in combined if case['case_id'] not in excluded_case_ids]

reasoning_subset = random.sample(filtered_combined, 30)

In [20]:
# Export reasoning subset as Excel spreadsheet to preserve Unicode characters like quotation marks
reasoning_subset = pd.DataFrame(reasoning_subset)
reasoning_output_path = "../../datasets/reasoning/jama_reasoning_subset.xlsx"

reasoning_subset.to_excel(reasoning_output_path, index=False)
print(f"Exported reasoning subset to {reasoning_output_path}.")

Exported reasoning subset to ../../datasets/reasoning/jama_reasoning_subset.xlsx.


## Part C. Set up LLM APIs

### 0. Set up prompts

In [21]:
# Define system instructions and user prompt
with open("../prompts/evaluate_diagnostic_reasoning/system_prompt.txt") as f:
    system_prompt = f.read()

with open("../prompts/evaluate_diagnostic_reasoning/user_prompt.txt") as f:
    user_prompt = f.read()

### 1. Google Gemini 3 Pro

In [22]:
from google import genai
from google.genai import types
from dotenv import load_dotenv

# Load API key from environment variable
load_dotenv()

# Initialize the GenAI client
google_client = genai.Client()

In [23]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str, temperature: float) -> tuple:
    # Prepare API call parameters
    # Google Gemini: Make API call and create response object
    if model.startswith("gemini"):
        response = client.models.generate_content(
            model=model,
            contents=user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>",  # User prompt with inserted vignette
            config=types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(
                    thinking_level="high",  # Use thinking_level for Gemini 3, not thinking_budget since it may result in subpar performance
                    include_thoughts=True  # Include thought summaries in parts/thought within `response` parameters
                    ),
                system_instruction=system_prompt,  # System prompt
                temperature=temperature  # Model temperature
            ),
        )

        # Iterate through response object
        for part in response.parts:
            if not part.text:
                continue
            if part.thought:
                reasoning = part.text  # Extract thought summary
            else:
                answer = part.text  # Extract differential diagnosis list

    return reasoning, answer

In [24]:
# Iterate through the reasoning samples and generate diagnostic reasoning, saving the results
from tqdm import tqdm

model = "gemini-3-pro-preview"
#pbar = tqdm(reasoning_subset.iterrows(), total=reasoning_subset.shape[0])  # Progress bar for tracking
pbar = tqdm(dataset[dataset['case_id'] == 181].iterrows(), total=1)  # Select specific case for troubleshooting

for index, row in pbar:
    # Update progress bar description with current case ID
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']})")

    # Generate diagnostic reasoning
    reasoning, answer = generate_diagnostic_reasoning(google_client,
                                                      model,
                                                      system_prompt,
                                                      user_prompt,
                                                      row["vignette"],
                                                      1,  # Google advises keeping temperature at 1 for Gemini 3 to avoid messing with reasoning behavior
                                                    )
    
    # Save the results to the DataFrame
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']}).")

Generating diagnostic reasoning trace 134 out of 30 (case 181):   0%|          | 0/1 [00:17<?, ?it/s]


KeyboardInterrupt: 

In [None]:
reasoning, answer = generate_diagnostic_reasoning(google_client,
                                                      model,
                                                      system_prompt,
                                                      user_prompt,
                                                      row["vignette"],
                                                      1,  # Google advises keeping temperature at 1 for Gemini 3 to avoid messing with reasoning behavior
                                                    )

In [None]:
# Check for missing or empty values in the results because GPT sometimes returns incomplete outputs
missing_values = reasoning_subset[
    (reasoning_subset["model_thoughts"].isnull()) | 
    (reasoning_subset["model_thoughts"] == "") |
    (reasoning_subset["model_diagnosis"].isnull()) | 
    (reasoning_subset["model_diagnosis"] == "")
]

if not missing_values.empty:
    print("Missing or empty values found in the following cases:")
    print(missing_values[["case_id", "model_thoughts", "model_diagnosis"]])
else:
    print("No missing or empty values found in the results.")


In [None]:
# Get timestamp
import datetime

# Save to a JSON file
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

reasoning_subset.to_json(results_path, orient="records", indent=2)
print(f"Reasoning samples for {model} saved to {results_path}.")

### 2. OpenAI GPT-5.2

In [7]:
# Refresh the dataset to avoid overwriting previous results
reasoning_subset = pd.read_excel(reasoning_output_path)

In [9]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load API key from environment variable
load_dotenv()

# Initialize the OpenAI client
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [10]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str) -> tuple:
    # Prepare API call parameters
    # OpenAI: Make API call and create response object
    if model.startswith("gpt"):
        response = client.responses.create(
            model=model,  # gpt-5.2-pro is way too expensive; use gpt-5.2
            reasoning={
                "effort": "xhigh",  # Favors even more complete reasoning
                "summary": "detailed"  # Give as much detail as possible in thinking block
            },
            text={
                "verbosity": "low"  # To keep the model on task for diagnosis
            },
            input=[
                {
                    "role": "developer",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"
                }
            ]
        )
    
    # Handle different output array lengths dynamically
    if len(response.output) == 1:
        # Only differential diagnosis present
        reasoning = None
        answer = response.output[0].content[0].text
    elif len(response.output) >= 2:
        # Extract the thought summary by concatenating all thinking blocks using newlines and a list comprehension
        reasoning = "\n\n".join([block.text for block in response.output[0].summary])
        
        # Extract the differential diagnosis list
        answer = response.output[1].content[0].text
    else:
        # Handle unexpected cases
        reasoning = None
        answer = None

    return reasoning, answer

In [None]:
# Iterate through the reasoning samples and generate diagnostic reasoning, saving the results
from tqdm import tqdm

model = "gpt-5.2"

# Start from specific row if resuming
pbar = tqdm(reasoning_subset.iterrows(), total=reasoning_subset.shape[0])  # Progress bar for tracking

for index, row in pbar:
    # Update progress bar description with current case ID
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']})")

    # Generate diagnostic reasoning
    reasoning, answer = generate_diagnostic_reasoning(openai_client,
                                                      model,
                                                      system_prompt,
                                                      user_prompt,
                                                      row["vignette"],
                                                    # Temperature not supported with reasoning effort set to high 
                                                    )
    
    # Save the results to the DataFrame
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']}).")

Generating diagnostic reasoning trace 2 out of 30 (case 62):   3%|▎         | 1/30 [04:05<1:58:28, 245.11s/it] 

Completed case 1 out of 30 (case 181).


Generating diagnostic reasoning trace 3 out of 30 (case 169):   7%|▋         | 2/30 [05:09<1:04:48, 138.86s/it]

Completed case 2 out of 30 (case 62).


Generating diagnostic reasoning trace 4 out of 30 (case 155):  10%|█         | 3/30 [13:14<2:13:38, 296.98s/it]

Completed case 3 out of 30 (case 169).


Generating diagnostic reasoning trace 5 out of 30 (case 32):  13%|█▎        | 4/30 [23:04<2:58:51, 412.74s/it] 

Completed case 4 out of 30 (case 155).


Generating diagnostic reasoning trace 6 out of 30 (case 1013):  17%|█▋        | 5/30 [25:59<2:16:09, 326.79s/it]

Completed case 5 out of 30 (case 32).


Generating diagnostic reasoning trace 7 out of 30 (case 94):  20%|██        | 6/30 [28:18<1:45:11, 262.97s/it]  

Completed case 6 out of 30 (case 1013).


Generating diagnostic reasoning trace 8 out of 30 (case 1032):  23%|██▎       | 7/30 [34:42<1:56:00, 302.64s/it]

Completed case 7 out of 30 (case 94).


Generating diagnostic reasoning trace 9 out of 30 (case 1003):  27%|██▋       | 8/30 [38:55<1:45:09, 286.82s/it]

Completed case 8 out of 30 (case 1032).


Generating diagnostic reasoning trace 10 out of 30 (case 80):  30%|███       | 9/30 [43:14<1:37:18, 278.00s/it] 

Completed case 9 out of 30 (case 1003).


Generating diagnostic reasoning trace 11 out of 30 (case 109):  33%|███▎      | 10/30 [46:39<1:25:08, 255.44s/it]

Completed case 10 out of 30 (case 80).


Generating diagnostic reasoning trace 12 out of 30 (case 118):  37%|███▋      | 11/30 [55:46<1:49:09, 344.72s/it]

Completed case 11 out of 30 (case 109).


Generating diagnostic reasoning trace 13 out of 30 (case 159):  40%|████      | 12/30 [1:00:37<1:38:32, 328.47s/it]

Completed case 12 out of 30 (case 118).


Generating diagnostic reasoning trace 14 out of 30 (case 1021):  43%|████▎     | 13/30 [1:09:27<1:50:20, 389.43s/it]

Completed case 13 out of 30 (case 159).


Generating diagnostic reasoning trace 15 out of 30 (case 104):  47%|████▋     | 14/30 [1:16:21<1:45:48, 396.78s/it] 

Completed case 14 out of 30 (case 1021).


Generating diagnostic reasoning trace 16 out of 30 (case 1049):  50%|█████     | 15/30 [1:19:02<1:21:28, 325.88s/it]

Completed case 15 out of 30 (case 104).


Generating diagnostic reasoning trace 17 out of 30 (case 115):  53%|█████▎    | 16/30 [1:22:15<1:06:40, 285.78s/it] 

Completed case 16 out of 30 (case 1049).


Generating diagnostic reasoning trace 18 out of 30 (case 1009):  57%|█████▋    | 17/30 [1:23:50<49:30, 228.47s/it] 

Completed case 17 out of 30 (case 115).


Generating diagnostic reasoning trace 19 out of 30 (case 99):  60%|██████    | 18/30 [1:25:29<37:53, 189.45s/it]  

Completed case 18 out of 30 (case 1009).


Generating diagnostic reasoning trace 20 out of 30 (case 23):  63%|██████▎   | 19/30 [1:27:37<31:21, 171.03s/it]

Completed case 19 out of 30 (case 99).


Generating diagnostic reasoning trace 21 out of 30 (case 1020):  67%|██████▋   | 20/30 [1:32:04<33:20, 200.04s/it]

Completed case 20 out of 30 (case 23).


Generating diagnostic reasoning trace 22 out of 30 (case 85):  70%|███████   | 21/30 [1:34:01<26:15, 175.03s/it]  

Completed case 21 out of 30 (case 1020).


Generating diagnostic reasoning trace 23 out of 30 (case 1012):  73%|███████▎  | 22/30 [1:39:00<28:16, 212.12s/it]

Completed case 22 out of 30 (case 85).


Generating diagnostic reasoning trace 24 out of 30 (case 178):  77%|███████▋  | 23/30 [1:41:03<21:37, 185.33s/it] 

Completed case 23 out of 30 (case 1012).


Generating diagnostic reasoning trace 25 out of 30 (case 1047):  80%|████████  | 24/30 [1:42:59<16:28, 164.76s/it]

Completed case 24 out of 30 (case 178).


Generating diagnostic reasoning trace 26 out of 30 (case 17):  83%|████████▎ | 25/30 [1:48:16<17:31, 210.21s/it]  

Completed case 25 out of 30 (case 1047).


Generating diagnostic reasoning trace 27 out of 30 (case 1033):  87%|████████▋ | 26/30 [1:51:02<13:08, 197.16s/it]

Completed case 26 out of 30 (case 17).


Generating diagnostic reasoning trace 28 out of 30 (case 1056):  90%|█████████ | 27/30 [1:53:03<08:42, 174.17s/it]

Completed case 27 out of 30 (case 1033).


Generating diagnostic reasoning trace 29 out of 30 (case 108):  93%|█████████▎| 28/30 [1:56:19<06:01, 180.69s/it] 

Completed case 28 out of 30 (case 1056).


Generating diagnostic reasoning trace 30 out of 30 (case 97):  97%|█████████▋| 29/30 [2:06:14<05:05, 305.04s/it] 

Completed case 29 out of 30 (case 108).


Generating diagnostic reasoning trace 30 out of 30 (case 97): 100%|██████████| 30/30 [2:20:56<00:00, 281.89s/it]

Completed case 30 out of 30 (case 97).





In [14]:
# Check for missing or empty values and rerun until none remain
max_iterations = 10  # Prevent infinite loop
iteration = 0

while iteration < max_iterations:
    # Check for missing or empty values
    missing_values = reasoning_subset[
        (reasoning_subset["model_thoughts"].isnull()) | 
        (reasoning_subset["model_thoughts"] == "") |
        (reasoning_subset["model_diagnosis"].isnull()) | 
        (reasoning_subset["model_diagnosis"] == "")
    ]
    
    if missing_values.empty:
        print("No missing or empty values found in the results.")
        break
    
    iteration += 1
    print(f"\n=== Iteration {iteration} ===")
    print(f"Missing or empty values found in {len(missing_values)} cases:")
    print(missing_values[["case_id", "model_thoughts", "model_diagnosis"]])
    
    # Get all indices with missing values
    missing_indices = missing_values.index.tolist()
    print(f"Rerunning {len(missing_indices)} cases with missing data: {missing_indices}")
    
    for index_to_rerun in missing_indices:
        print(f"\nRerunning case {index_to_rerun} (case_id: {reasoning_subset.iloc[index_to_rerun]['case_id']})")
        
        try:
            reasoning, answer = generate_diagnostic_reasoning(openai_client,
                                                            model,
                                                            system_prompt,
                                                            user_prompt,
                                                            reasoning_subset.iloc[index_to_rerun]["vignette"],
                                                        )
            
            reasoning_subset.loc[index_to_rerun, "model_thoughts"] = reasoning
            reasoning_subset.loc[index_to_rerun, "model_diagnosis"] = answer
            print(f"Successfully completed case {index_to_rerun}")
            
        except Exception as e:
            print(f"Error processing case {index_to_rerun}: {str(e)}")
            continue

print(f"\nCompleted after {iteration} iteration(s).")



=== Iteration 1 ===
Missing or empty values found in 4 cases:
    case_id model_thoughts                                    model_diagnosis
4        32                 1. Posttraumatic stress disorder (PTSD) - F43....
17     1009                 1. Anorexia nervosa, restricting type - F50.01...
25       17                 1. Schizophrenia - ICD-10 F20.9  \n2. Cannabis...
29       97                 1. Bipolar I disorder, current episode manic, ...
Rerunning 4 cases with missing data: [4, 17, 25, 29]

Rerunning case 4 (case_id: 32)
Successfully completed case 4

Rerunning case 17 (case_id: 1009)
Successfully completed case 17

Rerunning case 25 (case_id: 17)
Successfully completed case 25

Rerunning case 29 (case_id: 97)
Successfully completed case 29
No missing or empty values found in the results.

Completed after 1 iteration(s).


In [15]:
# Get timestamp
import datetime

# Save to a JSON file
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

reasoning_subset.to_json(results_path, orient="records", indent=2)
print(f"Reasoning samples for {model} saved to {results_path}.")

Reasoning samples for gpt-5.2 saved to ../../results/evaluate_diagnostic_reasoning/reasoning_samples_gpt-5.2_20251215_213246.json.


### 3. Anthropic Claude Opus 4.5

In [None]:
# Refresh the dataset to avoid overwriting previous results
reasoning_subset = pd.read_excel(reasoning_output_path)

In [None]:
import anthropic
from dotenv import load_dotenv

# Load API key from environment variable
load_dotenv()

# Initialize the Anthropic client
anthropic_client = anthropic.Anthropic()

In [None]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str) -> tuple:
    # Prepare API call parameters
    # Anthropic Claude: Make API call and create response object
    if model.startswith("claude"):
        response = client.messages.create(
            model=model,
            max_tokens=20000,  # Max output for Claude Opus 4.5 is 64k but >20k requires streaming
            system=system_prompt,
            # Extended thinking mode is not compatible with temperature, top_p, or top_k sampling
            thinking={
                "type": "enabled",
                "budget_tokens": 19000  # Allocate tokens for thinking - model may not use entire budget
            },
            messages=[
                {
                    "role": "user", 
                    "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"
                }
            ]
        )

         # Handle model refusal to answer
        if response.stop_reason == "refusal":
            reasoning = "N/A"
            answer = "Model refused to answer the prompt."
            return reasoning, answer

        # Extract the response content
        for block in response.content:
            if block.type == "text":  # Extract differential diagnosis block
                answer = block.text
            elif block.type == "thinking":  # Extract summarized thinking block
                reasoning = block.thinking
            elif block.type == "redacted_thinking":  # Handle redacted thinking block
                print(f"Redacted thinking detected for \"{vignette[:30]}...\"")
                reasoning = block.thinking

    return reasoning, answer

In [None]:
# Iterate through the reasoning samples and generate diagnostic reasoning, saving the results
from tqdm import tqdm

model = "claude-opus-4-5-20251101"
pbar = tqdm(reasoning_subset.iterrows(), total=reasoning_subset.shape[0])  # Progress bar for tracking

for index, row in pbar:
    # Update progress bar description with current case ID
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']})")

    # Generate diagnostic reasoning
    reasoning, answer = generate_diagnostic_reasoning(anthropic_client,
                                                      model,
                                                      system_prompt,
                                                      user_prompt,
                                                      row["vignette"],
                                                    # Temperature not compatible with extended thinking mode
                                                    )
    
    # Save the results to the DataFrame
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']}).")

In [None]:
# Check for missing or empty values in the results because GPT sometimes returns incomplete outputs
missing_values = reasoning_subset[
    (reasoning_subset["model_thoughts"].isnull()) | 
    (reasoning_subset["model_thoughts"] == "") |
    (reasoning_subset["model_diagnosis"].isnull()) | 
    (reasoning_subset["model_diagnosis"] == "")
]

if not missing_values.empty:
    print("Missing or empty values found in the following cases:")
    print(missing_values[["case_id", "model_thoughts", "model_diagnosis"]])
else:
    print("No missing or empty values found in the results.")


In [None]:
# Get timestamp
import datetime

# Save to a JSON file
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

reasoning_subset.to_json(results_path, orient="records", indent=2)
print(f"Reasoning samples for {model} saved to {results_path}.")

### 4. DeepSeek-V3.2

In [None]:
# Refresh the dataset to avoid overwriting previous results
reasoning_subset = pd.read_excel(reasoning_output_path)

In [None]:
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()
deepseek_client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")

In [None]:
# Function to generate a diagnosis and extract diagnostic reasoning from the thinking blocks
def generate_diagnostic_reasoning(client, model: str, system_prompt: str, user_prompt: str, vignette: str, temperature: float) -> tuple:
    # Prepare API call parameters
    # DeepSeek: Make API call and create response object
    if model.startswith("deepseek"):
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"},
            ],
            temperature=temperature,
            stream=False
        )
        # Extract the response content
        answer = response.choices[0].message.content

        # Extract the thinking block
        reasoning = response.choices[0].message.reasoning_content

    return reasoning, answer

In [None]:
# Iterate through the reasoning samples and generate diagnostic reasoning, saving the results
from tqdm import tqdm

model = "deepseek-reasoner"  # Select latest reasoning model; in this case, DeepSeek-V3.2
pbar = tqdm(reasoning_subset.iterrows(), total=reasoning_subset.shape[0])  # Progress bar for tracking

for index, row in pbar:
    # Update progress bar description with current case ID
    pbar.set_description(f"Generating diagnostic reasoning trace {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']})")

    # Generate diagnostic reasoning
    reasoning, answer = generate_diagnostic_reasoning(deepseek_client,
                                                      model,
                                                      system_prompt,
                                                      user_prompt,
                                                      row["vignette"],
                                                      0  # DeepSeek recommends temperature 0 for coding/math tasks where there is a correct answer
                                                    )
    
    # Save the results to the DataFrame
    reasoning_subset.loc[index, "model_thoughts"] = reasoning
    reasoning_subset.loc[index, "model_diagnosis"] = answer
    print(f"Completed case {index + 1} out of {reasoning_subset.shape[0]} (case {row['case_id']}).")

In [None]:
# Check for missing or empty values in the results because GPT sometimes returns incomplete outputs
missing_values = reasoning_subset[
    (reasoning_subset["model_thoughts"].isnull()) | 
    (reasoning_subset["model_thoughts"] == "") |
    (reasoning_subset["model_diagnosis"].isnull()) | 
    (reasoning_subset["model_diagnosis"] == "")
]

if not missing_values.empty:
    print("Missing or empty values found in the following cases:")
    print(missing_values[["case_id", "model_thoughts", "model_diagnosis"]])
else:
    print("No missing or empty values found in the results.")


In [None]:
# Get timestamp
import datetime

# Save to a JSON file
results_path = f"../../results/evaluate_diagnostic_reasoning/reasoning_samples_{model}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

reasoning_subset.to_json(results_path, orient="records", indent=2)
print(f"Reasoning samples for {model} saved to {results_path}.")