# 1. Generate top-5 differential diagnoses for complete dataset

## A. Load in the combined dataset

In [4]:
# Load the combined (literature + synthetic) dataset
import json
import pandas as pd

# Location of the combined dataset, relative to this notebook
dataset_path = "../../../datasets/combined/combined_jama.json"

# Decode explicitly as UTF-8 so the vignette text is read the same way on every
# platform; without `encoding`, open() falls back to the locale's default codec.
with open(dataset_path, "r", encoding="utf-8") as f:
    combined = json.load(f)

# One row per case; preview the first rows as the cell output
dataset = pd.DataFrame(combined)
dataset.head()

Unnamed: 0,case_id,vignette,diagnosis,source,difficulty,diagnosis_dsm,diagnosis_icd,reasoning
0,2,A black male in his early 20s with a remote hi...,Catatonia,case_reports_in_psychiatry,,,,
1,4,A 12-year-old male with no specific birth or m...,Anorexia nervosa with focal cortical dysplasia,case_reports_in_psychiatry,,,,
2,13,We report on a 35-year-old female with a histo...,Pseudocyesis,case_reports_in_psychiatry,,,,
3,14,The patient is a 58-year-old woman with a psyc...,Illness anxiety disorder (IAD),case_reports_in_psychiatry,,,,
4,15,The patient is a 20-year-old single man. He is...,Social anxiety disorder (SAD) and major depres...,case_reports_in_psychiatry,,,,


## B. Set up prompts

In [6]:
# Define system instructions and user prompt.
# Both files are decoded explicitly as UTF-8 so the prompt text sent to the model
# does not depend on the locale's default codec.
with open("../../prompts/top_5_accuracy/system_prompt.txt", encoding="utf-8") as f:
    system_prompt = f.read()

with open("../../prompts/top_5_accuracy/user_prompt.txt", encoding="utf-8") as f:
    user_prompt = f.read()

In [7]:
from google import genai
from google.genai import types

In [37]:
# Gemini client created with default arguments
google_client = genai.Client()
# Model identifier passed to the Gemini requests below.
# NOTE(review): a later cell rebinds `model` to "gpt-5.2", so re-run this cell before
# re-running any Gemini cell after the OpenAI section has been executed.
model = "gemini-3-pro-preview"

In [None]:
def generate_top5_diagnoses(client, model, system_prompt, user_prompt, vignette, temperature):
    """Ask a Gemini model for a differential diagnosis list for one vignette.

    Args:
        client: Object exposing `models.generate_content(model=..., contents=..., config=...)`.
        model: Model identifier forwarded to the API call.
        system_prompt: Text sent as the system instruction.
        user_prompt: Text placed before the vignette in the user message.
        vignette: Case description; wrapped in <vignette> tags and appended to `user_prompt`.
        temperature: Sampling temperature forwarded to the API call.

    Returns:
        A `(reasoning, answer)` tuple.
        * Blocked prompt: both elements are the string "Content filter triggered.".
        * Otherwise `reasoning` is the thought-summary text and `answer` is the
          non-thought text. Either element is None when the response contains no
          part of that kind.
    """
    response = client.models.generate_content(
        model=model,
        contents=user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>",  # User prompt with inserted vignette
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                thinking_level="high",  # Use thinking_level for Gemini 3, not thinking_budget since it may result in subpar performance
                include_thoughts=True,  # Include thought summaries in parts/thought within `response` parameters
            ),
            system_instruction=system_prompt,  # System prompt
            temperature=temperature,  # Model temperature
        ),
    )

    # Handle content filter triggering. `prompt_feedback` can be absent (None) on a
    # response that was not blocked, so it must not be dereferenced unconditionally.
    prompt_feedback = getattr(response, "prompt_feedback", None)
    block_reason = getattr(prompt_feedback, "block_reason", None)
    if block_reason:
        print(f"Content filter triggered: {getattr(block_reason, 'name', block_reason)}")
        print("Skipping case...")
        return "Content filter triggered.", "Content filter triggered."

    # Collect every text part instead of keeping only the last one seen, and cope
    # with a response that carries no parts at all.
    thought_chunks = []
    answer_chunks = []
    for part in response.parts or []:
        if not part.text:
            continue
        if part.thought:
            thought_chunks.append(part.text)  # Thought summary
        else:
            answer_chunks.append(part.text)  # Differential diagnosis list

    # None (rather than an unbound local) signals that a kind of part was missing.
    reasoning = "\n\n".join(thought_chunks) if thought_chunks else None
    answer = "".join(answer_chunks) if answer_chunks else None

    return reasoning, answer

## Troubleshoot specific cases returning unexpected responses

In [None]:
# Pull the vignette text for the case under investigation (case_id 182)
vignette = dataset.loc[dataset["case_id"].eq(182), "vignette"].iloc[0]

# Send the request directly so the raw response object stays available for inspection
response = google_client.models.generate_content(
    model=model,
    # User prompt followed by the vignette wrapped in <vignette> tags
    contents=user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>",
    config=types.GenerateContentConfig(
        # System prompt
        system_instruction=system_prompt,
        # Model temperature
        temperature=1,
        thinking_config=types.ThinkingConfig(
            # thinking_level is used for Gemini 3; thinking_budget may result in subpar performance
            thinking_level="high",
            # Ask for thought summaries to be included among the response parts
            include_thoughts=True,
        ),
    ),
)

In [43]:
response # Case 182: Gemini blocks the prompt itself (prompt_feedback.block_reason is PROHIBITED_CONTENT), so no answer is generated; skip this case

GenerateContentResponse(
  automatic_function_calling_history=[],
  model_version='gemini-3-pro-preview',
  prompt_feedback=GenerateContentResponsePromptFeedback(
    block_reason=<BlockedReason.PROHIBITED_CONTENT: 'PROHIBITED_CONTENT'>
  ),
  response_id='4QtDadmyNMjRjMcP5dHq8QY',
  sdk_http_response=HttpResponse(
    headers=<dict len=11>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    prompt_token_count=1689,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=1689
      ),
    ],
    total_token_count=1689
  )
)

In [25]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Read variables from a .env file into the process environment
load_dotenv()

# Build the OpenAI client from the OPENAI_API_KEY environment variable
openai_client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [28]:
# Generate differential diagnosis for one case using GPT-5.2
def generate_top5_diagnoses(client, model: str, system_prompt: str, user_prompt: str, vignette: str) -> tuple:
    """Ask an OpenAI model for a differential diagnosis list for one vignette.

    NOTE: this definition has the same name as the Gemini helper defined in an
    earlier cell; running this cell replaces that function in the kernel.

    Args:
        client: Object exposing `responses.create(...)`.
        model: Model identifier; must start with "gpt".
        system_prompt: Text sent with the "developer" role.
        user_prompt: Text placed before the vignette in the "user" message.
        vignette: Case description; wrapped in <vignette> tags and appended to `user_prompt`.

    Returns:
        A `(reasoning, answer)` tuple.
        * Response stopped by the content filter: both elements are the string
          "Content filter triggered.".
        * Otherwise `reasoning` is the reasoning summary blocks joined by blank
          lines (None when the output has no reasoning item) and `answer` is the
          text of the message output (None when there is no message text).

    Raises:
        ValueError: if `model` does not start with "gpt".
    """
    # Fail with a clear message instead of hitting an unbound `response` further down.
    if not model.startswith("gpt"):
        raise ValueError(f"Unsupported model {model!r}: only models starting with 'gpt' are handled.")

    # OpenAI: Make API call and create response object
    response = client.responses.create(
        model=model,  # gpt-5.2-pro is way too expensive; use gpt-5.2
        reasoning={
            "effort": "xhigh",  # Favors even more complete reasoning
            "summary": "detailed"  # Give as much detail as possible in thinking block
        },
        text={
            "verbosity": "low"  # To keep the model on task for diagnosis
        },
        input=[
            {
                "role": "developer",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"
            }
        ]
    )

    # A content-filtered response can consist of a lone reasoning item with an empty
    # summary and no message, so report it explicitly instead of indexing into it.
    incomplete_details = getattr(response, "incomplete_details", None)
    incomplete_reason = getattr(incomplete_details, "reason", None)
    if incomplete_reason == "content_filter":
        print(f"Content filter triggered: {incomplete_reason}")
        print("Skipping case...")
        return "Content filter triggered.", "Content filter triggered."

    # Classify output items by their declared type rather than by position.
    summary_texts = []
    answer_texts = []
    saw_reasoning_item = False
    for item in response.output or []:
        item_type = getattr(item, "type", None)
        if item_type == "reasoning":
            saw_reasoning_item = True
            summary_texts.extend(block.text for block in (getattr(item, "summary", None) or []))
        elif item_type == "message":
            for content_part in getattr(item, "content", None) or []:
                # Parts without a `text` attribute (e.g. a refusal) carry no answer text.
                part_text = getattr(content_part, "text", None)
                if part_text:
                    answer_texts.append(part_text)

    # A reasoning item with an empty summary yields "" (as before); no reasoning item yields None.
    reasoning = "\n\n".join(summary_texts) if saw_reasoning_item else None
    answer = "".join(answer_texts) if answer_texts else None

    return reasoning, answer

In [34]:
# Pull the vignette text for the case under investigation (case_id 175)
vignette = dataset.loc[dataset["case_id"].eq(175), "vignette"].iloc[0]
# gpt-5.2-pro is way too expensive; use gpt-5.2
model = "gpt-5.2"
# Send the request directly so the raw response object stays available for inspection
response = openai_client.responses.create(
    model=model,
    # "xhigh" effort favors even more complete reasoning;
    # a "detailed" summary gives as much detail as possible in the thinking block
    reasoning={"effort": "xhigh", "summary": "detailed"},
    # Low verbosity keeps the model on task for diagnosis
    text={"verbosity": "low"},
    input=[
        {"role": "developer", "content": system_prompt},
        {"role": "user", "content": user_prompt + "\n<vignette>\n" + vignette + "\n</vignette>"},
    ],
)

In [None]:
response # Case 175 causes GPT-5.2 to return a response cut short by the content filter (status 'incomplete', incomplete_details.reason 'content_filter')

Response(id='resp_009c681a5a3816ca006941ca0a63908193b2d21d8d854a89a5', created_at=1765919242.0, error=None, incomplete_details=IncompleteDetails(reason='content_filter'), instructions=None, metadata={}, model='gpt-5.2-2025-12-11', object='response', output=[ResponseReasoningItem(id='rs_009c681a5a3816ca006941ca0ab0088193ad69231991067551', summary=[], type='reasoning', encrypted_content=None, status=None)], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=0.98, max_output_tokens=None, previous_response_id=None, reasoning=Reasoning(effort='xhigh', generate_summary=None, summary='detailed'), service_tier='default', status='incomplete', text=ResponseTextConfig(format=ResponseFormatText(type='text'), verbosity='low'), truncation='disabled', usage=ResponseUsage(input_tokens=0, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=0, output_tokens_details=OutputTokensDetails(reasoning_tokens=748), total_tokens=0), user=None, background=False, max

In [36]:
# For the content-filtered case only a reasoning item with an empty summary comes back;
# there is no message item from which an answer could be read.
response.output

[ResponseReasoningItem(id='rs_009c681a5a3816ca006941ca0ab0088193ad69231991067551', summary=[], type='reasoning', encrypted_content=None, status=None)]