# 🏥 NurseSim-Triage: LLM-as-Judge Evaluation

**Using GPT-5, Gemini 3, and Gemini 3 Pro**

---

In [None]:
!pip install -q openai google-generativeai pandas numpy scikit-learn matplotlib tqdm

In [None]:
import os, json, time, re
import pandas as pd
import numpy as np
from typing import Dict, List, Any
from dataclasses import dataclass
from enum import Enum
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.metrics import cohen_kappa_score
import warnings
warnings.filterwarnings('ignore')
import openai
import google.generativeai as genai
print("‚úÖ Imports done")

In [None]:
# Read both API keys from Colab's secret store (`userdata`) and register them
# with the respective SDKs.
from google.colab import userdata
# Sets the module-level OpenAI key.
# NOTE(review): GPT5Judge builds its own `openai.OpenAI()` client — confirm
# that client actually picks this key up (or that OPENAI_API_KEY is exported).
openai.api_key = userdata.get('OPENAI_API_KEY')
# Configures the google.generativeai SDK used by the triage model and Gemini judges.
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))
print("‚úÖ API keys set")

In [None]:
class TriageCategory(Enum):
    """Five triage priority levels, numbered 1 (IMMEDIATE) through 5 (NON_URGENT)."""
    # Lower numbers are the more time-critical categories.
    IMMEDIATE = 1
    VERY_URGENT = 2
    URGENT = 3
    STANDARD = 4
    NON_URGENT = 5

@dataclass
class EvaluationCriteria:
    """One judge's verdict on a single triage response.

    The value ranges noted below are the ones requested by JUDGE_PROMPT in this
    notebook; nothing in this class enforces them.
    """
    triage_accuracy: int        # requested as a 1-5 rating
    clinical_reasoning: int     # requested as a 1-5 rating
    safety_assessment: str      # requested as PASS / CONCERN / FAIL
    communication_quality: int  # requested as a 1-5 rating
    efficiency: int             # requested as a 1-5 rating
    confidence: float           # requested in the range 0-1
    justification: str          # free text; "parse error" when the reply could not be parsed

@dataclass
class PatientScenario:
    """A single test case: the patient presentation plus the reference answer."""
    scenario_id: str                 # e.g. "EASY_001"
    description: str                 # short one-line patient summary
    vital_signs: Dict[str, Any]      # the test data uses keys HR, BP, RR, SpO2, Temp, AVPU
    presenting_complaint: str
    medical_history: str
    expected_triage: TriageCategory  # reference category the judges compare against
    expected_reasoning: str          # reference rationale shown to the judges
    difficulty: str                  # the test data uses EASY / MEDIUM / HARD / EXPERT

@dataclass
class ModelResponse:
    """Output of the triage model for one scenario."""
    scenario_id: str
    triage_decision: str   # raw model text, or an "[Error: ...]" placeholder when the call failed
    response_time_ms: int  # wall-clock latency in milliseconds; 0 when the call failed

print("‚úÖ Data structures")

In [None]:
# Benchmark scenarios, two per difficulty tier (EASY, MEDIUM, HARD, EXPERT).
# Arguments are positional and follow the PatientScenario field order:
# scenario_id, description, vital_signs, presenting_complaint, medical_history,
# expected_triage, expected_reasoning, difficulty.
TEST_SCENARIOS = [
    PatientScenario("EASY_001", "72M chest pain to L arm", {"HR": 110, "BP": "160/95", "RR": 24, "SpO2": 94, "Temp": 37.2, "AVPU": "A"}, "Crushing chest pain 30min, sweating, nausea", "HTN, T2DM, MI 2019", TriageCategory.IMMEDIATE, "Classic ACS", "EASY"),
    PatientScenario("EASY_002", "25F sore throat 3d", {"HR": 78, "BP": "118/72", "RR": 14, "SpO2": 99, "Temp": 37.8, "AVPU": "A"}, "Sore throat, mild dysphagia", "No PMH", TriageCategory.NON_URGENT, "Viral pharyngitis", "EASY"),
    PatientScenario("MED_001", "45F abdo pain + fever", {"HR": 98, "BP": "128/82", "RR": 18, "SpO2": 98, "Temp": 38.6, "AVPU": "A"}, "RLQ pain 12h, worsening", "Appendectomy 22yo", TriageCategory.URGENT, "Possible surgical abdomen", "MEDIUM"),
    PatientScenario("MED_002", "68M confusion + cough", {"HR": 102, "BP": "105/65", "RR": 22, "SpO2": 92, "Temp": 38.9, "AVPU": "V"}, "Confusion 6h, green sputum", "COPD", TriageCategory.VERY_URGENT, "Sepsis/CAP", "MEDIUM"),
    PatientScenario("HARD_001", "32F headache + neck stiff", {"HR": 88, "BP": "122/78", "RR": 16, "SpO2": 99, "Temp": 38.2, "AVPU": "A"}, "Worst headache ever, photophobia", "Migraine hx", TriageCategory.IMMEDIATE, "Meningitis/SAH", "HARD"),
    PatientScenario("HARD_002", "55M diabetic foot", {"HR": 92, "BP": "145/88", "RR": 16, "SpO2": 97, "Temp": 37.4, "AVPU": "A"}, "Non-healing ulcer 2wk, redness", "T2DM 15y", TriageCategory.URGENT, "Diabetic foot infection", "HARD"),
    PatientScenario("EXP_001", "78F vague malaise", {"HR": 72, "BP": "138/84", "RR": 18, "SpO2": 96, "Temp": 36.8, "AVPU": "A"}, "Not feeling right 2d", "HTN", TriageCategory.VERY_URGENT, "Atypical MI elderly female", "EXPERT"),
    PatientScenario("EXP_002", "19M agitated nightclub", {"HR": 125, "BP": "155/105", "RR": 22, "SpO2": 98, "Temp": 38.8, "AVPU": "A"}, "Agitation, sweating, dilated pupils", "Unknown", TriageCategory.IMMEDIATE, "Toxidrome", "EXPERT"),
]
print(f"‚úÖ {len(TEST_SCENARIOS)} scenarios")

In [None]:
# Triage model using Gemini 3 Pro
# System instruction given to the model under test.
TRIAGE_PROMPT = """You are NurseSim-Triage. Provide: 1) TRIAGE CATEGORY (1-5), 2) REASONING, 3) ACTIONS"""

# Candidate model IDs in order of preference; the first one that answers a
# probe request becomes the triage model.
GEMINI_MODELS = ['gemini-3-pro', 'gemini-3', 'gemini-2.5-pro', 'gemini-2.0-flash-exp', 'gemini-1.5-pro']
triage_model = None
ACTIVE_MODEL = None

for m in GEMINI_MODELS:
    try:
        # Build into a temporary first: the probe call below is what tells us
        # whether the model can actually be used, so `triage_model` must not
        # be bound until that call has succeeded.
        candidate = genai.GenerativeModel(m, system_instruction=TRIAGE_PROMPT)
        candidate.generate_content("test")
    except Exception as e:
        print(f"   {m}: {str(e)[:40]}")
        continue
    triage_model = candidate
    ACTIVE_MODEL = m
    print(f"‚úÖ Triage: {m}")
    break
else:
    # Every candidate failed: leave triage_model / ACTIVE_MODEL as None and say
    # so explicitly instead of silently keeping a model that cannot answer.
    print("   No Gemini triage model could be reached; triage generation will return errors.")

def generate_triage(s):
    """Send one scenario to the triage model and wrap the reply.

    Args:
        s: scenario exposing ``scenario_id``, ``description``,
            ``presenting_complaint``, ``vital_signs`` and ``medical_history``.

    Returns:
        ModelResponse holding the model's text and the elapsed time in
        milliseconds. If the call fails for any reason, the text is an
        ``[Error: ...]`` placeholder and the elapsed time is 0.
    """
    prompt_lines = (
        f"Patient: {s.description}",
        f"Complaint: {s.presenting_complaint}",
        f"Vitals: {s.vital_signs}",
        f"History: {s.medical_history}",
    )
    prompt = "\n".join(prompt_lines)

    started = time.time()
    try:
        reply = triage_model.generate_content(prompt)
        # Read .text before stopping the clock, and inside the try: accessing
        # it can itself raise.
        answer = reply.text
        elapsed_ms = int((time.time() - started) * 1000)
        return ModelResponse(s.scenario_id, answer, elapsed_ms)
    except Exception as e:
        return ModelResponse(s.scenario_id, f"[Error: {e}]", 0)

In [None]:
# Test triage
# Smoke test: run the first scenario through the triage model and show the
# latency plus the first 300 characters of its reply.
test = generate_triage(TEST_SCENARIOS[0])
print(f"‚úÖ Test: {test.response_time_ms}ms\n{test.triage_decision[:300]}")

In [None]:
# Judges
# System instruction shared by every judge: names the four 1-5 ratings, the
# three-valued safety verdict and the 0-1 confidence, and asks for JSON only.
JUDGE_PROMPT = "Expert Triage Nurse. Rate 1-5: triage_accuracy, clinical_reasoning, communication_quality, efficiency. safety_assessment: PASS/CONCERN/FAIL. confidence: 0-1. Return JSON only."

def parse_eval(raw):
    """Turn a judge's raw reply into an EvaluationCriteria.

    The first JSON object found anywhere in ``raw`` is used, so surrounding
    prose or Markdown code fences are tolerated, and so are objects that
    contain nested braces (e.g. inside the justification text).

    Args:
        raw: text returned by a judge model (anything that is not a string is
            treated as unparseable).

    Returns:
        EvaluationCriteria built from the reply. Missing keys fall back to
        neutral defaults (3 for ratings, "CONCERN", confidence 0.5, empty
        justification). If no JSON object can be decoded, or a value cannot be
        converted, a neutral record with confidence 0.0 and justification
        "parse error" is returned instead. Never raises for bad input.
    """
    fallback = EvaluationCriteria(3, 3, "CONCERN", 3, 3, 0.0, "parse error")
    if not isinstance(raw, str):
        return fallback

    # Try to decode a JSON value starting at each '{' in turn. raw_decode
    # understands nesting and ignores trailing text, which a flat regex cannot.
    decoder = json.JSONDecoder()
    d = None
    start = raw.find('{')
    while start != -1:
        try:
            d, _ = decoder.raw_decode(raw, start)
            break
        except ValueError:
            start = raw.find('{', start + 1)
    if not isinstance(d, dict):
        return fallback

    try:
        return EvaluationCriteria(
            int(d.get('triage_accuracy', 3)),
            int(d.get('clinical_reasoning', 3)),
            # Normalise case/whitespace: results are later compared to 'PASS'.
            str(d.get('safety_assessment', 'CONCERN')).strip().upper(),
            int(d.get('communication_quality', 3)),
            int(d.get('efficiency', 3)),
            float(d.get('confidence', 0.5)),
            d.get('justification', ''),
        )
    except (TypeError, ValueError):
        # A value such as None or "4/5" could not be converted to a number.
        return fallback

def judge_prompt(s, r):
    """Build the user message sent to a judge.

    The message has three lines: the expected triage category name with the
    reference reasoning, the model's triage decision, and a reminder to
    answer in JSON.
    """
    expected_line = f"EXPECTED: {s.expected_triage.name} - {s.expected_reasoning}"
    model_line = f"MODEL: {r.triage_decision}"
    return "\n".join((expected_line, model_line, "Return JSON."))

class GPT5Judge:
    """Judge backed by the OpenAI chat-completions API.

    Attributes:
        client: OpenAI client used for every request.
        models: model IDs tried in order on each evaluation.
        name: display label; replaced by the upper-cased ID of whichever model
            answered the most recent evaluation.
    """

    def __init__(self):
        # A client instance does not read the module-level `openai.api_key`,
        # so hand it over explicitly. When that is None the client falls back
        # to the OPENAI_API_KEY environment variable, as before.
        self.client = openai.OpenAI(api_key=openai.api_key)
        self.models = ['gpt-5', 'gpt-4.5-turbo', 'gpt-4o', 'gpt-4-turbo']
        self.name = "GPT-5"

    def evaluate(self, s, r):
        """Score response ``r`` for scenario ``s``.

        Tries each model in ``self.models`` until one answers. Returns the
        parsed EvaluationCriteria, or the neutral defaults produced by
        ``parse_eval("{}")`` when no model could be used.
        """
        messages = [
            {"role": "system", "content": JUDGE_PROMPT},
            {"role": "user", "content": judge_prompt(s, r)},
        ]
        for m in self.models:
            # Prefer a low temperature for repeatable scores. Some models
            # reject a non-default temperature with a 400 error, so on a bad
            # request retry once with the model's default sampling.
            for sampling in ({"temperature": 0.2}, {}):
                try:
                    resp = self.client.chat.completions.create(model=m, messages=messages, **sampling)
                    self.name = m.upper()
                    return parse_eval(resp.choices[0].message.content)
                except openai.BadRequestError:
                    continue
                except Exception:
                    # Model unavailable, auth/network failure, empty reply...
                    # move on to the next model. (Not a bare `except`, so
                    # Ctrl-C still interrupts the run.)
                    break
        return parse_eval("{}")

class Gemini3Judge:
    """Judge backed by a Gemini model, chosen once at construction time.

    Attributes:
        models: model IDs probed in order of preference.
        name: display label. Stays "Gemini-3" if no model is usable; otherwise
            "Gemini-3 (<model id>)" so the label is unique even when another
            judge ends up on the same underlying model.
        model: the usable GenerativeModel, or None if every probe failed.
    """

    def __init__(self):
        self.models = ['gemini-3', 'gemini-2.5-pro', 'gemini-2.0-flash-exp', 'gemini-1.5-pro']
        self.name = "Gemini-3"
        self.model = None
        for m in self.models:
            try:
                # Only the probe call shows whether the model is usable, so
                # keep it in a temporary until that call has succeeded.
                candidate = genai.GenerativeModel(m, system_instruction=JUDGE_PROMPT)
                candidate.generate_content("test")
            except Exception:
                continue
            self.model = candidate
            self.name = f"Gemini-3 ({m})"
            break

    def evaluate(self, s, r):
        """Score response ``r`` for scenario ``s``.

        Returns the parsed EvaluationCriteria, or the neutral defaults from
        ``parse_eval("{}")`` when no model is available or the call fails.
        """
        if self.model is None:
            return parse_eval("{}")
        try:
            return parse_eval(self.model.generate_content(judge_prompt(s, r)).text)
        except Exception:
            return parse_eval("{}")

class Gemini3ProJudge:
    """Judge backed by a Gemini Pro-tier model, chosen once at construction time.

    Attributes:
        models: model IDs probed in order of preference.
        name: display label. Stays "Gemini-3-Pro" if no model is usable;
            otherwise "Gemini-3-Pro (<model id>)" so the label is unique even
            when another judge ends up on the same underlying model.
        model: the usable GenerativeModel, or None if every probe failed.
    """

    def __init__(self):
        self.models = ['gemini-3-pro', 'gemini-2.5-pro', 'gemini-2.0-flash-exp']
        self.name = "Gemini-3-Pro"
        self.model = None
        for m in self.models:
            try:
                # Only the probe call shows whether the model is usable, so
                # keep it in a temporary until that call has succeeded.
                candidate = genai.GenerativeModel(m, system_instruction=JUDGE_PROMPT)
                candidate.generate_content("test")
            except Exception:
                continue
            self.model = candidate
            self.name = f"Gemini-3-Pro ({m})"
            break

    def evaluate(self, s, r):
        """Score response ``r`` for scenario ``s``.

        Returns the parsed EvaluationCriteria, or the neutral defaults from
        ``parse_eval("{}")`` when no model is available or the call fails.
        """
        if self.model is None:
            return parse_eval("{}")
        try:
            return parse_eval(self.model.generate_content(judge_prompt(s, r)).text)
        except Exception:
            return parse_eval("{}")

print("‚úÖ Judge classes")

In [None]:
# Initialize
# Constructing each judge is what probes for a usable backing model; the
# labels printed below show what each judge ended up named.
judges = [
    GPT5Judge(),
    Gemini3Judge(),
    Gemini3ProJudge(),
]
for j in judges:
    label = j.name
    print(f"  {label}")

In [None]:
# Run evaluation
print("üöÄ Evaluating...\n")
# One result row per scenario: identifying fields plus, for every judge, its
# accuracy rating, reasoning rating and safety verdict.
results = []
for scenario in tqdm(TEST_SCENARIOS):
    response = generate_triage(scenario)
    row = {
        'id': scenario.scenario_id,
        'difficulty': scenario.difficulty,
        'expected': scenario.expected_triage.name,
    }
    for judge in judges:
        verdict = judge.evaluate(scenario, response)
        # Read the judge's name only after evaluate(): a judge may update its
        # name while evaluating.
        prefix = judge.name
        row.update({
            f'{prefix}_acc': verdict.triage_accuracy,
            f'{prefix}_reason': verdict.clinical_reasoning,
            f'{prefix}_safety': verdict.safety_assessment,
        })
        # Short pause between judge calls.
        time.sleep(0.3)
    results.append(row)
df = pd.DataFrame(results)
print("\n‚úÖ Done!")

In [None]:
# Results
print("üìä RESULTS")
print("="*50)
# Per judge: mean accuracy rating and how many scenarios got a PASS safety verdict.
for judge in judges:
    acc_col = f'{judge.name}_acc'
    if acc_col not in df.columns:
        # No results were recorded under this judge's current name.
        continue
    mean_acc = df[acc_col].mean()
    safety_passes = (df[f'{judge.name}_safety'] == 'PASS').sum()
    print(f"{judge.name}: {mean_acc:.2f}/5 | Safety PASS: {safety_passes}/{len(df)}")

In [None]:
# Chart
# Bar chart of mean accuracy rating, one bar per judge that has an accuracy
# column in the results table. `valid` and `acc` stay parallel lists.
valid = []
acc = []
for judge in judges:
    column = f'{judge.name}_acc'
    if column in df.columns:
        valid.append(judge.name)
        acc.append(df[column].mean())

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(valid, acc, color=['#3b82f6', '#10b981', '#8b5cf6'])
ax.set_ylabel('Mean Accuracy')
ax.set_title('Triage Accuracy: GPT-5 vs Gemini 3 vs Gemini 3 Pro')
ax.set_ylim(0, 5)
# Write each bar's value just above it.
for position, mean_score in enumerate(acc):
    ax.text(position, mean_score + 0.1, f'{mean_score:.2f}', ha='center', fontweight='bold')
fig.savefig('gemini3_evaluation.png', dpi=150)
plt.show()

In [None]:
# Write the results table (one row per scenario) to CSV without the index column.
df.to_csv('nursesim_gemini3_eval.csv', index=False)
print("‚úÖ Saved: nursesim_gemini3_eval.csv")
print(f"\nüè• Overall: {np.mean(acc):.2f}/5")