# 📊 NEWS2 Calculation Benchmark: Model Comparison

**Head-to-Head: NurseSim-Triage vs Gemini 3 vs GPT-4o**

Testing which model most accurately calculates NEWS2 scores from vital signs.

---

In [None]:
!pip install -q gradio_client google-generativeai openai pandas matplotlib

In [None]:
import json, re, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Dict
from gradio_client import Client
import google.generativeai as genai
import openai
from google.colab import userdata

# API Setup
# Both keys are read from the Colab secrets store (`google.colab.userdata`).
# NOTE(review): with the openai>=1.0 SDK the module-level `openai.api_key` is
# only consulted by the implicit module-level client; an explicitly constructed
# `openai.OpenAI(...)` needs `api_key=` passed in or OPENAI_API_KEY set in the
# environment - confirm against the installed SDK version.
openai.api_key = userdata.get('OPENAI_API_KEY')
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

print("‚úÖ Setup complete")

In [None]:
# Gold Standard NEWS2 Calculator
def calculate_news2(rr, spo2, on_oxygen, sbp, hr, temp, avpu):
    """Return the aggregate NEWS2 score for one set of observations.

    Every vital sign is scored against a list of ascending, inclusive upper
    bounds; a value above the last bound takes the trailing score. SpO2 is
    scored on Scale 1. Being on oxygen adds 2 points, and any AVPU level
    other than 'A' (case-insensitive) adds 3.
    """
    def band(value, bounds, above):
        # Points for the first band whose inclusive upper bound admits `value`.
        for upper, points in bounds:
            if value <= upper:
                return points
        return above

    total = band(rr, ((8, 3), (11, 1), (20, 0), (24, 2)), 3)
    total += band(spo2, ((91, 3), (93, 2), (95, 1)), 0)
    total += 2 if on_oxygen else 0
    total += band(sbp, ((90, 3), (100, 2), (110, 1), (219, 0)), 3)
    total += band(hr, ((40, 3), (50, 1), (90, 0), (110, 1), (130, 2)), 3)
    total += band(temp, ((35.0, 3), (36.0, 1), (38.0, 0), (39.0, 1)), 2)
    total += 3 if avpu.upper() != 'A' else 0
    return total

print("‚úÖ Gold standard calculator ready")

In [None]:
# Test Cases
@dataclass
class TestCase:
    """One benchmark scenario: a labelled set of vital signs.

    `expected` is declared as a real field (defaulting to None) because the
    notebook fills it in after construction with the reference NEWS2 score.
    Declaring it keeps it visible in `repr`, equality and
    `dataclasses.fields`, and makes a missing score an explicit None rather
    than an AttributeError.
    """
    id: str          # short case identifier, e.g. "LOW_01"
    desc: str        # free-text label for the scenario
    rr: int          # respiratory rate
    spo2: int        # oxygen saturation, percent
    o2: bool         # True when the patient is on supplemental oxygen
    sbp: int         # systolic blood pressure
    hr: int          # heart rate
    temp: float      # temperature, degrees Celsius
    avpu: str        # AVPU level; "A" is the only level that scores 0
    expected: "int | None" = None  # reference NEWS2 total, set after construction

# Scenario rows, in TestCase field order:
# (id, desc, rr, spo2, o2, sbp, hr, temp, avpu)
TESTS = [
    TestCase(*row)
    for row in (
        ("LOW_01", "Stable", 14, 98, False, 125, 72, 36.8, "A"),
        ("LOW_02", "Mild tachycardia", 16, 97, False, 130, 95, 37.2, "A"),
        ("LOW_03", "Post-op fever", 18, 96, False, 118, 88, 38.5, "A"),
        ("MED_01", "Hypoxia + tachy", 20, 94, False, 115, 105, 37.0, "A"),
        ("MED_02", "On O2 + tachypnoea", 22, 95, True, 120, 85, 37.5, "A"),
        ("MED_03", "Sepsis screen", 24, 92, True, 95, 115, 38.8, "A"),
        ("MED_04", "Hypotensive", 20, 96, False, 88, 125, 37.2, "A"),
        ("HIGH_01", "Severe sepsis", 28, 88, True, 82, 135, 39.5, "V"),
        ("HIGH_02", "Resp failure", 32, 85, True, 90, 120, 38.0, "A"),
        ("HIGH_03", "Altered GCS", 18, 96, False, 130, 80, 37.0, "V"),
        ("EDGE_01", "Bradycardia", 12, 99, False, 140, 42, 36.5, "A"),
        ("EDGE_02", "Hypothermia", 14, 97, False, 100, 55, 34.5, "A"),
        ("EDGE_03", "HTN crisis", 16, 98, False, 225, 90, 37.0, "A"),
    )
]

# Attach the reference score from the gold-standard calculator to each case.
for case in TESTS:
    case.expected = calculate_news2(
        rr=case.rr,
        spo2=case.spo2,
        on_oxygen=case.o2,
        sbp=case.sbp,
        hr=case.hr,
        temp=case.temp,
        avpu=case.avpu,
    )

print(f"‚úÖ {len(TESTS)} test cases")

## 🤖 Model Setup

In [None]:
# 1. NurseSim-Triage (your model via HF Space)
print("Connecting to NurseSim-Triage HF Space...")
nursesim_client = None  # stays None when the Space cannot be reached
try:
    nursesim_client = Client("NurseCitizenDeveloper/NurseSim-Triage-Demo")
except Exception as e:
    # Best-effort: report the failure and carry on without this model.
    print(f"‚ö†Ô∏è NurseSim connection failed: {e}")
else:
    print("‚úÖ NurseSim-Triage connected")

In [None]:
# 2. Gemini 3
# Candidates are probed in order; the first one that answers a trivial request
# is used for the whole benchmark.
GEMINI_MODELS = ['gemini-3-pro', 'gemini-3', 'gemini-2.0-flash-exp']
gemini_model = None
gemini_name = None
for m in GEMINI_MODELS:
    try:
        # Building the model object does not prove the model is usable, so
        # probe it before committing. Binding to a temporary keeps
        # `gemini_model` at None when every candidate fails, which is what the
        # `if not gemini_model` guard in query_gemini relies on.
        candidate = genai.GenerativeModel(m)
        candidate.generate_content("test")
    except Exception as e:
        # `Exception` (not a bare except) so Ctrl-C still interrupts the cell.
        print(f"   Gemini '{m}' unavailable: {e}")
        continue
    gemini_model = candidate
    gemini_name = m
    print(f"✅ Gemini: {m}")
    break
else:
    print("⚠️ No Gemini model available; Gemini will be skipped")

# 3. GPT-4o
# An explicit client does not read the module-level `openai.api_key`, so pass
# it in. When it is None the SDK falls back to the OPENAI_API_KEY env var.
gpt_client = openai.OpenAI(api_key=openai.api_key)
print("✅ GPT-4o ready")

In [None]:
# Query functions
# Prompt template shared by the Gemini and GPT queries. Placeholders: rr, spo2,
# o2 ('Yes'/'No'), sbp, hr, temp, avpu. The degree sign is a real "°" so the
# models are not sent mis-encoded text.
NEWS2_PROMPT = """Calculate NEWS2 score for these vitals. Return ONLY the total score as a number.
RR: {rr}, SpO2: {spo2}%, On O2: {o2}, SBP: {sbp}, HR: {hr}, Temp: {temp}°C, AVPU: {avpu}
NEWS2 total score:"""

def extract_number(text):
    """Return the first standalone integer in `text`, or -1 if there is none.

    `text` is coerced with `str()`, so non-string replies are accepted.
    Mentions of the scale itself ("NEWS2", "NEWS 2", "NEWS-2") are blanked out
    first, so the "2" in the scale's name is never reported as the score when
    a model echoes it before the answer.
    """
    cleaned = re.sub(r'\bNEWS[\s-]*2\b', ' ', str(text), flags=re.IGNORECASE)
    match = re.search(r'\b(\d+)\b', cleaned)
    return int(match.group(1)) if match else -1

def query_nursesim(t):
    """Ask the NurseSim-Triage Space for the NEWS2 total of test case `t`.

    Returns the first integer found in the reply, or -1 when no client is
    connected or the request fails (the error is printed).
    """
    score = -1  # sentinel for "no usable answer"
    if nursesim_client:
        try:
            prompt = f"Calculate NEWS2: RR={t.rr}, SpO2={t.spo2}%, O2={'Yes' if t.o2 else 'No'}, SBP={t.sbp}, HR={t.hr}, Temp={t.temp}, AVPU={t.avpu}. Return ONLY the total score."
            score = extract_number(nursesim_client.predict(prompt, api_name="/chat"))
        except Exception as e:
            print(f"   NurseSim error: {e}")
    return score

def query_gemini(t):
    """Ask the selected Gemini model for the NEWS2 total of test case `t`.

    Returns the first integer found in the reply, or -1 when no Gemini model
    is configured or the request fails. Failures are printed, matching
    query_nursesim, so an all -1 column can be diagnosed from the output.
    """
    if not gemini_model:
        return -1
    try:
        prompt = NEWS2_PROMPT.format(rr=t.rr, spo2=t.spo2, o2='Yes' if t.o2 else 'No', sbp=t.sbp, hr=t.hr, temp=t.temp, avpu=t.avpu)
        result = gemini_model.generate_content(prompt)
        return extract_number(result.text)
    except Exception as e:
        # `Exception` rather than a bare except: Ctrl-C must still stop the run.
        print(f"   Gemini error: {e}")
        return -1

def query_gpt(t):
    """Ask GPT-4o for the NEWS2 total of test case `t`.

    Returns the first integer found in the reply, or -1 when the request
    fails. Failures are printed, matching query_nursesim, so an all -1 column
    can be diagnosed from the output.
    """
    try:
        prompt = NEWS2_PROMPT.format(rr=t.rr, spo2=t.spo2, o2='Yes' if t.o2 else 'No', sbp=t.sbp, hr=t.hr, temp=t.temp, avpu=t.avpu)
        resp = gpt_client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=50
        )
        return extract_number(resp.choices[0].message.content)
    except Exception as e:
        # `Exception` rather than a bare except: Ctrl-C must still stop the run.
        print(f"   GPT error: {e}")
        return -1

print("‚úÖ Query functions ready")

## 🔬 Run Benchmark

In [None]:
print("üî¨ Running NEWS2 Benchmark...\n")
results = []

for t in TESTS:
    print(f"{t.id}: Expected={t.expected}", end=" ")
    
    # Query all models
    ns = query_nursesim(t)
    gm = query_gemini(t)
    gp = query_gpt(t)
    
    print(f"| NurseSim={ns} | Gemini={gm} | GPT={gp}")
    
    results.append({
        'case': t.id,
        'expected': t.expected,
        'nursesim': ns,
        'gemini': gm,
        'gpt': gp,
        'ns_correct': ns == t.expected,
        'gm_correct': gm == t.expected,
        'gp_correct': gp == t.expected,
    })
    time.sleep(0.5)

df = pd.DataFrame(results)
print("\n‚úÖ Benchmark complete!")

In [None]:
# Per-model accuracy summary
print("\nüìä NEWS2 CALCULATION ACCURACY")
print("=" * 50)

# (display name, exact-match column, prediction column)
models = [
    ('NurseSim-Triage', 'ns_correct', 'nursesim'),
    (gemini_name or 'Gemini', 'gm_correct', 'gemini'),
    ('GPT-4o', 'gp_correct', 'gpt')
]

summary = {}
for name, col, pred_col in models:
    # Negative predictions are the error sentinel; leave them out of the stats.
    valid = df.loc[df[pred_col] >= 0]
    if not len(valid):
        continue  # nothing usable from this model, so it gets no summary entry

    accuracy = valid[col].mean() * 100
    mae = (valid['expected'] - valid[pred_col]).abs().mean()
    summary[name] = dict(accuracy=accuracy, mae=mae, n=len(valid))

    print(f"\n{name}:")
    print(f"  Exact Match: {valid[col].sum()}/{len(valid)} ({accuracy:.1f}%)")
    print(f"  Mean Abs Error: {mae:.2f}")

In [None]:
# Visualization: side-by-side bar charts of accuracy and mean absolute error.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

names = list(summary.keys())

# Pin each colour to a model, not to a bar position: `summary` omits models
# that produced no valid score, so slicing the palette by position would
# recolour the remaining bars.
palette = ['#ef4444', '#10b981', '#3b82f6']  # Red for NurseSim, Green Gemini, Blue GPT
model_colors = {model_name: shade for (model_name, _, _), shade in zip(models, palette)}
colors = [model_colors.get(n, '#6b7280') for n in names]  # grey for any unknown name

# Accuracy comparison
ax1 = axes[0]
accs = [summary[n]['accuracy'] for n in names]
bars = ax1.bar(names, accs, color=colors)
ax1.set_ylabel('Accuracy %')
ax1.set_title('NEWS2 Calculation Accuracy')
ax1.set_ylim(0, 110)  # headroom so the label above a 100% bar stays inside the axes
for bar, val in zip(bars, accs):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2, f'{val:.0f}%', ha='center', fontweight='bold')

# MAE comparison
ax2 = axes[1]
maes = [summary[n]['mae'] for n in names]
bars2 = ax2.bar(names, maes, color=colors)
ax2.set_ylabel('Mean Absolute Error')
ax2.set_title('NEWS2 Calculation Error (lower is better)')
for bar, val in zip(bars2, maes):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1, f'{val:.2f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('news2_model_comparison.png', dpi=150)
plt.show()

In [None]:
# Per-case table: the reference score next to each model's raw prediction.
print("\nüìã Detailed Results")
print(df.loc[:, ['case', 'expected', 'nursesim', 'gemini', 'gpt']].to_string(index=False))

In [None]:
# Generate Report: a Markdown summary plus per-case table, printed and saved.
from datetime import datetime

# Highest accuracy wins; ties are broken by the lower mean absolute error (and
# then by insertion order). `default=None` keeps the cell from raising
# ValueError when no model returned a valid score and `summary` is empty.
winner = max(
    summary,
    key=lambda x: (summary[x]['accuracy'], -summary[x]['mae']),
    default=None,
)

report = f"""# NEWS2 Calculation Benchmark Report
**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M')}

## Summary

| Model | Accuracy | Mean Abs Error |
|-------|----------|----------------|
"""
for name in summary:
    star = "⭐" if name == winner else ""
    report += f"| {name} {star} | {summary[name]['accuracy']:.1f}% | {summary[name]['mae']:.2f} |\n"

if winner is None:
    winner_line = "**Winner**: n/a (no model returned a valid score)"
else:
    winner_line = f"**Winner**: {winner} with {summary[winner]['accuracy']:.1f}% accuracy"

report += f"""
{winner_line}

## Detailed Results

| Case | Expected | NurseSim | Gemini | GPT |
|------|----------|----------|--------|-----|
"""
# A tick marks an exact match; otherwise the model's raw prediction is shown
# (-1 means the query failed).
for _, row in df.iterrows():
    ns = "✅" if row['ns_correct'] else str(row['nursesim'])
    gm = "✅" if row['gm_correct'] else str(row['gemini'])
    gp = "✅" if row['gp_correct'] else str(row['gpt'])
    report += f"| {row['case']} | {row['expected']} | {ns} | {gm} | {gp} |\n"

report += "\n---\n*NurseSim-Triage NEWS2 Benchmark | practicedev.cloud*"

print(report)
# The report contains emoji, so write it as UTF-8 regardless of platform default.
with open('news2_comparison_report.md', 'w', encoding='utf-8') as f:
    f.write(report)
print("\n✅ Saved: news2_comparison_report.md")

In [None]:
# Persist the per-case results table (without the index column) as CSV.
df.to_csv(path_or_buf='news2_comparison_results.csv', index=False)
print("‚úÖ Saved: news2_comparison_results.csv")