# 🧪 Relational Ai for Nursing Multi-Judge Evaluation

**Judges:**
- 🔵 **GPT-5.2** (Azure OpenAI)
- 🟢 **Gemini** (Google AI – tries Gemini 3 Pro first, then falls back to 2.5 Pro, 2.0 Flash, and 1.5 Pro)

**Model:** `NurseCitizenDeveloper/nursing-llama-3-8b-fons`

In [None]:
# 1. Install Dependencies
# Notebook shell escape (`!`): runs pip to install or upgrade (`-U`) the listed
# packages, with `-q` reducing pip's output.
!pip install -U bitsandbytes transformers accelerate openai langchain-google-genai google-generativeai -q
# Reminder for the user: a runtime restart may be needed before moving on to Cell 2.
print("‚úÖ Installed! Restart runtime if needed, then run Cell 2")

In [None]:
# 2. Load Model from Hugging Face
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository id of the model under evaluation.
HF_MODEL = "NurseCitizenDeveloper/nursing-llama-3-8b-fons"
print(f"üîÑ Loading model: {HF_MODEL}")

# Tokenizer first, then the weights: loaded in float16 with device placement
# delegated to `device_map="auto"`.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)
print("‚úÖ Model loaded!")

In [None]:
# 3. Setup GPT-5.2 Judge (Azure)
import os

from openai import AzureOpenAI

# The API key is taken from the AZURE_OPENAI_API_KEY environment variable when
# it is set, so a real secret never has to be pasted into (and saved with) the
# notebook. The original placeholder is kept as the fallback, so the cell
# behaves exactly as before when the variable is absent.
gpt5_client = AzureOpenAI(
    api_version="2024-12-01-preview",
    azure_endpoint="https://ai-lincoln0303ai530606275924.cognitiveservices.azure.com/",
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "YOUR_AZURE_OPENAI_API_KEY"),
)
print("‚úÖ GPT-5.2 Judge ready!")

In [None]:
# 4. Setup Gemini Judge (Google) - Auto-detects best available model
import os
import google.generativeai as genai

# setdefault keeps a GOOGLE_API_KEY that is already present in the environment
# instead of overwriting it with the placeholder; the placeholder is only used
# when nothing was configured.
os.environ.setdefault("GOOGLE_API_KEY", "YOUR_GEMINI_API_KEY")  # Secret removed for security
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Both names are always bound after this cell: they stay None when no
# candidate model answers, so later cells can reference them without a
# NameError.
gemini_model_name = None
gemini_judge = None

# Try Gemini 3 Pro first, then progressively older fallbacks. Each candidate is
# probed with one tiny request; the first one that answers becomes the judge.
for model_name in ["gemini-3-pro", "gemini-2.5-pro", "gemini-2.0-flash", "gemini-1.5-pro"]:
    try:
        test_model = genai.GenerativeModel(model_name)
        test_model.generate_content("test")
    except Exception as e:
        # Deliberately broad: any failure of the probe just means "try the
        # next candidate". Only the first 50 characters of the error are shown.
        print(f"‚ö†Ô∏è {model_name} not available: {str(e)[:50]}")
    else:
        gemini_model_name = model_name
        gemini_judge = test_model
        print(f"‚úÖ Gemini Judge ready: {model_name}")
        break

if gemini_judge is None:
    print("‚ùå No Gemini model available")

In [None]:
# 5. Define Test Cases
# Prompt wrapper with three positional slots, filled in order with the
# instruction, the input/context, and the response text.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Each case is an (instruction, input) pair expanded into the dict shape the
# evaluation loop reads via case["instruction"] / case["input"].
test_cases = [
    {"instruction": task, "input": scenario}
    for task, scenario in (
        (
            "Summarize key nursing interventions for a patient with delirium.",
            "Patient is an 85-year-old male with acute confusion and visual hallucinations.",
        ),
        (
            "What are the FONS principles for person-centred care?",
            "A nurse is documenting care for a patient with dementia.",
        ),
        (
            "Explain why skin tone documentation is important in pressure ulcer risk assessment.",
            "Using the Braden Scale for a patient with darker skin.",
        ),
        (
            "Describe the ADPIE nursing process.",
            "Training a new nursing student on documentation.",
        ),
    )
]

# Rubric sent to each judge; the named fields {instruction}, {context} and
# {response} are substituted per test case.
eval_prompt_template = """You are an expert nursing educator. Evaluate this AI response on a scale of 1-10:

1. **Clinical Accuracy** (1-10): Is the information clinically correct?
2. **Person-Centred Language** (1-10): Does it use respectful, dignified language?
3. **FONS Alignment** (1-10): Does it reflect FONS principles (relational care, practice development)?

**Instruction:** {instruction}
**Context:** {context}
**Model Response:** {response}

Provide ONLY the three scores in this exact format:
Accuracy: X/10
Person-Centred: X/10
FONS: X/10
Brief rationale:"""

print(f"üìã {len(test_cases)} test cases loaded")

In [None]:
# 6. Run Multi-Judge Evaluation
# For every test case: generate an answer with the local model, send the same
# rubric prompt to both judges, print their verdicts, and collect everything
# in `results` (keys: "test", "response", "gpt5", "gemini").
print("\n" + "="*70)
print(f"üèÅ Relational Ai for Nursing MULTI-JUDGE EVALUATION (GPT-5.2 vs {gemini_model_name})")
print("="*70)

results = []

for i, case in enumerate(test_cases, 1):
    print(f"\n{'='*70}")
    print(f"Test {i}/{len(test_cases)}: {case['instruction']}")
    print("="*70)

    # Generate response from our model
    prompt = alpaca_prompt.format(case["instruction"], case["input"], "")
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
    # The generated sequence starts with the prompt tokens. Decode only what
    # follows them, rather than splitting the full text on "### Response:",
    # which picks the wrong segment whenever the model emits that header again.
    prompt_token_count = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True).strip()

    print(f"\nü§ñ Model Response: {response[:250]}...")

    eval_prompt = eval_prompt_template.format(
        instruction=case["instruction"],
        context=case["input"],
        response=response
    )

    # GPT-5.2 Evaluation
    # A judge failure is reported and recorded as "N/A" so one bad call does
    # not abort the remaining test cases.
    print("\nüîµ GPT-5.2 Judge:")
    try:
        gpt5_response = gpt5_client.chat.completions.create(
            model="gpt-5.2-chat",
            messages=[{"role": "user", "content": eval_prompt}],
            max_tokens=500
        )
        # message.content can be None; store "N/A" instead so the summary
        # cell can slice the value safely.
        gpt5_eval = gpt5_response.choices[0].message.content or "N/A"
        print(gpt5_eval)
    except Exception as e:
        print(f"Error: {e}")
        gpt5_eval = "N/A"

    # Gemini Evaluation (same best-effort handling as above)
    print(f"\nüü¢ {gemini_model_name} Judge:")
    try:
        gemini_response = gemini_judge.generate_content(eval_prompt)
        gemini_eval = gemini_response.text
        print(gemini_eval)
    except Exception as e:
        print(f"Error: {e}")
        gemini_eval = "N/A"

    results.append({
        "test": case["instruction"],
        "response": response,
        "gpt5": gpt5_eval,
        "gemini": gemini_eval
    })

print("\n" + "="*70)
print("‚úÖ MULTI-JUDGE EVALUATION COMPLETE")
print("="*70)

In [None]:
# 7. Summary Comparison
# Replays the stored verdicts side by side: the instruction is cut to 40
# characters and each judge's text to 300 characters.
print(f"\nüìä SUMMARY: GPT-5.2 vs {gemini_model_name}")
print("="*60)
for idx, entry in enumerate(results, start=1):
    heading = entry['test'][:40]
    print(f"\n--- Test {idx}: {heading}... ---")
    gpt5_excerpt = entry['gpt5'][:300]
    print(f"\nüîµ GPT-5.2:\n{gpt5_excerpt}")
    gemini_excerpt = entry['gemini'][:300]
    print(f"\nüü¢ Gemini:\n{gemini_excerpt}")
    print("="*60)