# 🧪 Relational Ai for Nursing Evaluation Notebook

This notebook evaluates the fine-tuned nursing model using Azure GPT-4o as an "Expert Judge".

**Model:** `NurseCitizenDeveloper/nursing-llama-3-8b-fons`

In [None]:
# 1. Install Dependencies (Run this first, then restart runtime)
!pip install -U bitsandbytes transformers accelerate langchain-openai -q
print("‚úÖ Installed! Now go to Runtime ‚Üí Restart runtime, then run Cell 2")

In [None]:
# 2. Load Model from Hugging Face
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository id of the fine-tuned checkpoint that this notebook evaluates.
HF_MODEL = "NurseCitizenDeveloper/nursing-llama-3-8b-fons"
print(f"üîÑ Loading model: {HF_MODEL}")

# Options forwarded to from_pretrained: automatic device placement and
# half-precision (float16) weights.
_MODEL_LOAD_KWARGS = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
}

# The tokenizer and the model are both resolved from the same repository id.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
model = AutoModelForCausalLM.from_pretrained(HF_MODEL, **_MODEL_LOAD_KWARGS)
print("‚úÖ Model loaded successfully!")

In [None]:
# 3. Setup Azure OpenAI Judge
import os
from getpass import getpass

from langchain_openai import AzureChatOpenAI
from langchain_core.messages import HumanMessage

# Placeholder text that an earlier revision of this cell wrote into the
# environment; it may still be present in a long-lived kernel, so it is
# treated the same as "no key configured".
_PLACEHOLDER_KEY = "YOUR_AZURE_KEY"

# Non-secret settings: respect values already exported in the environment and
# only fall back to the notebook defaults when nothing is set.
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://nursing-brain-uk-685.openai.azure.com/")
os.environ.setdefault("AZURE_OPENAI_DEPLOYMENT", "gpt-4o")
os.environ.setdefault("AZURE_OPENAI_API_VERSION", "2024-08-01-preview")

# Secret: never hardcode it in the notebook. Use the key from the environment
# when one is present; otherwise prompt for it without echoing, so it does not
# end up in the cell source or its saved output.
if os.environ.get("AZURE_OPENAI_API_KEY", "").strip() in ("", _PLACEHOLDER_KEY):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass("Azure OpenAI API key: ").strip()

# Fail here with a clear message instead of at the first judge call later on.
if os.environ["AZURE_OPENAI_API_KEY"] in ("", _PLACEHOLDER_KEY):
    raise ValueError(
        "AZURE_OPENAI_API_KEY is not set. Export it before starting the kernel "
        "or enter it when prompted."
    )

# Judge client used by the evaluation cell; all connection details are read
# back from the environment variables configured above.
llm = AzureChatOpenAI(
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)
print("‚úÖ Azure GPT-4o Judge ready!")

In [None]:
# 4. Define Test Cases
# Alpaca-style template with three positional slots, filled in order:
# instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Each row is (instruction, input); the rows are expanded into dicts keyed by
# _CASE_FIELDS, in that order.
_CASE_FIELDS = ("instruction", "input")
_CASE_ROWS = (
    (
        "Summarize the key nursing interventions for a patient with delirium.",
        "Patient is an 85-year-old male with acute confusion, fluctuating consciousness, and visual hallucinations.",
    ),
    (
        "What are the FONS principles for person-centred care?",
        "A nurse is documenting care for a patient with dementia.",
    ),
    (
        "Explain why skin tone documentation is important in pressure ulcer risk assessment.",
        "Using the Braden Scale for a patient with darker skin.",
    ),
    (
        "How should a nurse communicate using person-centred language?",
        "Writing clinical notes about a patient with mental health needs.",
    ),
    (
        "Describe the ADPIE nursing process.",
        "Training a new nursing student on documentation.",
    ),
)

test_cases = [dict(zip(_CASE_FIELDS, row)) for row in _CASE_ROWS]
print(f"üìã {len(test_cases)} test cases loaded")

In [None]:
# 5. Run Evaluation
# For every test case: generate an answer with the fine-tuned model, ask the
# Azure judge to score it, print both, and collect everything in `results`.
print("\n" + "="*60)
print("üèÅ Relational Ai for Nursing EVALUATION")
print("="*60)

# Sampling is enabled below (do_sample=True), so seed torch's RNG to make a
# re-run of this cell repeatable. NOTE(review): exact repeatability may still
# vary across hardware / library versions.
EVAL_SEED = 42
torch.manual_seed(EVAL_SEED)

# Rebuilt from scratch on every run so re-executing the cell does not append
# duplicates.
results = []

for i, case in enumerate(test_cases, 1):
    print(f"\n--- Test {i}/{len(test_cases)} ---")
    print(f"üìù Instruction: {case['instruction']}")

    # Generate response. The response slot of the template is left empty so
    # the model continues from "### Response:".
    prompt = alpaca_prompt.format(case["instruction"], case["input"], "")
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # For a causal LM the generated sequence starts with the prompt tokens.
    # Drop them by length and decode only the continuation. Splitting the
    # decoded text on "### Response:" and keeping the last piece would discard
    # the real answer whenever the model emits that header again itself.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    print(f"ü§ñ Model Response: {response[:300]}...")

    # Azure Judge Evaluation
    eval_prompt = f"""You are an expert nursing educator. Evaluate this AI response on a scale of 1-10:

1. **Clinical Accuracy** (1-10): Is the information clinically correct?
2. **Person-Centred Language** (1-10): Does it use respectful, dignified language?
3. **FONS Alignment** (1-10): Does it reflect FONS principles (relational care, practice development)?

**Instruction:** {case['instruction']}
**Context:** {case['input']}
**Model Response:** {response}

Provide scores and brief rationale for each:"""

    evaluation = llm.invoke([HumanMessage(content=eval_prompt)])
    print(f"\n‚öñÔ∏è Expert Evaluation:\n{evaluation.content}")
    print("-" * 50)

    # Keep the full (untruncated) response and the judge's text for the
    # summary cell.
    results.append({
        "test": case["instruction"],
        "response": response,
        "evaluation": evaluation.content
    })

print("\n" + "="*60)
print("‚úÖ EVALUATION COMPLETE")
print("="*60)

In [None]:
# 6. Summary Report
# Compact recap of the evaluation run: one entry per collected result, showing
# a truncated instruction (50 chars) and a truncated response (100 chars).
print("\nüìä EVALUATION SUMMARY")
print("=" * 40)

for idx, entry in enumerate(results, start=1):
    instruction_head = entry["test"][:50]
    response_head = entry["response"][:100]
    print(f"\nTest {idx}: {instruction_head}...")
    print(f"Response preview: {response_head}...")