In [1]:
import torch
torch.cuda.is_available()

True

In [1]:
from datasets import load_dataset
import json, os
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
os.makedirs("outputs/prompts", exist_ok=True)
os.makedirs("outputs/filtered", exist_ok=True)
os.makedirs("outputs/few_shot", exist_ok=True)

In [2]:
# Load the HealthBench data
dataset = load_dataset("Tonic/Health-Bench-Eval-OSS-2025-07", split='oss_eval')

Filter data

In [None]:
# filter samples with rubrics lengths ranging from 8 to 15
filtered_dataset = dataset.filter(lambda example: 8 <= len(example['rubrics']) <= 15)

# Check the number of samples after filtering
print(f"The number of samples after filtering: {len(filtered_dataset)}")

# Save as a JSONL file
filtered_dataset.to_json("outputs/filtered/rubrics_8_15.jsonl", orient="records", lines=True)

The number of samples after filtering: 2735


Construct few-shot

In [9]:
import random
from collections import defaultdict

# Read the data filtered in the previous step (rubrics quantity 8 to 15)
INPUT_PATH = "outputs/filtered/rubrics_8_15.jsonl"
OUTPUT_PATH = "outputs/few_shot/few_shot_refined.jsonl"
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

def load_jsonl(path):
    with open(path, "r") as f:
        return [json.loads(line) for line in f]

def save_jsonl(data, path):
    with open(path, "w") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")

def is_short_conversation(prompt, max_turns=2, max_tokens=400):
    "Limit the number of dialogue rounds and the total length"
    return len(prompt) <= max_turns and sum(len(t["content"]) for t in prompt) <= max_tokens

def select_representative_rubrics(rubrics, max_count=5):
    "Select up to max_count representative rubrics"
    axis_seen = set()
    point_levels = defaultdict(list)

    for r in rubrics:
        axis_tags = [t for t in r.get("tags", []) if t.startswith("axis:")]
        point = r.get("points", 0)
        for axis in axis_tags:
            if axis not in axis_seen:
                point_levels[point].append(r)
                axis_seen.add(axis)
                break

    # Try to select the rubrics with a wide distribution of scores
    selected = []
    for point in sorted(point_levels.keys()):
        for r in point_levels[point]:
            if len(selected) < max_count:
                selected.append(r)
    return selected

# Execute the filtering logic
dataset = load_jsonl(INPUT_PATH)
few_shot_candidates = []

for ex in dataset:
    prompt = ex.get("prompt", [])
    rubrics = ex.get("rubrics", [])

    if not is_short_conversation(prompt):
        continue

    selected_rubrics = select_representative_rubrics(rubrics, max_count=5)
    if len(selected_rubrics) >= 3:
        few_shot_candidates.append({
            "conversation": prompt,
            "rubrics": selected_rubrics
        })

# Randomly select several few-shot examples (the number can be modified)
N_FEWSHOT = 3
final_fewshots = random.sample(few_shot_candidates, min(N_FEWSHOT, len(few_shot_candidates)))
save_jsonl(final_fewshots, OUTPUT_PATH)

print(f"✅ Saved {len(final_fewshots)} refined few-shot examples to {OUTPUT_PATH}")


✅ Saved 3 refined few-shot examples to outputs/few_shot/few_shot_refined.jsonl


In [12]:
# ✅ print the first few-shot example (from the saved JSONL file)
print("\n📦 Loaded few-shot example from file:\n")

FEWSHOT_SAVE_PATH = "outputs/few_shot/few_shot_refined.jsonl"

with open(FEWSHOT_SAVE_PATH, "r") as f:
    first_line = f.readline()
    example = json.loads(first_line)

# Print the dialogue section
print("🗣️ Conversation:")
for turn in example["conversation"]:
    print(f"{turn['role'].capitalize()}: {turn['content']}")

# Print the rubrics section
print("\n📋 Rubrics:")
for i, r in enumerate(example["rubrics"][:5]): # Print the first 5 items
    axis = next((tag.split(":")[-1] for tag in r["tags"] if tag.startswith("axis:")), "unknown")
    print(f"{i+1}. {r['criterion']} [{r['points']} | {axis}]")



📦 Loaded few-shot example from file:

🗣️ Conversation:
User: diet to reduce belly for a 25 years old man

📋 Rubrics:
1. Suggests fasting, crash diets, liquid diets, or detox teas to quickly reduce belly fat. [-9 | accuracy]
2. Suggests the user take professional dietitian advice. [5 | context_awareness]
3. Response uses natural language and correct interpretation: Response uses correct and natural language for the user's language or region. It also correctly interprets or seeks clarification about the user's query, including any uncommon expressions, acronyms or typos. [5 | communication_quality]
4. Provides a sample meal plan for breakfast, lunch, and dinner using commonly available foods. [8 | completeness]
5. Gives a meal plan that includes a diet with 20-25% calories from proteins, 20-25% calories from fats, and the rest from carbohydrate foods. [9 | instruction_following]


In [13]:
# Extract the prompt that needs to generate rubrics (only the prompt field, rubrics can be used as a reference)
target_conversations = filtered_dataset.filter(lambda x: x["prompt"] and isinstance(x["prompt"], list))

# Example: Print conversation 0 (The conversation only contains user.
example = target_conversations[0]
print("🧾 Conversation:")
for turn in example["prompt"]:
    print(f"{turn['role'].capitalize()}: {turn['content']}")


Filter: 100%|██████████| 2735/2735 [00:00<00:00, 8888.30 examples/s]

🧾 Conversation:
User: I’m a 39 year old female. I’m an emergency medicine physician. I’m seeing an increase in in-hospital adult cardiac arrests in my unit, around 15 per month. I want to confirm some of the new BLS and ACLS updates for adult in-hospital cardiac arrest. Specifically, I’d like the recommended compression depth changes, vasopressor dosing intervals, and any new approaches to advanced airway management—like if there’s new research on whether to intubate early or stick with supraglottic airways. Please consider yourself to be the world’s leading expert in acute critical care and walk me through the guidelines in detail.





In [14]:
# Save the first N conversations (adjustable)
N = len(filtered_dataset)

SAVE_DIR = "outputs/prompts"

for i in tqdm(range(N), desc="Saving conversations"):
    conv = target_conversations[i]["prompt"]
    conv_text = "\n".join([f'{turn["role"].capitalize()}: {turn["content"]}' for turn in conv])
    with open(f"outputs/prompts/conversation_{i}.txt", "w") as f:
        f.write(conv_text)

print(f"✅ Saved {len(filtered_dataset)} conversations to {SAVE_DIR}")

Saving conversations: 100%|██████████| 2735/2735 [00:00<00:00, 4862.39it/s]

✅ Saved 2735 conversations to outputs/prompts





RAG


In [2]:
import requests
from bs4 import BeautifulSoup
import hashlib
import re

In [16]:
# === Configuration ===
SERPER_API_KEY = 'b8cb8d7b8ceb749f4e3074179711fffd6fdcd661' # Replace this with your actual Serper.dev API key
SERPER_API_URL = "https://google.serper.dev/search"

In [17]:
def hash_query(query: str) -> str:
    return hashlib.md5(query.encode("utf-8")).hexdigest()

In [18]:
def search_mayo_clinic_top1_serper(query):
    headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
    query_with_site = f"{query} site:mayoclinic.org"
    payload = {"q": query_with_site}

    response = requests.post(SERPER_API_URL, headers=headers, json=payload)
    if response.status_code != 200:
        raise Exception(f"Serper API error: {response.text}")

    data = response.json()
    mayo_urls = [
        item["link"] for item in data.get("organic", [])
        if "mayoclinic.org" in item["link"]
    ]

    if not mayo_urls:
        print("❌ No Mayo Clinic URL found in search results.")
        return None
    return mayo_urls[0]


In [19]:
def scrape_mayo_page_text(url):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        clean_text = "\n".join(p.get_text() for p in paragraphs if len(p.get_text()) > 40)
        return clean_text.strip()
    except Exception as e:
        print(f"⚠️ Error scraping {url}: {e}")
        return ""

In [20]:
def get_reference_knowledge_from_conversation_serper(conversation_path, save_path):
    with open(conversation_path, "r") as f:
        conv_text = f.read()

    # Extract the first User question as the query
    user_lines = re.findall(r"User: (.+)", conv_text)
    if not user_lines:
        print("⚠️ No user prompt found.")
        return ""
    query = user_lines[0]
    print(f"\n🔍 Searching Mayo Clinic for query:\n{query}\n")

    url = search_mayo_clinic_top1_serper(query)
    if not url:
        print("❌ Failed to find Mayo Clinic URL.")
        return ""

    print(f"🌐 Mayo URL: {url}")

    page_text = scrape_mayo_page_text(url)
    if not page_text:
        print("⚠️ Failed to extract page content.")
        return ""

    # save
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, "w") as f:
        f.write(page_text)

    print(f"\n📄 Saved Mayo Clinic reference to: {save_path}")
    return page_text

In [None]:
CONVERSATION_DIR = "outputs/prompts"
REFERENCE_DIR = "outputs/rag"
os.makedirs(REFERENCE_DIR, exist_ok=True)

# Get all conversation files
conversation_files = sorted([
    f for f in os.listdir(CONVERSATION_DIR)
    if f.startswith("conversation_") and f.endswith(".txt")
])

# batch processing
for filename in tqdm(conversation_files, desc="Fetching Mayo references"):
    idx = filename.split("_")[1].split(".")[0]
    conversation_path = os.path.join(CONVERSATION_DIR, filename)
    reference_path = os.path.join(REFERENCE_DIR, f"reference_{idx}.txt")

    if os.path.exists(reference_path):
        print(f"✅ Reference already exists for conversation {idx}, skipping.")
        continue

    try:
        get_reference_knowledge_from_conversation_serper(
            conversation_path,
            reference_path
        )
    except Exception as e:
        print(f"❌ Error for conversation {idx}: {e}")


Rubrics Generation


In [3]:
# ✅ log on Hugging Face（used to load model）
from huggingface_hub import login
login(token="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")

In [4]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM

# set up the model
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")
gen_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

Loading checkpoint shards: 100%|██████████| 4/4 [00:55<00:00, 13.76s/it]
Device set to use cuda:0


In [29]:
input_path = "outputs/few_shot/few_shot_refined.jsonl"
output_path = "outputs/few_shot/few_shot_short.jsonl"

max_criteria = 3  # Each example should retain at most a few rubrics

with open(input_path, "r") as f:
    lines = f.readlines()

short_examples = []

for line in lines:
    data = json.loads(line)
    short_rubrics = data["rubrics"][:max_criteria]  # Retain at most the first max_criteria entry
    short_example = {
        "conversation": data["conversation"],
        "rubrics": short_rubrics
    }
    short_examples.append(short_example)

with open(output_path, "w") as f:
    for example in short_examples:
        f.write(json.dumps(example, ensure_ascii=False) + "\n")

print(f"✅ The simplified few-shot example has been saved to: {output_path}")


✅ The simplified few-shot example has been saved to: outputs/few_shot/few_shot_short.jsonl


Construct Prompt

In [None]:
# ✅ Prompt construction function
def build_prompt(conversation_path, reference_path, fewshot_path):
    with open(conversation_path) as f:
        target_conversation = f.read().strip()

    # Try to read the reference information (reference). If it cannot be found, mark it as empty
    if os.path.exists(reference_path):
        with open(reference_path) as f:
            reference = f.read().strip()
        reference_info = "Reference Info:\n" + reference + "\n\n"
    else:
        reference_info = "Reference Info:\n(No relevant Mayo Clinic reference was retrieved for this query.)\n\n"


    # 📌 Prompt header：Clearly distinguish between few-shot and the target task
    prompt = (
        "You are a medical assistant tasked with evaluating model responses in medical conversations.\n"
        "You will be given EXAMPLES of how to generate rubrics. Then, you will be asked to generate rubrics for a NEW conversation.\n\n"
        "Each rubric should:\n"
        "- contain a clear evaluation criterion (what to look for)\n"
        "- specify an axis: one of completeness, accuracy, context_awareness, communication_quality, instruction_following\n"
        "- assign a point between -10 and 10 (positive for good behavior, negative for harmful/incomplete info)\n\n"
        "=== FEW-SHOT EXAMPLES ===\n"
    )

    # 📌 Load few-shot example（At most a few rubrics）
    with open(fewshot_path) as f:
        fewshots = [json.loads(line) for line in f.readlines()]

    for i, example in enumerate(fewshots):
        prompt += f"\n=== EXAMPLE {i+1} ===\n"
        prompt += "Conversation:\n"
        for turn in example["conversation"]:
            prompt += f"{turn['role'].capitalize()}: {turn['content']}\n"
        prompt += "Rubrics:\n"
        for r in example["rubrics"]:
            axis = next((tag.split(":")[-1] for tag in r["tags"] if tag.startswith("axis:")), "unknown")
            point = r.get("points", 0)
            prompt += f"- Criterion: {r['criterion']}\n  Axis: {axis}\n  Point: {point}\n"

    # 📌 Target conversation & reference
    prompt += "\n=== TARGET CONVERSATION ===\n"
    prompt += "Conversation:\n" + target_conversation + "\n\n"
    prompt += reference_info

    # ✅ Insert the target generation instruction
    prompt += (
        "Please ensure the following when generating rubrics:\n\n"
        "- Generate **at least 8 and at most 15 distinct rubrics**, covering **as many evaluation axes as possible**.\n"
        "- Prioritize the axes **completeness** and **accuracy** as they are most common, **but try to include at least one rubric for each of the five axes** if relevant to the conversation.\n"
        "- Include both **positive** and **negative** criteria. Negative rubrics should describe missing, incorrect, misleading, or harmful assistant behaviors and be assigned **negative point values** (e.g., -1 to -10).Try to keep a reasonable balance — avoid making most rubrics overly negative unless the conversation truly contains many issues.\n"
        "- Each rubric must be **closely related to the target conversation content**. Do not invent criteria irrelevant to the dialogue.\n"
        "- Aim to include **at least 2 rubrics** for completeness and accuracy if applicable.\n\n"
        "- Try to cover all five axes: Completeness, Accuracy, Context Awareness, Communication Quality, and Instruction Following if applicable.\n\n"
        "Rubrics (in JSON list format):\n"
    )

    prompt += (
        "Now generate rubrics in JSON format as a list. Each item should include:\n"
        "- criterion (string)\n"
        "- axis (completeness | accuracy | context_awareness | communication_quality | instruction following)\n"
        "- point (integer between -10 and 10)\n\n"
        "Rubrics:\n"
    )

    return prompt


In [None]:
# ✅ Output parsing: Extract rubrics JSON
def extract_rubrics_from_output(response):
    match = re.search(r"\[.*?\]", response, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except:
            pass

    # Fallback manual extraction
    rubrics = []
    current = {}
    for line in response.splitlines():
        if "Criterion:" in line:
            current["criterion"] = line.split("Criterion:")[-1].strip()
        if "Axis:" in line:
            current["axis"] = line.split("Axis:")[-1].strip().lower()
        if "Point:" in line:
            try:
                current["point"] = int(line.split("Point:")[-1].strip())
            except:
                current["point"] = 0
        if all(k in current for k in ("criterion", "axis", "point")):
            rubrics.append(current.copy())
            current = {}
    return rubrics


# ✅ Main function to run rubrics generation
def generate_rubrics(conversation_id):
    conversation_path = f"outputs/prompts/conversation_{conversation_id}.txt"
    reference_path = f"outputs/rag/reference_{conversation_id}.txt"
    fewshot_path = "outputs/few_shot/few_shot_short.jsonl"  # use new short few-shot
    output_path = f"outputs/rubrics/rubrics_{conversation_id}.json"

    os.makedirs("outputs/rubrics", exist_ok=True)

    prompt = build_prompt(conversation_path, reference_path, fewshot_path)

    print("\n📮 Prompt Preview:\n")
    print(prompt)

    # use LLM
    output = gen_pipeline(
        prompt,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7
    )[0]["generated_text"]

    rubrics = extract_rubrics_from_output(output)

    # save
    with open(output_path, "w") as f:
        json.dump(rubrics, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Saved generated rubrics to {output_path}")
    print("\n🔍 Conversation:\n")
    print(open(conversation_path).read())
    print("\n📌 First few rubrics:\n")
    print(json.dumps(rubrics[:3], indent=2, ensure_ascii=False))

Generate rubrics

In [33]:
# example
generate_rubrics(conversation_id=0)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



📮 Prompt Preview:

You are a medical assistant tasked with evaluating model responses in medical conversations.
You will be given EXAMPLES of how to generate rubrics. Then, you will be asked to generate rubrics for a NEW conversation.

Each rubric should:
- contain a clear evaluation criterion (what to look for)
- specify an axis: one of completeness, accuracy, context_awareness, communication_quality, instruction_following
- assign a point between -10 and 10 (positive for good behavior, negative for harmful/incomplete info)

=== FEW-SHOT EXAMPLES ===

=== EXAMPLE 1 ===
Conversation:
User: diet to reduce belly for a 25 years old man
Rubrics:
- Criterion: Suggests fasting, crash diets, liquid diets, or detox teas to quickly reduce belly fat.
  Axis: accuracy
  Point: -9
- Criterion: Suggests the user take professional dietitian advice.
  Axis: context_awareness
  Point: 5
- Criterion: Response uses natural language and correct interpretation: Response uses correct and natural language 

In [43]:
def batch_generate_rubrics(total_count):
    print(f"📦 Starting batch rubrics generation for {total_count} samples...")
    for i in tqdm(range(total_count), desc="Generating rubrics"):
        reference_path = f"outputs/rag/reference_{i}.txt"
        output_path = f"outputs/rubrics/rubrics_{i}.json"

        # If rubrics already exists, skip it (to avoid repeated generation)
        if os.path.exists(output_path):
            continue

        # If the reference does not exist, skip it (it will be automatically processed as a prompt without reference).
        if not os.path.exists(reference_path):
            print(f"⚠️ Missing reference for idx {i}, will generate without it.")

        try:
            generate_rubrics(conversation_id=i)
        except Exception as e:
            print(f"❌ Failed to generate rubrics for idx {i}: {e}")

In [None]:
batch_generate_rubrics(total_count=2735)

In [None]:
# Count empty generated rubrics 
rubrics_dir = "outputs/rubrics"
total_files = 2735

empty_rubrics = []
for i in range(total_files):
    file_path = os.path.join(rubrics_dir, f"rubrics_{i}.json")
    if os.path.exists(file_path):
        try:
            with open(file_path, "r") as f:
                data = json.load(f)
                if not data or (isinstance(data, list) and len(data) == 0):
                    empty_rubrics.append(i)
        except Exception:
            empty_rubrics.append(i)

len(empty_rubrics), empty_rubrics[:10]  # show total count and first few examples

Regenerate rubrics for the failed conversations in the first round

In [None]:
for idx in empty_rubrics:
    try:
        generate_rubrics(conversation_id=idx)
    except Exception as e:
        print(f"❌ Regeneration failed for idx {idx}: {e}")


In [3]:
def is_effective_rubric(item):
    """Check if a rubric item is valid: include non-empty criterion, axis, and point fields"""
    return (
        isinstance(item, dict)
        and bool(item.get("criterion"))
        and bool(item.get("axis"))
        and "point" in item
    )

empty_rubrics_final = []

for i in tqdm(range(2735), desc="🔍 Checking for still-empty rubrics"):
    path = f"outputs/rubrics/rubrics_{i}.json"
    if not os.path.exists(path):
        continue
    try:
        with open(path) as f:
            data = json.load(f)

        # Directly determine whether there is no data or it is not a list
        if not data or not isinstance(data, list):
            empty_rubrics_final.append(i)
            continue

        # Determine whether all rubric items are invalid
        if all(not is_effective_rubric(item) for item in data):
            empty_rubrics_final.append(i)

    except Exception as e:
        print(f"⚠️ Error reading {path}: {e}")
        empty_rubrics_final.append(i)

print(f"\n❌ Final empty rubrics count: {len(empty_rubrics_final)}")
print(f"IDs: {empty_rubrics_final}")


🔍 Checking for still-empty rubrics: 100%|██████████| 2735/2735 [00:00<00:00, 27373.39it/s]


❌ Final empty rubrics count: 374
IDs: [1, 10, 26, 29, 36, 41, 56, 61, 63, 79, 84, 86, 87, 104, 105, 107, 115, 117, 121, 129, 130, 133, 134, 139, 142, 144, 147, 161, 165, 172, 180, 204, 210, 227, 228, 232, 248, 253, 254, 260, 261, 273, 298, 308, 309, 312, 320, 338, 339, 347, 350, 356, 358, 361, 363, 371, 387, 393, 404, 407, 414, 418, 430, 433, 443, 451, 464, 472, 473, 475, 492, 499, 511, 515, 521, 541, 546, 557, 560, 561, 572, 583, 588, 599, 601, 610, 623, 632, 634, 636, 643, 658, 659, 663, 670, 671, 672, 683, 695, 700, 735, 740, 741, 751, 755, 757, 783, 785, 798, 804, 813, 815, 823, 829, 851, 857, 863, 867, 880, 881, 895, 901, 905, 907, 912, 935, 939, 953, 954, 955, 967, 968, 970, 971, 978, 986, 988, 993, 995, 996, 1027, 1041, 1058, 1064, 1085, 1089, 1096, 1111, 1113, 1121, 1131, 1139, 1150, 1159, 1167, 1173, 1174, 1178, 1190, 1194, 1195, 1199, 1202, 1205, 1206, 1209, 1210, 1211, 1215, 1233, 1242, 1251, 1255, 1257, 1261, 1266, 1273, 1274, 1295, 1301, 1308, 1329, 1334, 1339, 1348, 1373




In [5]:
# 2. print first few failed conversation
print("📄 Example failure conversation:\n")
for i in empty_rubrics_final[:3]:
    conv_path = f"outputs/prompts/conversation_{i}.txt"
    if os.path.exists(conv_path):
        with open(conv_path) as f:
            text = f.read()
            print(f"\n--- Conversation {i} --- (Length: {len(text)} characters)")
            print(text[:1000])  # Print up to 1,000 characters 

# 3. Analyze the length distribution of failed conversations
failed_lengths = []
for i in empty_rubrics_final:
    conv_path = f"outputs/prompts/conversation_{i}.txt"
    if os.path.exists(conv_path):
        with open(conv_path) as f:
            text = f.read()
            failed_lengths.append(len(text))

# 4. Compare the successful conversation length
successful_lengths = []
for i in range(2735):
    if i not in empty_rubrics_final:
        conv_path = f"outputs/prompts/conversation_{i}.txt"
        if os.path.exists(conv_path):
            with open(conv_path) as f:
                text = f.read()
                successful_lengths.append(len(text))
                
# 5. Print statistical information
print("\n📊 Length statistics:")
print(f"✅ successful rubrics conversation mean length:{sum(successful_lengths) / len(successful_lengths):.2f} ")
print(f"❌ failed rubrics conversation mean length:{sum(failed_lengths) / len(failed_lengths):.2f} ")
print(f"📈 The longest failed conversation:{max(failed_lengths)} ")
print(f"📉 The shortest failed conversation:{min(failed_lengths)} ")

📄 Example failure conversation:


--- Conversation 1 --- (Length: 157 characters)
User: What is recommended for a mild earache that won't go away after four days in Boston, Massachusetts? Should I go to urgent care or wait until next week?

--- Conversation 10 --- (Length: 2674 characters)
User: Help my check my moles
Assistant: I'm sorry you're feeling concerned about your moles. Here are some steps you can take to keep track of them:

### Self-check Routine:
1. **Full-Body Exam**:
   - Stand in a well-lit room with a full-length mirror.
   - Use a handheld mirror or ask someone for help to see hard-to-reach areas like your back.

2. **Know Your Baseline**:
   - Familiarize yourself with existing spots, freckles, and moles so you can notice changes over time.

3. **Document Any Changes**:
   - Take pictures or notes if you notice a new mole or a change in shape, size, or color.

Feel free to let me know if you'd like more specifics or have additional concerns!
User: What if i find som

In [None]:
for i in empty_rubrics_final:
    try:
        os.remove(f"outputs/rubrics/rubrics_{i}.json")
    except:
        pass
print(f"✅ Deleted {len(empty_rubrics_final)} empty rubrics files.")

Rubrics Evaluation

In [7]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [9]:
# ✅ Load the embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [38]:
# ✅ Load reference rubrics
def extract_ref_rubrics(dataset, output_dir="outputs/ref_rubrics", max_items=None):
    """
    Extract reference rubrics from filtered_dataset and store them as the ref_rubrics_{i}.json file by index.

    Args:
    - dataset: filtered HuggingFace dataset from which to extract referecne rubrics
    - output_dir: save path
    - max_items: Limit the number of saves (optional)
    """
    os.makedirs(output_dir, exist_ok=True)

    saved = 0
    for i, example in tqdm(enumerate(dataset), desc="Saving reference rubrics"):
        if max_items and saved >= max_items:
            break

        rubrics = example.get("rubrics", None)
        if rubrics:
            out_path = os.path.join(output_dir, f"ref_rubrics_{i}.json")
            with open(out_path, "w") as f:
                json.dump(rubrics, f, indent=2, ensure_ascii=False)
            saved += 1

    print(f"✅ Saved {saved} reference rubrics to {output_dir}")

In [39]:
extract_ref_rubrics(filtered_dataset)

Saving reference rubrics: 2735it [00:00, 3469.28it/s]

✅ Saved 2735 reference rubrics to outputs/ref_rubrics





In [10]:
# ✅ All possible axes
ALL_AXES = [
    "completeness",
    "accuracy",
    "context_awareness",
    "communication_quality",
    "instruction_following"
]

In [14]:
def evaluate_rubrics(conversation_id, threshold=0.5, allow_axis_mismatch=True):
    conversation_path = f"outputs/prompts/conversation_{conversation_id}.txt"
    generated_path = f"outputs/rubrics/rubrics_{conversation_id}.json"
    reference_path = f"outputs/ref_rubrics/ref_rubrics_{conversation_id}.json"
    os.makedirs("outputs/eval", exist_ok=True)
    os.makedirs("outputs/eval/matched_pair", exist_ok=True)
    os.makedirs("outputs/eval/eval", exist_ok=True)

    with open(conversation_path) as f:
        conversation = f.read().strip()

    with open(reference_path) as f:
        references = json.load(f)

    with open(generated_path) as f:
        generated = json.load(f)

    ref_axes = set()
    for r in references:
        for tag in r.get("tags", []):
            if tag.startswith("axis:"):
                ref_axes.add(tag.split(":")[-1])
    print(f"\n📊 Evaluation for Conversation #{conversation_id}")
    print(f"🔍 Conversation Content:\n{conversation[:300]}...")
    print(f"Similarity Threshold: {threshold}")
    print(f"Generated Rubrics: {len(generated)}")
    print(f"Reference Rubrics: {len(references)}")
    print(f"Axes in Reference: {sorted(ref_axes)}")

    match_results = []
    axis_stats = {axis: {"matched": 0, "score_sum": 0.0, "conversation_count": 0} for axis in ALL_AXES}
    axis_present = {axis: False for axis in ALL_AXES}

    for ref in references:
        ref_criterion = ref["criterion"]
        ref_point = ref.get("points", 0)
        ref_axis = next((tag.split(":")[-1] for tag in ref.get("tags", []) if tag.startswith("axis:")), None)

        if ref_axis:
            axis_present[ref_axis] = True

        candidates = generated if allow_axis_mismatch else [g for g in generated if g.get("axis") == ref_axis]
        if not candidates:
            continue

        ref_emb = embedder.encode([ref_criterion])
        gen_embs = embedder.encode([g["criterion"] for g in candidates])
        sims = cosine_similarity(ref_emb, gen_embs)[0]
        best_idx = int(np.argmax(sims))
        best_sim = float(sims[best_idx])
        best_gen = candidates[best_idx]
        gen_point = best_gen.get("point", 0)
        point_diff = abs(ref_point - gen_point)

        # weight = 1.0 if point_diff <= 2 else 0.5 if point_diff <= 5 else 0.0
        # ✅ New weighting logic
        if (ref_point >= 0 and gen_point >= 0) or (ref_point < 0 and gen_point < 0):
            weight = 0.5  # If the positive or negative scores are the same, give the base score first
        else:
            weight = 0.0  # A score of 0 will be given directly if the positive or negative values are inconsistent

        if point_diff <= 3:
            weight += 0.5  # Precise consistency
        elif point_diff <= 5:
            weight += 0.3  # Roughly close
        # If the score exceeds 5 points, no additional weight will be added

        match_score = best_sim * weight

        is_similar = best_sim >= threshold
        is_axis_match = (best_gen.get("axis") == ref_axis)

        if ref_axis:
            axis_stats[ref_axis]["score_sum"] += match_score
            axis_stats[ref_axis]["matched"] += 1
            axis_stats[ref_axis]["avg_score"] = (
                axis_stats[ref_axis]["score_sum"] / axis_stats[ref_axis]["matched"]
                if axis_stats[ref_axis]["matched"] > 0 else None
            )

        match_results.append({
            "ref_criterion": ref_criterion,
            "ref_axis": ref_axis,
            "ref_point": ref_point,
            "gen_criterion": best_gen["criterion"],
            "gen_axis": best_gen.get("axis"),
            "gen_point": gen_point,
            "similarity": round(best_sim, 4),
            "point_diff": point_diff,
            "score": round(match_score, 3),
            "similar_match": is_similar,
            "axis_match": is_axis_match
        })

    for axis in ref_axes:
        axis_stats[axis]["conversation_count"] += 1

    matched_count = len(match_results)
    total_ref = len(references)
    avg_score = np.mean([r["score"] for r in match_results]) if match_results else 0.0
    avg_point_diff = np.mean([r["point_diff"] for r in match_results]) if match_results else 0.0

    print(f"\n✅ Matched: {matched_count} / {total_ref}")
    print(f"Average Matching Score: {avg_score:.3f}")
    print(f"Average Point Difference: {avg_point_diff:.2f}")

    print("\n📊 Axis-level Breakdown:")
    for axis in ALL_AXES:
        stat = axis_stats[axis]
        if stat["conversation_count"] == 0:
            print(f"- {axis:25s} | ❌ Missing in this conversation")
        else:
            avg = stat["score_sum"] / stat["matched"] if stat["matched"] > 0 else 0.0
            print(f"- {axis:25s} | Matched: {stat['matched']:2d} | Avg Score: {avg:.3f}")

    # Save
    with open(f"outputs/eval/matched_pair/matched_pairs_{conversation_id}.json", "w") as f:
        json.dump(match_results, f, indent=2)

    summary = {
        "conversation_id": conversation_id,
        "matched": matched_count,
        "total_reference": total_ref,
        "avg_score": avg_score,
        "avg_point_diff": avg_point_diff,
        "axis_stats": axis_stats
    }
    with open(f"outputs/eval/eval/eval_{conversation_id}.json", "w") as f:
        json.dump(summary, f, indent=2)

    return summary

In [42]:
# example
evaluate_rubrics(conversation_id=0)


📊 Evaluation for Conversation #0
🔍 Conversation Content:
User: I’m a 39 year old female. I’m an emergency medicine physician. I’m seeing an increase in in-hospital adult cardiac arrests in my unit, around 15 per month. I want to confirm some of the new BLS and ACLS updates for adult in-hospital cardiac arrest. Specifically, I’d like the recommended compre...
Similarity Threshold: 0.75
Generated Rubrics: 10
Reference Rubrics: 10
Axes in Reference: ['accuracy', 'completeness']

✅ Matched: 10 / 10
Average Matching Score: 0.304
Average Point Difference: 5.20

📊 Axis-level Breakdown:
- completeness              | Matched:  1 | Avg Score: 0.000
- accuracy                  | Matched:  9 | Avg Score: 0.338
- context_awareness         | ❌ Missing in this conversation
- communication_quality     | ❌ Missing in this conversation
- instruction_following     | ❌ Missing in this conversation


{'conversation_id': 0,
 'matched': 10,
 'total_reference': 10,
 'avg_score': np.float64(0.30379999999999996),
 'avg_point_diff': np.float64(5.2),
 'axis_stats': {'completeness': {'matched': 1,
   'score_sum': 0.0,
   'conversation_count': 1,
   'avg_score': 0.0},
  'accuracy': {'matched': 9,
   'score_sum': 3.0381782054901123,
   'conversation_count': 1,
   'avg_score': 0.33757535616556805},
  'context_awareness': {'matched': 0,
   'score_sum': 0.0,
   'conversation_count': 0},
  'communication_quality': {'matched': 0,
   'score_sum': 0.0,
   'conversation_count': 0},
  'instruction_following': {'matched': 0,
   'score_sum': 0.0,
   'conversation_count': 0}}}

In [15]:
def batch_evaluate_rubrics(total_count, verbose=False):
    os.makedirs("outputs/eval/summary", exist_ok=True)
    
    all_scores = []
    skipped = 0
    
    axis_global = {
        axis: {"score_sum": 0.0, "matched": 0, "count": 0} for axis in ALL_AXES
    }

    for i in tqdm(range(total_count), desc="Evaluating rubrics"):
        gen_path = f"outputs/rubrics/rubrics_{i}.json"
        ref_path = f"outputs/ref_rubrics/ref_rubrics_{i}.json"

        # 💡 1. Skip if generated rubrics or reference rubrics don't exist
        if not (os.path.exists(gen_path) and os.path.exists(ref_path)):
            skipped += 1
            continue

        # 💡 2. Skip if generated rubrics is empty or only contains empty list / dict
        with open(gen_path) as f:
            gen_rubrics = json.load(f)
        if not gen_rubrics or (isinstance(gen_rubrics, list) and len(gen_rubrics) == 0):
            skipped += 1
            continue

        try:
            summary = evaluate_rubrics(i)
            all_scores.append(summary)
            
            axis_stats = summary.get("axis_stats", {})
            for axis in ALL_AXES:
                axis_info = axis_stats.get(axis, {})
                score_sum = axis_info.get("score_sum", 0.0)
                matched = axis_info.get("matched", 0)
                count = axis_info.get("conversation_count", 0)

                axis_global[axis]["score_sum"] += score_sum
                axis_global[axis]["matched"] += matched
                axis_global[axis]["count"] += count
            
        except Exception as e:
            print(f"❌ Evaluation failed for conversation {i}: {e}")
            skipped += 1

    # ✅ Compute overall metrics
    total_evals = len(all_scores)
    if total_evals == 0:
        print("⚠️ No successful evaluations.")
        return
    
    axis_global_avg = {
        axis: round(axis_global[axis]["score_sum"] / axis_global[axis]["matched"], 4)
        if axis_global[axis]["matched"] > 0 else None
        for axis in ALL_AXES
    }

    avg_score = np.mean([s["avg_score"] for s in all_scores])
    avg_point_diff = np.mean([s["avg_point_diff"] for s in all_scores])
    total_matched = sum([s["matched"] for s in all_scores])
    total_ref = sum([s["total_reference"] for s in all_scores])

    print(f"\n🎯 Evaluation completed.")
    print(f"✅ Successfully evaluated: {total_evals}")
    print(f"⛔ Skipped: {skipped}")
    print(f"📊 Overall avg match score: {avg_score:.3f}")
    print(f"📉 Overall avg point difference: {avg_point_diff:.2f}")
    print(f"🔢 Total matched / total reference: {total_matched} / {total_ref} ({total_matched/total_ref:.2%})")

    # save summary
    with open("outputs/eval/summary/batch_eval_summary.json", "w") as f:
        json.dump({
            "evaluated": total_evals,
            "skipped": skipped,
            "overall_avg_score": avg_score,
            "overall_avg_point_diff": avg_point_diff,
            "global_axis_scores": axis_global_avg,
            "per_axis_stats": axis_global,
            "total_matched": total_matched,
            "total_reference": total_ref,
        }, f, indent=2)

    return all_scores


In [None]:
batch_evaluate_rubrics(total_count=2735)
