# Data Extraction

In [1]:
import re, pandas as pd
from glob import glob
from PyPDF2 import PdfReader
import time


In [None]:
# Labels of the plan-summary fields, written as a regex alternation. The same
# alternation is used to capture a label and to detect where its value ends.
_METADATA_LABELS = (
    "Main Goal|Training Level|Program Duration|"
    "Days Per Week|Time Per Workout|Equipment|Author|Target Gender"
)

# Everything from the start of the text up to the site banner line.
_TABLES_PATTERN = re.compile(
    r'(?s)\A(.+?)(?=\nMUSCLEANDSTRENGTH\.COM)',
)

# "<label>: <value>" pairs. A value cannot contain ':' and ends right before the
# next known label or at the end of the text.
# NOTE(review): `$` is not in MULTILINE mode, so the last label on the page
# captures everything up to the end of the text, and a label whose value is
# followed by a ':' before any other label is not matched at all.
_METADATA_PATTERN = re.compile(
    rf"(?P<label>{_METADATA_LABELS}):\s*"
    r"(?P<value>[^:]+?)(?="
    rf"(?:{_METADATA_LABELS}):|$)"
)

# Preferred description layout: banner, anything, a line ending in "Tools",
# then the description up to the "Main Goal:" line.
_DESC_AFTER_TOOLS_PATTERN = re.compile(
    r"""MUSCLEANDSTRENGTH\.COM[\s\S]*?Tools\s*\n
    (?P<desc>[\s\S]+?)
    (?=\nMain\s+Goal:)
    """,
    re.VERBOSE | re.MULTILINE
)

# Fallback layout: the description starts on the line right after the banner.
_DESC_FALLBACK_PATTERN = re.compile(
    r"MUSCLEANDSTRENGTH\.COM\s*\r?\n"
    r"(?P<desc>.*?)"
    r"(?=\r?\nMain\s+Goal:)",
    re.DOTALL
)


def _parse_workout_text(raw_text, pdf_path):
    """Extract tables, summary fields and description from one page of plan text.

    Returns a dict whose keys are, in order: 'pdf_path', 'tables', every
    metadata label found in the text, 'Description' and 'raw_text'.
    Sections that cannot be located are stored as None.
    """
    data = {'pdf_path': pdf_path}

    tables_match = _TABLES_PATTERN.search(raw_text)
    data['tables'] = tables_match.group(1).strip() if tables_match else None

    # A label that occurs more than once keeps its last value.
    for field in _METADATA_PATTERN.finditer(raw_text):
        data[field.group("label").strip()] = field.group("value").strip()

    # Try the "Tools" layout first and only then the plain fallback.
    desc_match = (
        _DESC_AFTER_TOOLS_PATTERN.search(raw_text)
        or _DESC_FALLBACK_PATTERN.search(raw_text)
    )
    data['Description'] = desc_match.group("desc").strip() if desc_match else None

    data['raw_text'] = raw_text
    return data


def extract_workout_from_pdf(pdf_path):
    """Parse the first page of a workout-plan PDF into a one-row DataFrame.

    Only page 0 is read. The row contains the source path, the text that
    precedes the site banner ('tables'), each summary field found on the page,
    the plan description and the full extracted text ('raw_text').

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``PdfReader``.

    Returns:
        pandas.DataFrame with a single row and one column per extracted item.
        A PDF without pages, or whose first page yields no text, produces a
        row with an empty 'raw_text' and missing values for the parsed
        sections instead of raising.
    """
    reader = PdfReader(pdf_path)
    pages = reader.pages
    # Normalise "no pages" and "no extractable text" to an empty string so the
    # regex searches below always receive a str.
    raw_text = (pages[0].extract_text() or "") if len(pages) > 0 else ""

    data = _parse_workout_text(raw_text, pdf_path)
    return pd.DataFrame.from_dict(data, orient='index').T


In [None]:
# Sort the matches: glob() returns paths in arbitrary, OS-dependent order, and
# later cells pair rows of master_df with rows of other frames by position.
# NOTE(review): without recursive=True, '**' matches a single directory level
# (same as '*'); confirm ./samples/<group>/<file>.pdf is the complete layout.
pdf_paths = sorted(glob('./samples/**/*.pdf'))

# One single-row frame per PDF, stacked into one table with a fresh 0..N-1 index.
all_plans = [extract_workout_from_pdf(pdf) for pdf in pdf_paths]

master_df = pd.concat(all_plans, ignore_index=True)

In [3]:
from langchain_core.messages import SystemMessage, HumanMessage

def improve_baseline(raw_text, model):
    """Ask ``model`` to rewrite a raw workout plan as a structured Markdown plan.

    Args:
        raw_text: Unstructured plan text; appended verbatim to the request.
        model: Chat model exposing ``invoke(messages)``; the reply must have
            a ``content`` attribute.

    Returns:
        The ``content`` of the model's reply.
    """
    formatting_rules = """
You are an expert fitness coach and formatter. When given a block of unstructured “raw” workout text, you must:

1. Parse out each day or section header (e.g. “Monday – Back & Chest”)

2. Under each day, display the exercises in a clean table or bullet list with columns for Exercise, Sets, Reps (and Notes if present).

3. Preserve any supersets, rest-pause notes, or AMAP indicators.

4. At the end, include a brief “Overview” paragraph that summarizes the weekly split, total days per week, and main focus.

5. Output in Markdown so it’s immediately human-readable.
"""
    request = (
        "Here is a raw baseline workout plan. Please reorganize it into a well-structured, human-readable weekly plan following the system instructions:\n"
        f"{raw_text}"
    )

    # System rules first, then the plan to reformat.
    conversation = [
        SystemMessage(content=formatting_rules),
        HumanMessage(content=request),
    ]
    return model.invoke(conversation).content



In [None]:
# Reformat each extracted plan with the temperature-0 model, one request at a
# time, waiting 10 s after every call (presumably to respect an API rate limit).
baseline_plans = []
for plan_text in master_df['raw_text']:
    improved_plan = improve_baseline(plan_text, gemini_chat)
    baseline_plans.append(improved_plan)
    time.sleep(10)

In [None]:
# Attach the reformatted plans as a new column (mutates master_df in place and
# requires baseline_plans to hold one entry per row, in row order), then write
# the table to disk without the index column.
master_df['improved_text'] = baseline_plans
master_df.to_csv('./data/workouts_data.csv', index=False)

In [2]:
# Rebind master_df to the saved extraction results read back from disk
# (presumably so the PDF parsing and LLM reformatting need not be re-run).
master_df = pd.read_csv('./data/workouts_data.csv')
master_df

Unnamed: 0,pdf_path,tables,Main Goal,Training Level,Program Duration,Days Per Week,Time Per Workout,Equipment,Author,Description,Target Gender,raw_text,improved_text
0,./samples/back/8weekchestback.pdf,Monday - Back & Chest\nExercise Sets Reps\nWor...,Build Muscle,Beginner,8 Weeks,4 Days,45-60 Mins,"Barbell, Bodyweight, \nDumbbells, Machines",Steve Shaw,This back and chest specialization workout is ...,Male & Female,Monday - Back & Chest\nExercise Sets Reps\nWor...,Here is your structured weekly workout plan:\n...
1,./samples/back/backandshoulderwomen.pdf,Workout Routine for Women\nExercise Sets Reps\...,Build Muscle,Beginner,6 Weeks,2 Days,45-60 Mins,"Bodyweight, \nDumbbells",Holly Blumenberg,BACK & SHOULDER WORKOUT ROUTINE \nFOR WOMEN\nT...,Female,Workout Routine for Women\nExercise Sets Reps\...,Here is your reorganized workout plan:\n\n## W...
2,./samples/back/cobraworkout.pdf,Monday: Super Thick Workout\nExercise Sets Rep...,Build Muscle,Intermediate,8 Weeks,2 Days,60-90 Mins,"Barbell, Bodyweight,\nCables, Dumbbells",Coach Dustin Myers,THE COBRA WORKOUT: \nHEAVY HIGH VOLUME BACK PR...,Male & Female,Monday: Super Thick Workout\nExercise Sets Rep...,Here is your reorganized workout plan:\n\n---\...
3,./samples/begineer/12weekfullbodyworkoutroutin...,Weeks 1-3: Total Body Circuit Workout\nDuring ...,Build Muscle,Beginner,12 Weeks,3 Day,30-45 Mins,"Barbell, Bodyweight, \nCables, Dumbbells, Mach...",Doug Lawrenson,12 WEEK FULL BODY WORKOUT \nROUTINE FOR BEGINN...,Male & Female,Weeks 1-3: Total Body Circuit Workout\nDuring ...,Here is your structured 12-week beginner worko...
4,./samples/begineer/4dayupperlowerplantfitnessw...,Planet Fitness Upper Body Workout A\nExercise ...,Build Muscle,Beginner,8 Weeks,4 Days,45 - 70 Mins,"Bodyweight, Cables, \nDumbbells, EZ Bar, Machines",Josh England,4 DAY UPPER/LOWER PLANET FITNESS\nWORKOUT (MAC...,Male & Female,Planet Fitness Upper Body Workout A\nExercise ...,Here is your structured weekly workout plan:\n...
5,./samples/begineer/bodyweightbasics.pdf,Day 1: Upper Body\nExercise Sets Reps\nPull Up...,General Fitness,Beginner,4 Weeks,3 Days,15-30 Mins,Bodyweight,Roger “Rock” Lockridge,BODYWEIGHT BASICS: \n3 DAY BODYWEIGHT WORKOUT ...,Male & Female,Day 1: Upper Body\nExercise Sets Reps\nPull Up...,Here is your reorganized bodyweight workout pl...
6,./samples/begineer/thebest15minutewarmups.pdf,Exercise Sets Reps/Time Rest\nFast-Paced Walk ...,General Fitness,Beginner,,,15 Mins,"Bands, Barbell, \nBodyweight, Foam Roll, Kettl...",Roger “Rock” Lockridge\nFull-Body Focused Warm...,THE BEST 15-MINUTE WARM-UPS\nMaximize your wor...,Male & Female,Exercise Sets Reps/Time Rest\nFast-Paced Walk ...,"Here is your reorganized workout plan, structu..."
7,./samples/chest/12weekstoabiggerbenchpress.pdf,Monday\nExercise Sets Reps\nBench Press 3 See ...,Increase Strength,Beginner,12 Weeks,3 Days,45-60 Mins,"Barbell, Bodyweight, \nDumbbells, EZ Bar, Machine",Jonathan Byrd,How much do you bench? If you feel it’s never ...,Male & Female,Monday\nExercise Sets Reps\nBench Press 3 See ...,# 12 Weeks to a Bigger Bench Press\n\n## Monda...
8,./samples/chest/bestchestworkout.pdf,The Stretch & Push Workout\nExercise Sets Reps...,Build Muscle,Intermediate,8 Weeks,1 Day,45-60 Mins,"Barbell, Bodyweight,\nCables, Dumbbells",Roger “Rock” Lockridge,BEST CHEST WORKOUT: THE PUSH & \nSTRETCH METHO...,Male & Female,The Stretch & Push Workout\nExercise Sets Reps...,Here is your reorganized workout plan:\n\n## T...
9,./samples/chest/pecpounder8weekchestworkouttos...,The Hybrid PRRS™ Method\nExercise Tempo Sets R...,Build Muscle,Intermediate,8 Weeks,1 Day,45-60 Mins,"Barbell, Cables, \nDumbbells, Machines",Eric Broser,PEC-POUNDER: 8 WEEK CHEST WORKOUT \nTO SHATTER...,Male,The Hybrid PRRS™ Method\nExercise Tempo Sets R...,# PEC-POUNDER: 8 WEEK CHEST WORKOUT\n\nThis pr...


# Create Queries

In [3]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# os.getenv returns None when GEMINI_API_KEY is unset; nothing here checks for
# that, so a missing key only surfaces when a model client is used.
gemini_api_key = os.getenv("GEMINI_API_KEY")

In [4]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Single model id shared by every client below.
MODEL_NAME = "gemini-2.5-flash-preview-05-20"

# Three clients that differ only in sampling temperature (0, 0.5 and 1).
# NOTE(review): only gemini_chat is referenced in the rest of this notebook;
# gemini_chat_half and gemini_chat_1 appear unused here.
gemini_chat = ChatGoogleGenerativeAI(model=MODEL_NAME, temperature=0, api_key=gemini_api_key)
gemini_chat_half = ChatGoogleGenerativeAI(model=MODEL_NAME, temperature=0.5, api_key=gemini_api_key)
gemini_chat_1 = ChatGoogleGenerativeAI(model=MODEL_NAME, temperature=1, api_key=gemini_api_key)


In [5]:
from pydantic import BaseModel, Field
from typing import Literal

# Schema for one synthetic app user plus the free-text request that user would
# send. It is handed to with_structured_output right after this class so model
# replies are parsed into this shape.
# The Field descriptions are part of the schema, so treat them as prompt text.
# A class docstring is deliberately not added: pydantic would copy it into the
# schema description and thereby change what the model is sent.
class UserRegistrationQuery(BaseModel):
    # Account credentials (synthetic).
    email: str = Field(description="User's email address")
    password: str = Field(description="A secure password")
    # Identity and demographics; date_of_birth is a plain string, not a date type.
    first_name: str = Field(description="User's given name")
    last_name: str = Field(description="User's family name")
    date_of_birth: str = Field(description="YYYY-MM-DD format")
    sex: Literal["M", "F", "O"] = Field(description="User's gender identity")
    # Body metrics are ints, so fractional values cannot be represented.
    height_cm: int = Field(description="Height in centimeters")
    weight_kg: int = Field(description="Weight in kilograms")
    activity_level: Literal["Sedentary", "Light", "Moderate", "Active", "Very Active"] = Field(description="Typical daily activity level")
    fitness_goals: str = Field(description="User's primary fitness objectives")
    # Natural-language request later sent to the agent as the user's message.
    query: str = Field(description="A 2-3 sentence natural-language request describing their goals, background, and constraints")

# Wrap the temperature-0 client so its replies are parsed into UserRegistrationQuery.
# NOTE(review): the name structured_model is rebound to a PlanComparison wrapper
# in the "LLM as a Judge" section; re-run this cell before generating
# registration records if that later cell has already been executed.
structured_model = gemini_chat.with_structured_output(UserRegistrationQuery)


In [6]:
from langchain_core.messages import SystemMessage, HumanMessage

def create_prompt(metadata, model):
    """Ask ``model`` for a synthetic user and request matching one workout plan.

    Args:
        metadata: Mapping of plan attribute name to value; each pair becomes
            one "- key: value" bullet in the request.
        model: Object exposing ``invoke(messages)``.

    Returns:
        Whatever ``model.invoke`` returns for the two-message conversation.
    """
    instruction_lines = [
        "You are a virtual fitness-app assistant. For the given workout-plan metadata, generate:",
        "1. A plausible user registration record (email, password, first_name, last_name, date_of_birth, sex, height_cm, weight_kg, activity_level, fitness_goals) that fits the plan's focus.",
        "2. A 2-3 sentence user query explaining their background, goals, and constraints.",
        "Output in JSON matching the UserRegistrationQuery Pydantic model exactly.",
    ]
    system_message = SystemMessage(content="\n".join(instruction_lines))

    bullets = [f"- {key}: {value}" for key, value in metadata.items()]
    # Header line, the bullet list, one blank line, then the closing instruction.
    request_parts = [
        "Here is the workout plan metadata:",
        "\n".join(bullets),
        "",
        "Now respond as instructed above.",
    ]
    user_message = HumanMessage(content="\n".join(request_parts))

    return model.invoke([system_message, user_message])



In [8]:
# Plan attributes shown to the model when inventing a matching user.
metadata_columns = [
    "Main Goal",
    "Training Level",
    "Program Duration",
    "Days Per Week",
    "Time Per Workout",
    "Equipment",
    "Target Gender",
    "Description",
]
# One {column: value} dict per plan row, in row order.
metadata = master_df[metadata_columns].to_dict(orient='records')

In [None]:
# Generate one synthetic registration record per plan, 10 s apart
# (presumably to respect an API rate limit).
records = []
for plan_metadata in metadata:
    registration = create_prompt(plan_metadata, structured_model)
    records.append(registration.model_dump())
    time.sleep(10)

# One row per generated user, in the same order as the plans.
records_df = pd.DataFrame(records)

In [None]:
# Persist the generated user records (index column omitted).
records_df.to_csv('./data/records.csv', index=False)

In [5]:
# Rebind records_df to the saved user records read back from disk.
records_df = pd.read_csv('./data/records.csv')
records_df

Unnamed: 0,email,password,first_name,last_name,date_of_birth,sex,height_cm,weight_kg,activity_level,fitness_goals,query,agent_plan
0,john.doe1@example.com,SecureP@ssw0rd!,John,Doe,1995-05-15,M,175,70,Light,"Build muscle and increase strength, especially...",I'm new to structured weight training and want...,It's fantastic that you're ready to embark on ...
1,jane.doe2@example.com,SecurePass123!,Jane,Doe,1998-05-15,F,165,60,Light,"Build muscle, specifically focusing on develop...",I'm a beginner looking to start building muscl...,Hello Jane! It's fantastic that you're ready t...
2,john.doe3@example.com,SecureP@ss123,John,Doe,1992-05-15,M,180,85,Active,Build Muscle,I'm an intermediate lifter looking to signific...,It's fantastic that you're committed to buildi...
3,john.doe4@example.com,SecurePassword123!,John,Doe,2000-05-15,M,175,70,Light,Build muscle and gain strength,I'm a complete beginner to working out and wan...,"Hello John,\n\nWelcome to your fitness journey..."
4,john.doe5@example.com,SecurePassword123!,John,Doe,1995-05-15,M,175,70,Light,Build Muscle,I'm new to weightlifting and looking to build ...,Here is a personalized 4-day workout plan desi...
5,sarah.miller@example.com,SecurePass123!,Sarah,Miller,1992-05-15,F,165,68,Light,"Improve general fitness, build a foundation, a...",I'm new to working out and looking for a simpl...,"Hi Sarah,\n\nIt's fantastic that you're ready ..."
6,alex.smith@example.com,SecureP@ss123,Alex,Smith,1995-05-15,M,175,70,Moderate,"Improve overall fitness, prepare for workouts,...",I'm new to structured fitness and looking for ...,"Hello Alex,\n\nIt's fantastic that you're prio..."
7,john.doe6@example.com,SecureP@ssw0rd!,John,Doe,1995-05-15,M,175,80,Active,"Increase strength, specifically bench press",I'm a beginner looking to significantly increa...,Here is a personalized 3-day-a-week workout pl...
8,john.doe7@example.com,SecureP@ssw0rd!,John,Doe,1995-05-15,M,175,75,Active,Build Muscle,I'm an intermediate lifter looking to signific...,Here's a personalized workout plan designed to...
9,john.doe8@example.com,SecureP@ssw0rd!,John,Doe,1995-05-15,M,180,85,Active,"Build muscle, specifically focusing on chest d...",I've been consistently working out for a while...,It's fantastic that you're consistently workin...


# LLM as a Judge

In [6]:
from pydantic import BaseModel, Field
from typing import Literal

# Verdict schema for one pairwise plan comparison. Each criterion and the
# overall verdict is "A", "B" or "Tie"; in judge() Plan A is the baseline plan
# and Plan B is the agent plan.
# The Field descriptions are part of the schema, so treat them as prompt text.
# A class docstring is deliberately not added: pydantic would copy it into the
# schema description and thereby change what the model is sent.
class PlanComparison(BaseModel):
    relevance: Literal["A", "B", "Tie"] = Field(
        description="Which plan better aligns with the persona's stated goals"
    )
    completeness: Literal["A", "B", "Tie"] = Field(
        description="Which plan provides more thorough information"
    )
    creativity: Literal["A", "B", "Tie"] = Field(
        description="Which plan shows more novel or varied exercise choices"
    )
    safety_and_evidence: Literal["A", "B", "Tie"] = Field(
        description="Which plan is safer and more grounded in exercise science"
    )
    clarity: Literal["A", "B", "Tie"] = Field(
        description="Which plan is more clearly written and well-organized"
    )
    # Aggregate verdict across the five criteria above.
    overall: Literal["A", "B", "Tie"] = Field(
        description="Overall better plan considering all criteria"
    )
    # Free-text justification returned alongside the verdicts.
    rationale: str = Field(
        description="A brief, objective explanation of the judgment"
    )


In [7]:
from langchain_core.messages import SystemMessage, HumanMessage

# Wrap the temperature-0 client so its replies are parsed into PlanComparison.
# judge() reads this module-level name at call time.
# NOTE(review): this rebinds structured_model, which the "Create Queries"
# section bound to a UserRegistrationQuery wrapper; cells there that use
# structured_model will get this judge wrapper if run after this cell.
structured_model = gemini_chat.with_structured_output(PlanComparison)

def judge(query, baseline_plan, agent_plan):
    """Have the structured judge model compare two plans for one user persona.

    The baseline is always presented as "Plan A" and the agent output as
    "Plan B". The module-level ``structured_model`` is looked up when the
    function is called.

    Args:
        query: The user's request, shown to the judge as the persona.
        baseline_plan: Text presented as Plan A.
        agent_plan: Text presented as Plan B.

    Returns:
        Whatever ``structured_model.invoke`` returns for the conversation.
    """
    # NOTE(review): "Organization. without" in the instructions below looks
    # like a typo for a comma; it is kept verbatim so prompts stay identical to
    # the ones that produced previously stored judgments.
    sections = [
        f"Given the User Persona:\n{query}",
        f"Plan A:\n{baseline_plan}",
        f"Plan B:\n{agent_plan}",
        "Evaluate each plan on five dimensions: Relevance, Completeness, Creativity, Safety and Evidence-Grounding, and Clarity and Organization. without bias toward presentation order, response length, or plan names.",
        "For each dimension, choose “A” if Plan A is stronger, “B” if Plan B is stronger, or “Tie” if they are equal. Then decide the overall winner in the same way. Finally, provide a concise rationale.",
        "Output must conform to the PlanComparison schema.",
    ]
    # Sections are separated by exactly one blank line.
    conversation = [
        SystemMessage(content="You are an expert fitness coach."),
        HumanMessage(content="\n\n".join(sections)),
    ]
    return structured_model.invoke(conversation)

In [8]:
from db.db_manager import FitnessDB

# Open the project's FitnessDB on a scratch SQLite file.
# NOTE(review): use_dummy_data=False presumably skips seeding sample rows —
# confirm in db.db_manager.
db_manager = FitnessDB(db_path='./fitness_temp1.db', use_dummy_data=False)

SQLite database './fitness_temp1.db' created/verified with updated schema.


In [9]:
from werkzeug.security import generate_password_hash

# Columns passed to create_user unchanged, under the same keyword names.
profile_fields = (
    'first_name',
    'last_name',
    'date_of_birth',
    'sex',
    'height_cm',
    'weight_kg',
    'activity_level',
    'fitness_goals',
)

# Register every generated user; the plaintext password is hashed before it is
# handed to the database layer.
for i, user in records_df.iterrows():
    db_manager.create_user(
        email=user['email'],
        password_hash=generate_password_hash(user['password']),
        **{field: user[field] for field in profile_fields},
    )

In [10]:
from agents.main import MainGraph

# Build the project's agent graph around the temperature-0 model and the DB.
# NOTE(review): the meaning of last_messages=1 is defined in agents.main and is
# not visible here — confirm before changing it.
agent = MainGraph(gemini_chat, db_manager, last_messages=1).compile()

Making the book retriever ready =====
Creating Database =====


Loading files:   0%|          | 0/2 [00:00<?, ?file/s]

Started parsing the file under job_id 4fc9efdb-636e-4544-a97b-209c7aceb402
..

Loading files:  50%|█████     | 1/2 [00:47<00:47, 47.77s/file]

Started parsing the file under job_id cc87f69a-58e8-49fa-91b9-0f787164a69d
.

Loading files: 100%|██████████| 2/2 [01:26<00:00, 43.15s/file]


There are 1366 chunks to be processed.


Batches of chunks: 100%|██████████| 137/137 [00:00<00:00, 1462.49it/s]


Book Retriever is ready ====


### Compare with raw baseline

In [12]:
# For every generated user: obtain the agent's plan, then have the judge compare
# it (as Plan B) with the raw extracted tables of the same row (as Plan A).
plans, judges = [], []
for i, user in records_df.iterrows():
    # The row label is 0-based; user_id and thread_id are both row label + 1.
    # assumes the DB assigned ids 1..N in row order — TODO confirm
    session_id = i + 1
    agent_state = agent.invoke(
        {"messages": HumanMessage(content=user['query']), 'user_id': session_id},
        config={"configurable": {"thread_id": session_id}},
    )
    # The plan is the content of the last message in the returned state.
    agent_plan = agent_state['messages'][-1].content
    time.sleep(50)
    plans.append(agent_plan)

    verdict = judge(user['query'], master_df['tables'][i], agent_plan).model_dump()
    time.sleep(10)
    judges.append(verdict)

# Store the plans next to their users (mutates records_df) and tabulate verdicts.
records_df['agent_plan'] = plans
judges_df = pd.DataFrame(judges)

Function 'node_task_identifier'


Returned: {'task_type': <TaskType.FITNESS: 'fitness'>} 
Function 'node_check_tasks'
Returned: node_plan 
Function 'node_plan'
Function 'node_log_summary'
Function 'node_summarize'
Returned: {'messages': [], 'user_id': 1} 
Returned: {'log_summary': None} 
Function 'node_search_doc'
Returned: {'messages': [AIMessage(content='', additional_kwargs={'function_call': {'name': 'search_books', 'arguments': '{"query": "strength training for beginners chest and back exercises"}'}}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run-4582aa3d-4ceb-404d-9f52-68487540f649-0', tool_calls=[{'name': 'search_books', 'args': {'query': 'beginner weight training program for muscle gain chest back'}, 'id': '78c904cc-0963-48f6-960c-cd8f2293fdc6', 'type': 'tool_call'}, {'name': 'search_books', 'args': {'query': 'full body workout routine for beginners 3 days a week'}, 'id': '056465da-2400-42db-9c9c-f9686400227d', 'type': 'to

In [17]:
# Display the per-user verdicts of the raw-baseline comparison.
judges_df

Unnamed: 0,relevance,completeness,creativity,safety_and_evidence,clarity,overall,rationale
0,B,B,A,B,B,B,"Plan B is superior due to its comprehensive, b..."
1,B,B,B,B,B,B,Plan B is significantly superior due to its co...
2,B,B,A,B,B,B,Plan B is superior due to its comprehensive na...
3,B,B,A,B,B,B,Plan B is superior due to its comprehensive na...
4,B,B,B,B,B,B,Plan B is significantly better as it provides ...
5,B,B,B,B,B,B,Plan B is superior as it directly addresses al...
6,B,B,A,B,B,B,"Plan B is superior due to its clear structure,..."
7,B,B,B,B,B,B,Plan B is significantly superior due to its co...
8,B,B,B,B,B,B,"Plan B is significantly more comprehensive, re..."
9,B,B,A,B,B,B,"Plan B is a comprehensive, well-structured, an..."


In [15]:
# Overwrite the records file so it also carries the agent_plan column, and save
# the raw-baseline verdicts alongside it.
records_df.to_csv('./data/records.csv', index=False)
judges_df.to_csv('./data/judges.csv', index=False)

### Compare with improved baseline

In [11]:
# Re-judge the stored agent plans (Plan B), this time against the
# LLM-reformatted baseline text of the same row (Plan A).
judges = []
for i, user in records_df.iterrows():
    improved_baseline = master_df['improved_text'][i]
    verdict = judge(user['query'], improved_baseline, user['agent_plan']).model_dump()
    time.sleep(10)
    judges.append(verdict)

judges_df_2 = pd.DataFrame(judges)

In [12]:
# Display the per-user verdicts of the improved-baseline comparison.
judges_df_2

Unnamed: 0,relevance,completeness,creativity,safety_and_evidence,clarity,overall,rationale
0,B,B,A,B,B,B,Plan B is significantly better for a beginner ...
1,B,B,B,B,B,B,Plan B is significantly better as it provides ...
2,B,B,A,B,B,B,Plan B is superior due to its comprehensive na...
3,B,B,A,B,B,B,Plan B is superior for a complete beginner due...
4,Tie,B,Tie,B,B,B,Plan B is significantly more comprehensive and...
5,B,B,B,B,B,B,Plan B is significantly better as it fully adh...
6,B,B,A,B,B,B,Plan B is superior due to its beginner-friendl...
7,B,B,B,B,B,B,Plan B is significantly superior due to its co...
8,B,B,B,B,B,B,"Plan B is significantly more comprehensive, pr..."
9,B,B,A,B,B,B,Plan B is superior due to its comprehensive st...


In [13]:
# Persist the improved-baseline verdicts (index column omitted).
judges_df_2.to_csv('./data/judges_2.csv', index=False)

# Analysis

In [None]:
from sklearn.metrics import cohen_kappa_score

# Verdicts of the raw-baseline comparison (Plan A = baseline, Plan B = agent).
judges_df = pd.read_csv('./data/judges.csv')

# Overall win-rate over decided rows only: ties are left out of the denominator.
overall_counts = judges_df.query("overall in ['A','B']")['overall'].value_counts()
n_overall = overall_counts.sum()
# If every row is a tie there is nothing to divide by; report NaN explicitly
# instead of dividing by a zero count.
win_rate_A = overall_counts.get('A', 0) / n_overall if n_overall else float('nan')
win_rate_B = overall_counts.get('B', 0) / n_overall if n_overall else float('nan')

print(f"Overall win-rate: Baseline = {win_rate_A:.2%}, Agent = {win_rate_B:.2%} (n={n_overall})")

criteria = ['relevance', 'completeness', 'creativity', 'safety_and_evidence', 'clarity']
for crit in criteria:
    # Keep rows where both the overall verdict and this criterion are decided.
    mask = judges_df['overall'].isin(['A','B']) & judges_df[crit].isin(['A','B'])
    y_overall = judges_df.loc[mask, 'overall']
    y_crit    = judges_df.loc[mask, crit]

    a = (y_crit == 'A').sum()
    b = (y_crit == 'B').sum()
    # Rows dropped by the mask: a tie on this criterion or on the overall verdict.
    ties = len(judges_df) - len(y_crit)

    print(f"\n--- {crit.capitalize()} Breakdown ---")
    print(f"{crit.capitalize()}: Baseline = {a}, Agent = {b}, tie/skipped = {ties}")

    # Kappa is only computed when the criterion has votes for both plans.
    if a and b:
        kappa = cohen_kappa_score(y_overall, y_crit, labels=['A', 'B'])
    else:
        kappa = pd.NA
    # Format explicitly rather than applying a float format spec to pd.NA.
    kappa_text = "<NA>" if kappa is pd.NA else f"{kappa:.2f}"
    print(f"Cohen's Kappa (overall vs {crit}): {kappa_text}")



Overall win-rate: Baseline = 0.00%, Agent = 100.00% (n=37)

--- Relevance Breakdown ---
Relevance: Baseline = 0, Agent = 33, tie/skipped = 4
Cohen's Kappa (overall vs relevance): <NA>

--- Completeness Breakdown ---
Completeness: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs completeness): <NA>

--- Creativity Breakdown ---
Creativity: Baseline = 16, Agent = 16, tie/skipped = 5
Cohen's Kappa (overall vs creativity): 0.00

--- Safety_and_evidence Breakdown ---
Safety_and_evidence: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs safety_and_evidence): <NA>

--- Clarity Breakdown ---
Clarity: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs clarity): <NA>


In [14]:
from sklearn.metrics import cohen_kappa_score

# Verdicts of the improved-baseline comparison (Plan A = baseline, Plan B = agent).
# Loaded under its own name so judges_df keeps referring to the raw-baseline
# verdicts regardless of the order in which the analysis cells are run.
judges_df_2 = pd.read_csv('./data/judges_2.csv')

# Overall win-rate over decided rows only: ties are left out of the denominator.
overall_counts = judges_df_2.query("overall in ['A','B']")['overall'].value_counts()
n_overall = overall_counts.sum()
# If every row is a tie there is nothing to divide by; report NaN explicitly
# instead of dividing by a zero count.
win_rate_A = overall_counts.get('A', 0) / n_overall if n_overall else float('nan')
win_rate_B = overall_counts.get('B', 0) / n_overall if n_overall else float('nan')

print(f"Overall win-rate: Baseline = {win_rate_A:.2%}, Agent = {win_rate_B:.2%} (n={n_overall})")

criteria = ['relevance', 'completeness', 'creativity', 'safety_and_evidence', 'clarity']
for crit in criteria:
    # Keep rows where both the overall verdict and this criterion are decided.
    mask = judges_df_2['overall'].isin(['A','B']) & judges_df_2[crit].isin(['A','B'])
    y_overall = judges_df_2.loc[mask, 'overall']
    y_crit    = judges_df_2.loc[mask, crit]

    a = (y_crit == 'A').sum()
    b = (y_crit == 'B').sum()
    # Rows dropped by the mask: a tie on this criterion or on the overall verdict.
    ties = len(judges_df_2) - len(y_crit)

    print(f"\n--- {crit.capitalize()} Breakdown ---")
    print(f"{crit.capitalize()}: Baseline = {a}, Agent = {b}, tie/skipped = {ties}")

    # Kappa is only computed when the criterion has votes for both plans.
    if a and b:
        kappa = cohen_kappa_score(y_overall, y_crit, labels=['A', 'B'])
    else:
        kappa = pd.NA
    # Format explicitly rather than applying a float format spec to pd.NA.
    kappa_text = "<NA>" if kappa is pd.NA else f"{kappa:.2f}"
    print(f"Cohen's Kappa (overall vs {crit}): {kappa_text}")



Overall win-rate: Baseline = 0.00%, Agent = 100.00% (n=37)

--- Relevance Breakdown ---
Relevance: Baseline = 1, Agent = 29, tie/skipped = 7
Cohen's Kappa (overall vs relevance): 0.00

--- Completeness Breakdown ---
Completeness: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs completeness): <NA>

--- Creativity Breakdown ---
Creativity: Baseline = 15, Agent = 15, tie/skipped = 7
Cohen's Kappa (overall vs creativity): 0.00

--- Safety_and_evidence Breakdown ---
Safety_and_evidence: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs safety_and_evidence): <NA>

--- Clarity Breakdown ---
Clarity: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs clarity): <NA>
