In [1]:
import re, pandas as pd
from glob import glob
from PyPDF2 import PdfReader
import time
from IPython.display import Markdown, display


# Baseline Data Extraction

In [None]:
def extract_workout_from_pdf(pdf_path):
    """Parse the first page of a workout-plan PDF into a one-row DataFrame.

    The page text is carved up with regular expressions:

    * ``tables`` - everything before the first ``MUSCLEANDSTRENGTH.COM`` line;
    * one column per ``<Label>: <value>`` pair that is found, for the labels
      Main Goal, Training Level, Program Duration, Days Per Week,
      Time Per Workout, Equipment, Author and Target Gender;
    * ``Description`` - the text between the site banner (after a ``Tools``
      line when there is one) and the ``Main Goal:`` line.

    Parameters
    ----------
    pdf_path : str
        Path handed to ``PdfReader``; also stored in the ``pdf_path`` column.

    Returns
    -------
    pandas.DataFrame
        A single row holding ``pdf_path``, ``tables``, every metadata label
        that matched, ``Description`` and ``raw_text``. ``tables`` and
        ``Description`` are ``None`` when their pattern does not match.
    """
    # Only the first page of the document is read.
    page_text = PdfReader(pdf_path).pages[0].extract_text()

    record = {'pdf_path': pdf_path}

    # Workout tables: lazily capture from the start of the text up to the banner line.
    tables_match = re.search(
        r'(?s)\A(.+?)(?=\nMUSCLEANDSTRENGTH\.COM)',
        page_text,
    )
    record['tables'] = tables_match.group(1).strip() if tables_match else None

    # Metadata: a value runs (colon-free) until the next known label or the end of the text.
    labels = (
        "Main Goal|Training Level|Program Duration|"
        "Days Per Week|Time Per Workout|Equipment|Author|Target Gender"
    )
    metadata_regex = (
        rf"(?P<label>{labels}):\s*"
        r"(?P<value>[^:]+?)(?="
        rf"(?:{labels}):|$)"
    )
    for found in re.finditer(metadata_regex, page_text):
        record[found.group("label").strip()] = found.group("value").strip()

    # Description: first try the layout that has a "Tools" line after the banner ...
    desc_match = re.search(
        r"""MUSCLEANDSTRENGTH\.COM[\s\S]*?Tools\s*\n 
        (?P<desc>[\s\S]+?)             
        (?=\nMain\s+Goal:)          
        """,
        page_text,
        re.VERBOSE | re.MULTILINE,
    )
    if desc_match is None:
        # ... otherwise take whatever sits between the banner line and "Main Goal:".
        desc_match = re.search(
            r"MUSCLEANDSTRENGTH\.COM\s*\r?\n"
            r"(?P<desc>.*?)"
            r"(?=\r?\nMain\s+Goal:)",
            page_text,
            re.DOTALL,
        )
    record['Description'] = desc_match.group("desc").strip() if desc_match else None

    record['raw_text'] = page_text

    # Keys become columns: build an index-oriented frame and transpose it to one row.
    return pd.DataFrame.from_dict(record, orient='index').T


In [None]:
# Parse every sample PDF into a one-row frame. glob() is called without
# recursive=True, so '**' behaves like '*' and matches exactly one folder level.
all_plans = [extract_workout_from_pdf(pdf) for pdf in glob('./samples/**/*.pdf')]

# Stack the per-file rows into one table with a fresh 0..n-1 index.
master_df = pd.concat(all_plans, ignore_index=True)

In [None]:
from langchain_core.messages import SystemMessage, HumanMessage

def improve_baseline_text(raw_text, model):
    """Ask a chat model to rewrite raw workout text as a readable weekly plan.

    Parameters
    ----------
    raw_text : str
        Unstructured workout text; it is appended verbatim to the user message.
    model :
        Object exposing ``invoke(messages)`` whose result has a ``content``
        attribute (the notebook passes a LangChain chat model).

    Returns
    -------
    The ``content`` of the model's reply (Markdown is requested by the prompt).
    """
    formatting_instructions = """
You are an expert fitness coach and formatter. When given a block of unstructured “raw” workout text, you must:

1. Parse out each day or section header (e.g. “Monday – Back & Chest”)

2. Under each day, display the exercises in a clean table or bullet list with columns for Exercise, Sets, Reps (and Notes if present).

3. Preserve any supersets, rest-pause notes, or AMAP indicators.

4. At the end, include a brief “Overview” paragraph that summarizes the weekly split, total days per week, and main focus.

5. Output in Markdown so it’s immediately human-readable.
"""
    request = (
        "Here is a raw baseline workout plan. Please reorganize it into a well-structured, human-readable weekly plan following the system instructions:\n"
        f"{raw_text}"
    )

    # System message first, then the user request carrying the raw text.
    reply = model.invoke([
        SystemMessage(content=formatting_instructions),
        HumanMessage(content=request),
    ])
    return reply.content



In [2]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before reading the keys.
load_dotenv()

# Each value is None when the corresponding variable is not set.
avalai_api_key = os.environ.get("AVALAI_API_KEY")
gemini_api_key = os.environ.get("GEMINI_API_KEY")

In [3]:
from langchain_openai import ChatOpenAI

# OpenAI-compatible endpoint and the model names used for generation and judging.
AVALAI_BASE_URL = "https://api.avalai.ir/v1/"
MODEL = "gpt-4.1-nano"
MODEL_JUDGE = "gpt-4.1-mini"

# Generator model, sampled at temperature 1.
llm = ChatOpenAI(
    base_url=AVALAI_BASE_URL,
    api_key=avalai_api_key,
    model=MODEL,
    temperature=1,
)

# Judge model, run at temperature 0.
llm_judge = ChatOpenAI(
    base_url=AVALAI_BASE_URL,
    api_key=avalai_api_key,
    model=MODEL_JUDGE,
    temperature=0,
)

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.rate_limiters import InMemoryRateLimiter

# Client-side throttle: 0.1 requests per second (one request every 10 s),
# polled every 10 s, with a burst bucket of at most 10 requests.
rate_limiter = InMemoryRateLimiter(requests_per_second=0.1, check_every_n_seconds=10, max_bucket_size=10)

MODEL_NAME = "gemini-2.5-flash-preview-05-20"

# Authenticate the Gemini client with the Gemini key. The previous call passed
# the Gemini key as `base_url` (an endpoint URL, not a credential) and the AvalAI
# key as `api_key`.
# NOTE(review): if Gemini is meant to be reached through a proxy, pass that
# proxy's URL as `base_url` together with the matching key - confirm.
gemini_chat = ChatGoogleGenerativeAI(
    model=MODEL_NAME,
    temperature=0,
    api_key=gemini_api_key,
    rate_limiter=rate_limiter,
)


In [None]:
# One generator call per extracted plan, in master_df row order.
baseline_plans = [
    improve_baseline_text(raw_text, llm)
    for raw_text in master_df['raw_text']
]

In [None]:
# Attach the reformatted plans as a new column (mutates master_df in place),
# then write the whole table to CSV without the index column.
master_df['improved_text'] = baseline_plans
master_df.to_csv('./data/workouts_data.csv', index=False)

In [None]:
# Render the first reformatted plan as Markdown for a visual spot check.
display(Markdown(baseline_plans[0]))

In [None]:
from langchain_core.messages import SystemMessage, HumanMessage

def improve_baseline_tables(raw_text, model):
    """Ask a chat model to turn raw workout text into per-section Markdown tables.

    Parameters
    ----------
    raw_text : str
        Unstructured workout text; it is appended verbatim to the user message.
    model :
        Object exposing ``invoke(messages)`` whose result has a ``content``
        attribute (the notebook passes a LangChain chat model).

    Returns
    -------
    The ``content`` of the model's reply. The prompt asks for headings and
    tables only, with no narrative text.
    """
    table_rules = """
You are an expert fitness coach and formatter. Given a block of unstructured workout text, produce ONLY Markdown tables—one table per day/section—without any extra text.

Rules:
1) Detect each day/section header (e.g., "Monday – Back & Chest"). For each detected section, output exactly one Markdown table immediately after a level-3 heading with the day/section name.
2) Table columns must be: | Exercise | Sets | Reps | Notes |
3) Preserve special cues:
   - Supersets: indicate as "Superset: <Exercise A> + <Exercise B>" in the Exercise cell, or add details in Notes.
   - Rest-pause: add "Rest–pause" in Notes.
   - AMAP: put "AMAP" in the Reps column and add "(As Many As Possible)" in Notes.
   - Alternatives ("or"): keep them in the Exercise cell, e.g., "Pull Ups or Inverted Rows".
4) Normalize rep ranges to an en dash (e.g., "8–12").
5) Keep the original exercise order. If sets/reps are missing, leave the cell blank.
6) Do NOT include any overview, explanations, links, or narrative—ONLY the headings and tables.

Output format example (structure only):
### Monday – Back & Chest
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Deadlifts | 3 | 3 | |
| Dips | 3 | AMAP | (As Many As Possible) |
"""
    request = (
        "Here is a raw baseline workout plan. Please reorganize it into sectioned Markdown tables following the system instructions:\n"
        f"{raw_text}"
    )

    # System message first, then the user request carrying the raw text.
    reply = model.invoke([
        SystemMessage(content=table_rules),
        HumanMessage(content=request),
    ])
    return reply.content


In [None]:
# One generator call per extracted plan, in master_df row order.
baseline_tables = []
for raw_text in master_df['raw_text']:
    tables_markdown = improve_baseline_tables(raw_text, llm)
    baseline_tables.append(tables_markdown)
    # Fixed 10 s pause after every call (presumably to respect the provider's rate limit).
    time.sleep(10)

In [None]:
# Attach the table-only rewrites as a new column (mutates master_df in place),
# then overwrite the CSV written earlier, again without the index column.
master_df['improved_tables'] = baseline_tables
master_df.to_csv('./data/workouts_data.csv', index=False)

In [None]:
# Render the first table-only rewrite as Markdown for a visual spot check.
display(Markdown(baseline_tables[0]))

### Monday – Back & Chest
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Deadlifts | 3 | 3 | |
| Bench Press | 3 | 5 | |
| Pendlay Rows | 3 | 8–10 | |
| Incline Dumbbell Bench Press | 3 | 8–10 | |
| One Arm Dumbbell Rows | 2 | 20–25 | |
| Dips | 3 | AMAP | (As Many As Possible) |
| Barbell Shrugs | 3 | 12–15 | |

### Thursday – Back & Chest
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Pull Ups or Inverted Rows | 6 | AMAP | Rest–pause, (As Many As Possible) |
| Machine Chest Press | 6 | 8–12 | Rest–pause |
| Lat Pull Downs | 6 | 8–12 | Rest–pause |
| Dumbbell Flye or Pec Dec | 6 | 8–12 | Rest–pause |
| Dumbbell Shrugs | 6 | 10 | Rest–pause |

### Saturday – Shoulders & Arms
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Seated Overhead Press | 4 | 8–12 | |
| Upright Row or Seated Dumbbell Press | 3 | 8–12 | |
| Superset: Side Lateral Raise + Bent Over Reverse Flye | 3 | 10–15 | |
| Dumbbell Curl | 3 | 8–12 | |
| Skullcrushers | 3 | 8–12 | |
| EZ Bar Preacher Curl | 3 | 10–12 | |
| Cable Tricep Extensions | 3 | 10–12 | |

### Friday – Legs
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Squats | 4 | 6–12 | |
| Hack Squat or Dumbbell Lunges | 3 | 8–12 | |
| Leg Extensions or Leg Press | 3 | 12–15 | |
| Stiff Leg Deadlift | 3 | 8–10 | |
| Leg Curls | 4 | 12–15 | |
| Seated Calf Raise | 4 | 12–20 | |

In [4]:
# Reload the saved workouts table from the evaluation folder and display it.
# Note this path differs from the './data/workouts_data.csv' path written above.
master_df = pd.read_csv('./data/evaluation/fitness/workouts_data.csv')
master_df

Unnamed: 0,pdf_path,tables,Main Goal,Training Level,Program Duration,Days Per Week,Time Per Workout,Equipment,Author,Description,Target Gender,raw_text,improved_text,improved_tables
0,./samples/back/8weekchestback.pdf,Monday - Back & Chest\nExercise Sets Reps\nWor...,Build Muscle,Beginner,8 Weeks,4 Days,45-60 Mins,"Barbell, Bodyweight, \nDumbbells, Machines",Steve Shaw,This back and chest specialization workout is ...,Male & Female,Monday - Back & Chest\nExercise Sets Reps\nWor...,Here is your structured weekly workout plan:\n...,### Monday – Back & Chest\n| Exercise | Sets |...
1,./samples/back/backandshoulderwomen.pdf,Workout Routine for Women\nExercise Sets Reps\...,Build Muscle,Beginner,6 Weeks,2 Days,45-60 Mins,"Bodyweight, \nDumbbells",Holly Blumenberg,BACK & SHOULDER WORKOUT ROUTINE \nFOR WOMEN\nT...,Female,Workout Routine for Women\nExercise Sets Reps\...,Here is your reorganized workout plan:\n\n## W...,### Back & Shoulders\n| Exercise | Sets | Reps...
2,./samples/back/cobraworkout.pdf,Monday: Super Thick Workout\nExercise Sets Rep...,Build Muscle,Intermediate,8 Weeks,2 Days,60-90 Mins,"Barbell, Bodyweight,\nCables, Dumbbells",Coach Dustin Myers,THE COBRA WORKOUT: \nHEAVY HIGH VOLUME BACK PR...,Male & Female,Monday: Super Thick Workout\nExercise Sets Rep...,Here is your reorganized workout plan:\n\n---\...,### Monday: Super Thick Workout\n| Exercise | ...
3,./samples/begineer/12weekfullbodyworkoutroutin...,Weeks 1-3: Total Body Circuit Workout\nDuring ...,Build Muscle,Beginner,12 Weeks,3 Day,30-45 Mins,"Barbell, Bodyweight, \nCables, Dumbbells, Mach...",Doug Lawrenson,12 WEEK FULL BODY WORKOUT \nROUTINE FOR BEGINN...,Male & Female,Weeks 1-3: Total Body Circuit Workout\nDuring ...,Here is your structured 12-week beginner worko...,### Weeks 1-3: Total Body Circuit Workout\n| E...
4,./samples/begineer/4dayupperlowerplantfitnessw...,Planet Fitness Upper Body Workout A\nExercise ...,Build Muscle,Beginner,8 Weeks,4 Days,45 - 70 Mins,"Bodyweight, Cables, \nDumbbells, EZ Bar, Machines",Josh England,4 DAY UPPER/LOWER PLANET FITNESS\nWORKOUT (MAC...,Male & Female,Planet Fitness Upper Body Workout A\nExercise ...,Here is your structured weekly workout plan:\n...,### Planet Fitness Upper Body Workout A\n| Exe...
5,./samples/begineer/bodyweightbasics.pdf,Day 1: Upper Body\nExercise Sets Reps\nPull Up...,General Fitness,Beginner,4 Weeks,3 Days,15-30 Mins,Bodyweight,Roger “Rock” Lockridge,BODYWEIGHT BASICS: \n3 DAY BODYWEIGHT WORKOUT ...,Male & Female,Day 1: Upper Body\nExercise Sets Reps\nPull Up...,Here is your reorganized bodyweight workout pl...,### Day 1: Upper Body\n| Exercise | Sets | Rep...
6,./samples/begineer/thebest15minutewarmups.pdf,Exercise Sets Reps/Time Rest\nFast-Paced Walk ...,General Fitness,Beginner,,,15 Mins,"Bands, Barbell, \nBodyweight, Foam Roll, Kettl...",Roger “Rock” Lockridge\nFull-Body Focused Warm...,THE BEST 15-MINUTE WARM-UPS\nMaximize your wor...,Male & Female,Exercise Sets Reps/Time Rest\nFast-Paced Walk ...,"Here is your reorganized workout plan, structu...",### Full-Body Focused Warm-Up\n| Exercise | Se...
7,./samples/chest/12weekstoabiggerbenchpress.pdf,Monday\nExercise Sets Reps\nBench Press 3 See ...,Increase Strength,Beginner,12 Weeks,3 Days,45-60 Mins,"Barbell, Bodyweight, \nDumbbells, EZ Bar, Machine",Jonathan Byrd,How much do you bench? If you feel it’s never ...,Male & Female,Monday\nExercise Sets Reps\nBench Press 3 See ...,# 12 Weeks to a Bigger Bench Press\n\n## Monda...,### Monday\n| Exercise | Sets | Reps | Notes |...
8,./samples/chest/bestchestworkout.pdf,The Stretch & Push Workout\nExercise Sets Reps...,Build Muscle,Intermediate,8 Weeks,1 Day,45-60 Mins,"Barbell, Bodyweight,\nCables, Dumbbells",Roger “Rock” Lockridge,BEST CHEST WORKOUT: THE PUSH & \nSTRETCH METHO...,Male & Female,The Stretch & Push Workout\nExercise Sets Reps...,Here is your reorganized workout plan:\n\n## T...,### The Stretch & Push Workout\n| Exercise | S...
9,./samples/chest/pecpounder8weekchestworkouttos...,The Hybrid PRRS™ Method\nExercise Tempo Sets R...,Build Muscle,Intermediate,8 Weeks,1 Day,45-60 Mins,"Barbell, Cables, \nDumbbells, Machines",Eric Broser,PEC-POUNDER: 8 WEEK CHEST WORKOUT \nTO SHATTER...,Male,The Hybrid PRRS™ Method\nExercise Tempo Sets R...,# PEC-POUNDER: 8 WEEK CHEST WORKOUT\n\nThis pr...,### The Hybrid PRRS™ Method\n| Exercise | Sets...


# Create Test Personas & Queries

In [5]:
from pydantic import BaseModel, Field
from typing import Literal

# Schema for one synthetic test persona: the registration fields later passed to
# db_manager.create_user, plus the free-text request sent to the planner agent.
# Documented with comments rather than a class docstring so the schema handed to
# the model via with_structured_output is left exactly as it was.
class UserRegistrationQuery(BaseModel):
    email: str = Field(description="User's email address")
    password: str = Field(description="A secure password")
    first_name: str = Field(description="User's given name")
    last_name: str = Field(description="User's family name")
    date_of_birth: str = Field(description="YYYY-MM-DD format")
    sex: Literal["M", "F", "O"] = Field(description="User's gender identity")
    height_cm: int = Field(description="Height in centimeters")
    weight_kg: int = Field(description="Weight in kilograms")
    activity_level: Literal["Sedentary", "Light", "Moderate", "Active", "Very Active"] = Field(description="Typical daily activity level")
    fitness_goals: str = Field(description="User's primary fitness objectives")
    query: str = Field(description="A 2-3 sentence natural-language request describing their goals, background, and constraints")

# Gemini client wrapped so that its replies are parsed into UserRegistrationQuery objects.
structured_model = gemini_chat.with_structured_output(UserRegistrationQuery)


In [6]:
from langchain_core.messages import SystemMessage, HumanMessage

def create_prompt(metadata, model):
    """Generate a synthetic user persona and query for one workout plan.

    Parameters
    ----------
    metadata : dict
        Plan attributes; every ``key: value`` pair becomes one bullet line in
        the user message.
    model :
        Object exposing ``invoke(messages)``; the notebook passes a model
        wrapped with ``with_structured_output(UserRegistrationQuery)``.

    Returns
    -------
    Whatever ``model.invoke`` returns, unmodified.
    """
    instructions = (
        "You are a virtual fitness-app assistant. "
        "For the given workout-plan metadata, generate:\n"
        "1. A plausible user registration record (email, password, first_name, last_name, date_of_birth, sex, height_cm, weight_kg, activity_level, fitness_goals) that fits the plan's focus.\n"
        "2. A 2-3 sentence user query explaining their background, goals, and constraints.\n"
        "Output in JSON matching the UserRegistrationQuery Pydantic model exactly."
    )

    # One "- key: value" bullet per metadata entry, in dict order.
    bullets = [f"- {key}: {value}" for key, value in metadata.items()]
    metadata_lines = "\n".join(bullets)
    request = (
        "Here is the workout plan metadata:\n"
        f"{metadata_lines}\n\n"
        "Now respond as instructed above."
    )

    return model.invoke([
        SystemMessage(content=instructions),
        HumanMessage(content=request),
    ])



In [8]:
# One dict per plan (row), restricted to the columns given to the persona generator.
metadata = master_df[["Main Goal", "Training Level", "Program Duration", "Days Per Week", "Time Per Workout", "Equipment", "Target Gender", "Description"]].to_dict(orient='records')

In [None]:
# Generate one persona per plan and collect the parsed objects as plain dicts.
records = []
for plan_metadata in metadata:
    persona = create_prompt(plan_metadata, structured_model)
    records.append(persona.model_dump())
    # Fixed 10 s pause after every call.
    time.sleep(10)

records_df = pd.DataFrame(records)

In [None]:
# Write the generated personas to CSV without the index column.
records_df.to_csv('./data/records.csv', index=False)

In [5]:
# Reload the saved personas from the evaluation folder and display them.
# Note this path differs from the './data/records.csv' path written above.
records_df = pd.read_csv('./data/evaluation/fitness/records.csv')
records_df

Unnamed: 0,email,password,first_name,last_name,date_of_birth,sex,height_cm,weight_kg,activity_level,fitness_goals,query,agent_plan,agent_plan_withoutrag,agent_plan_tables,agent_plan_tables_withoutrag
0,john.doe1@example.com,SecureP@ssw0rd!,John,Doe,1995-05-15,M,175,70,Light,"Build muscle and increase strength, especially...",I'm new to structured weight training and want...,It's fantastic that you're ready to embark on ...,Hello John! It's great to hear you're ready to...,### Day 1 – Full Body (Chest Focus)\n| Exercis...,### Monday – Full Body A (Chest & Back Focus)\...
1,jane.doe2@example.com,SecurePass123!,Jane,Doe,1998-05-15,F,165,60,Light,"Build muscle, specifically focusing on develop...",I'm a beginner looking to start building muscl...,Hello Jane! It's fantastic that you're ready t...,Here is a personalized workout plan designed t...,### Day 1 – Full Body (Back & Shoulders Focus)...,### Day 1 – Full Body - Back & Shoulders Empha...
2,john.doe3@example.com,SecureP@ss123,John,Doe,1992-05-15,M,180,85,Active,Build Muscle,I'm an intermediate lifter looking to signific...,It's fantastic that you're committed to buildi...,Hello John! It's great to hear you're committe...,### Day 1 – Full Body Strength & Back Width\n|...,### Day 1 – Upper Body: Back & Biceps Focus\n|...
3,john.doe4@example.com,SecurePassword123!,John,Doe,2000-05-15,M,175,70,Light,Build muscle and gain strength,I'm a complete beginner to working out and wan...,"Hello John,\n\nWelcome to your fitness journey...",Hello John! It's fantastic that you're ready t...,### Day 1 – Full Body Strength & Muscle Introd...,### Monday – Full Body A\n| Exercise | Sets | ...
4,john.doe5@example.com,SecurePassword123!,John,Doe,1995-05-15,M,175,70,Light,Build Muscle,I'm new to weightlifting and looking to build ...,Here is a personalized 4-day workout plan desi...,"Hello John,\n\nIt's fantastic that you're read...",### Day 1 – Upper Body A\n| Exercise | Sets | ...,### Day 1 – Upper Body A\n| Exercise | Sets | ...
5,sarah.miller@example.com,SecurePass123!,Sarah,Miller,1992-05-15,F,165,68,Light,"Improve general fitness, build a foundation, a...",I'm new to working out and looking for a simpl...,"Hi Sarah,\n\nIt's fantastic that you're ready ...","Hi Sarah,\n\nIt's fantastic that you're ready ...",### Day 1 – Full Body Foundation\n| Exercise |...,### Day 1 – Full Body Foundation\n| Exercise |...
6,alex.smith@example.com,SecureP@ss123,Alex,Smith,1995-05-15,M,175,70,Moderate,"Improve overall fitness, prepare for workouts,...",I'm new to structured fitness and looking for ...,"Hello Alex,\n\nIt's fantastic that you're prio...","Hello Alex,\n\nIt's fantastic that you're prio...",### Day 1 – Full Body: Foundational Strength\n...,### Monday – Full Body A\n| Exercise | Sets | ...
7,john.doe6@example.com,SecureP@ssw0rd!,John,Doe,1995-05-15,M,175,80,Active,"Increase strength, specifically bench press",I'm a beginner looking to significantly increa...,Here is a personalized 3-day-a-week workout pl...,Hello John! It's great to hear you're committe...,### Monday – Upper Body Strength (Bench Focus)...,### Day 1 – Upper Body (Bench Focus)\n| Exerci...
8,john.doe7@example.com,SecureP@ssw0rd!,John,Doe,1995-05-15,M,175,75,Active,Build Muscle,I'm an intermediate lifter looking to signific...,Here's a personalized workout plan designed to...,Hello John!\n\nIt's fantastic to hear you're a...,### Day 1 – Push (Chest & Triceps Focus)\n| Ex...,### Day 1 – Push (Chest Focus)\n| Exercise | S...
9,john.doe8@example.com,SecureP@ssw0rd!,John,Doe,1995-05-15,M,180,85,Active,"Build muscle, specifically focusing on chest d...",I've been consistently working out for a while...,It's fantastic that you're consistently workin...,"Hello John,\n\nIt's great to hear you're consi...",### Day 1 – Push (Strength & Chest Focus)\n| E...,### Day 1 – Heavy Chest & Triceps\n| Exercise ...


# LLM as a Judge

In [6]:
from pydantic import BaseModel, Field
from typing import Literal

# Structured verdict returned by the judge model: a winner ("A", "B" or "Tie")
# for each of five criteria, an overall winner, and a short free-text rationale.
# Documented with comments rather than a class docstring so the schema handed to
# the model via with_structured_output is left exactly as it was.
class PlanComparison(BaseModel):
    relevance: Literal["A", "B", "Tie"] = Field(
        description="Which plan better aligns with the persona's stated goals"
    )
    completeness: Literal["A", "B", "Tie"] = Field(
        description="Which plan provides more thorough information"
    )
    creativity: Literal["A", "B", "Tie"] = Field(
        description="Which plan shows more novel or varied exercise choices"
    )
    safety_and_evidence: Literal["A", "B", "Tie"] = Field(
        description="Which plan is safer and more grounded in exercise science"
    )
    clarity: Literal["A", "B", "Tie"] = Field(
        description="Which plan is more clearly written and well-organized"
    )
    overall: Literal["A", "B", "Tie"] = Field(
        description="Overall better plan considering all criteria"
    )
    rationale: str = Field(
        description="A brief, objective explanation of the judgment"
    )


In [7]:
from langchain_core.messages import SystemMessage, HumanMessage

# Judge model wrapped so that its replies are parsed into PlanComparison objects.
# NOTE: this rebinds `structured_model`, which an earlier cell bound to the Gemini
# persona generator; judge() below reads this global at call time.
structured_model = llm_judge.with_structured_output(PlanComparison)

def judge(query, baseline_plan, agent_plan):
    """Have the judge model compare two workout plans for one persona.

    Parameters
    ----------
    query :
        The persona text inserted under "Given the User Persona:".
    baseline_plan :
        Text presented to the judge as "Plan A".
    agent_plan :
        Text presented to the judge as "Plan B".

    Returns
    -------
    Whatever the module-level ``structured_model.invoke`` returns (the
    notebook wraps the judge model to produce ``PlanComparison`` objects).
    """
    comparison_request = f"""Given the User Persona:
{query}

Plan A:
{baseline_plan}

Plan B:
{agent_plan}

Evaluate each plan on five dimensions: Relevance, Completeness, Creativity, Safety and Evidence-Grounding, and Clarity and Organization. without bias toward presentation order, response length, or plan names.

For each dimension, choose “A” if Plan A is stronger, “B” if Plan B is stronger, or “Tie” if they are equal. Then decide the overall winner in the same way. Finally, provide a concise rationale.

Output must conform to the PlanComparison schema."""

    conversation = [
        SystemMessage(content="You are an expert fitness coach."),
        HumanMessage(content=comparison_request),
    ]
    # `structured_model` is looked up in module globals when the call happens.
    return structured_model.invoke(conversation)

### Create agent plans

In [26]:
from db.db_manager import FitnessDB

# Open the project's FitnessDB store at ./fitness_temp1.db with dummy data disabled.
db_manager = FitnessDB(db_path='./fitness_temp1.db', use_dummy_data=False)

SQLite database './fitness_temp1.db' created/verified with updated schema.


In [27]:
from werkzeug.security import generate_password_hash

# Register every generated persona in the database, in records_df row order.
# The plain-text password is never stored: only its werkzeug hash is passed on.
for _, user in records_df.iterrows():
    hashed_password = generate_password_hash(user['password'])
    db_manager.create_user(
        email=user['email'],
        password_hash=hashed_password,
        first_name=user['first_name'],
        last_name=user['last_name'],
        date_of_birth=user['date_of_birth'],
        sex=user['sex'],
        height_cm=user['height_cm'],
        weight_kg=user['weight_kg'],
        activity_level=user['activity_level'],
        fitness_goals=user['fitness_goals'],
    )

In [17]:
from agents.planner import PlannerGraph
from langchain_core.messages import HumanMessage
from db.retrievers import DocumentRetriever
from agents.enums import PlannerType

# Fitness planner graph with retrieval over the documents in data/ enabled.
agent = PlannerGraph(
    llm=llm,
    db_manager=db_manager,
    book_retriever=DocumentRetriever('data/'),
    planner_type=PlannerType.FITNESS,
    num_results=5,
    use_rag_data=True,
    summarize_logs=False,  # log summarization is switched off
).compile()

Making the book retriever ready =====
table is already created=====
Book Retriever is ready ====


In [None]:
plans = []
for i, user in records_df.iterrows():
    response = agent.invoke({"messages": HumanMessage(content=user['query']), 'user_id':i+1}, config={"configurable": {"thread_id": i+1}})
    agent_plan = response['messages'][-1].content
    # time.sleep(50)
    plans.append(agent_plan)
records_df['agent_plan'] = plans


In [None]:
# Write the personas, now carrying the `agent_plan` column, to CSV without the index.
records_df.to_csv('./data/records.csv', index=False)


### Compare raw baseline vs agent plan (texts)

In [None]:
# Judge the raw extracted tables (Plan A) against the agent's plan (Plan B),
# persona by persona.
judges = []
for i, user in records_df.iterrows():
    # Use this row's own plan. The previous code passed the bare `agent_plan`
    # variable, i.e. whatever the last iteration of an earlier loop left behind,
    # so every persona was compared against the same plan.
    verdict = judge(user['query'], master_df['tables'][i], user['agent_plan'])
    judges.append(verdict.model_dump())
judges_df = pd.DataFrame(judges)

In [17]:
# Display the verdicts for raw baseline (A) vs agent plan (B).
judges_df

Unnamed: 0,relevance,completeness,creativity,safety_and_evidence,clarity,overall,rationale
0,B,B,A,B,B,B,"Plan B is superior due to its comprehensive, b..."
1,B,B,B,B,B,B,Plan B is significantly superior due to its co...
2,B,B,A,B,B,B,Plan B is superior due to its comprehensive na...
3,B,B,A,B,B,B,Plan B is superior due to its comprehensive na...
4,B,B,B,B,B,B,Plan B is significantly better as it provides ...
5,B,B,B,B,B,B,Plan B is superior as it directly addresses al...
6,B,B,A,B,B,B,"Plan B is superior due to its clear structure,..."
7,B,B,B,B,B,B,Plan B is significantly superior due to its co...
8,B,B,B,B,B,B,"Plan B is significantly more comprehensive, re..."
9,B,B,A,B,B,B,"Plan B is a comprehensive, well-structured, an..."


In [None]:
# Write the raw-baseline vs agent verdicts to CSV without the index column.
judges_df.to_csv('./data/judges.csv', index=False)

### Compare improved baseline vs agent plan (texts)

In [11]:
# Judge the LLM-reformatted baseline (Plan A) against each persona's stored
# agent plan (Plan B).
judges = []
for i, user in records_df.iterrows():
    verdict = judge(user['query'], master_df['improved_text'][i], user['agent_plan'])
    verdict_row = verdict.model_dump()
    # Fixed 10 s pause after every judge call.
    time.sleep(10)
    judges.append(verdict_row)
judges_df_2 = pd.DataFrame(judges)

In [12]:
# Display the verdicts for improved baseline (A) vs agent plan (B).
judges_df_2

Unnamed: 0,relevance,completeness,creativity,safety_and_evidence,clarity,overall,rationale
0,B,B,A,B,B,B,Plan B is significantly better for a beginner ...
1,B,B,B,B,B,B,Plan B is significantly better as it provides ...
2,B,B,A,B,B,B,Plan B is superior due to its comprehensive na...
3,B,B,A,B,B,B,Plan B is superior for a complete beginner due...
4,Tie,B,Tie,B,B,B,Plan B is significantly more comprehensive and...
5,B,B,B,B,B,B,Plan B is significantly better as it fully adh...
6,B,B,A,B,B,B,Plan B is superior due to its beginner-friendl...
7,B,B,B,B,B,B,Plan B is significantly superior due to its co...
8,B,B,B,B,B,B,"Plan B is significantly more comprehensive, pr..."
9,B,B,A,B,B,B,Plan B is superior due to its comprehensive st...


In [13]:
# Write the improved-baseline vs agent verdicts to CSV without the index column.
judges_df_2.to_csv('./data/judges_2.csv', index=False)

### Compare RAG-free Agent vs RAG Agent (texts)

In [None]:
from agents.planner import PlannerGraph
from langchain_core.messages import HumanMessage
from db.retrievers import DocumentRetriever
from agents.enums import PlannerType

# RAG-free planner for the "without RAG" comparison. The plans generated with
# this agent are stored in `agent_plan_withoutrag`, so retrieval must be off
# here. The previous `use_rag_data = True` built the same RAG agent as above.
agent = PlannerGraph(
    llm = llm,
    db_manager = db_manager,
    book_retriever = DocumentRetriever('data/'),
    planner_type = PlannerType.FITNESS,
    num_results = 5,
    use_rag_data = False,
    summarize_logs = False # disable log summarization.
).compile()

In [None]:
plans = []
for i, user in records_df.iterrows():
    agent_plan = agent.invoke({"messages": HumanMessage(content=user['query']), 'user_id':i+1}, config={"configurable": {"thread_id": i+1}})['messages'][-1].content
    # time.sleep(50)
    plans.append(agent_plan)
records_df['agent_plan_withoutrag'] = plans

Function 'node_task_identifier'
Returned: {'task_type': <TaskType.FITNESS: 'fitness'>} 
Function 'node_check_tasks'
Returned: node_plan 
Function 'node_plan'
Function 'node_log_summary'
Function 'node_summarize'
Returned: {'messages': [], 'user_id': 1} 
Returned: {'log_summary': None} 
Function 'node_plan'


Returned: {'messages': AIMessage(content="Hello John! It's great to hear you're ready to embark on your fitness journey and focus on building muscle, especially in your chest and back. As someone new to structured weight training, consistency and proper form will be your best friends.\n\nThis plan is designed to be beginner-friendly, focusing on fundamental movements that will build a strong foundation. We'll hit your chest and back with dedicated exercises, while also ensuring a balanced full-body workout. Aim to perform this routine 2-3 times per week, allowing at least one day of rest between sessions for muscle recovery and growth.\n\nLet's get started!\n\n---\n\n## Your Beginner Full-Body Strength Plan: Chest & Back Focus\n\n**Goal:** Build muscle and increase strength, with an emphasis on chest and back.\n**Frequency:** 2-3 times per week, with rest days in between.\n**Equipment:** A pair of dumbbells (adjustable if possible), a sturdy bench or chair, and optionally a resistance 

In [15]:
# Write the personas, now carrying `agent_plan_withoutrag`, to CSV without the index.
records_df.to_csv('./data/records.csv', index=False)

In [17]:
# Judge the RAG-free agent plan (Plan A) against the RAG agent plan (Plan B).
judges = []
for i, user in records_df.iterrows():
    verdict = judge(user['query'], user['agent_plan_withoutrag'], user['agent_plan'])
    verdict_row = verdict.model_dump()
    # Fixed 10 s pause after every judge call.
    time.sleep(10)
    judges.append(verdict_row)
judges_df_3 = pd.DataFrame(judges)

In [18]:
# Display the verdicts for RAG-free agent (A) vs RAG agent (B).
judges_df_3

Unnamed: 0,relevance,completeness,creativity,safety_and_evidence,clarity,overall,rationale
0,A,A,Tie,Tie,A,A,"Plan A is slightly more relevant to ""structure..."
1,B,A,B,Tie,B,B,Plan B is superior due to its more targeted ap...
2,Tie,B,B,Tie,B,B,Plan B provides slightly more explicit guidanc...
3,Tie,A,Tie,Tie,A,A,Plan A provides a slightly more robust workout...
4,Tie,B,A,Tie,Tie,A,Both plans are exceptionally well-designed for...
5,Tie,B,B,B,A,B,Plan B offers slightly more comprehensive modi...
6,Tie,A,Tie,Tie,A,A,Plan A provides a more clearly organized and l...
7,Tie,B,Tie,Tie,B,B,Plan B is marginally better due to its slightl...
8,Tie,B,Tie,Tie,B,B,Plan B is slightly stronger due to its more co...
9,Tie,B,B,B,Tie,B,Plan B is slightly more comprehensive and crea...


In [19]:
# Write the RAG-free vs RAG verdicts to CSV without the index column.
judges_df_3.to_csv('./data/judges_3.csv', index=False)

### Compare baseline vs agent (tables)

In [18]:
# For every persona: generate a plan with the currently bound `agent`, then
# immediately judge the table-only baseline (Plan A) against it (Plan B).
# Note that this rebinds `judges_df`, replacing the earlier verdict table.
plans = []
judges = []
for i, user in records_df.iterrows():
    run_id = i + 1
    result = agent.invoke(
        {"messages": HumanMessage(content=user['query']), 'user_id': run_id},
        config={"configurable": {"thread_id": run_id}},
    )
    agent_plan = result['messages'][-1].content
    plans.append(agent_plan)

    verdict = judge(user['query'], master_df['improved_tables'][i], agent_plan)
    judges.append(verdict.model_dump())
records_df['agent_plan_tables'] = plans
judges_df = pd.DataFrame(judges)

Function 'node_log_summary'
Function 'node_summarize'
Returned: {'messages': [], 'user_id': 1} 
Returned: {'log_summary': None} 
Function 'node_search_doc'


Returned: {'messages': [AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_y9iFOYkfCMi5LGHHUx45xoUG', 'function': {'arguments': '{"query": "beginner muscle building program for chest and back"}', 'name': 'search_books'}, 'type': 'function'}, {'id': 'call_DQDaiJ2OxRQjyu2cHGEuQzE7', 'function': {'arguments': '{"query": "beginner weight training routine for muscle gain"}', 'name': 'search_books'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 60, 'prompt_tokens': 540, 'total_tokens': 600, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0, 'text_tokens': None}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0, 'text_tokens': None, 'image_tokens': None}}, 'model_name': 'gpt-4.1-nano-2025-04-14', 'system_fingerprint': 'fp_368a354b49', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-59e4fd8a-463b-41f1-ab86-cd9cf65df

In [20]:
# Display the verdicts for table-only baseline (A) vs agent plan (B).
judges_df

Unnamed: 0,relevance,completeness,creativity,safety_and_evidence,clarity,overall,rationale
0,B,A,A,B,B,B,Plan B is more relevant for a beginner focusin...
1,B,B,B,B,B,B,Plan B is more relevant as it specifically tar...
2,B,B,A,B,B,B,Plan B better aligns with the user's goal of h...
3,B,B,B,B,Tie,B,Plan B is more relevant for a complete beginne...
4,B,B,B,B,B,B,Plan B is more relevant as it includes clear g...
5,B,B,B,B,B,B,Plan B is more relevant for a beginner with no...
6,B,A,A,B,B,B,Plan B is more relevant for a beginner seeking...
7,B,B,B,B,B,B,Plan B is more relevant as it explicitly focus...
8,B,B,B,B,B,B,Plan B better aligns with the user's goal of m...
9,B,B,A,B,B,B,Plan B is more relevant as it specifically tar...


In [23]:
# Write the personas and the table-comparison verdicts to the evaluation folder,
# both without the index column.
records_df.to_csv('./data/evaluation/fitness/records.csv', index=False)
judges_df.to_csv('./data/evaluation/fitness/judges_tables_1.csv', index=False)

### Compare RAG-free Agent vs RAG Agent (tables)

In [28]:
from agents.planner import PlannerGraph
from langchain_core.messages import HumanMessage
from db.retrievers import DocumentRetriever
from agents.enums import PlannerType

# Build the fitness planner used as the "RAG-free" side of this comparison:
# it is configured with use_rag_data=False and with log summarization off.
planner_graph = PlannerGraph(
    llm=llm,
    db_manager=db_manager,
    book_retriever=DocumentRetriever('data/'),
    planner_type=PlannerType.FITNESS,
    num_results=5,
    use_rag_data=False,
    summarize_logs=False,  # disable log summarization.
)
agent = planner_graph.compile()

Making the book retriever ready =====
table is already created=====
Book Retriever is ready ====


In [29]:
# Ask the RAG-free agent for one plan per evaluation query and store the plans
# in a new column of records_df.
plans = []
for i, user in records_df.iterrows():
    # The row's index label + 1 serves as both the user id and the thread id.
    session_id = i + 1
    graph_input = {"messages": HumanMessage(content=user['query']), 'user_id': session_id}
    run_config = {"configurable": {"thread_id": session_id}}
    final_state = agent.invoke(graph_input, config=run_config)
    # The plan is the content of the last message in the returned state.
    plans.append(final_state['messages'][-1].content)
records_df['agent_plan_tables_withoutrag'] = plans

Function 'node_log_summary'
Function 'node_summarize'
Returned: {'messages': [], 'user_id': 1} 
Returned: {'log_summary': None} 
Function 'node_plan'


Returned: {'messages': [AIMessage(content='### Day 1 – Chest & Back\n| Exercise | Sets | Reps/Duration | Notes |\n|---|---:|---:|---|\n| Barbell Bench Press | 4 | 8–12 | Focus on controlled movement, progressive overload |\n| Bent-Over Rows | 4 | 8–12 | Keep back flat, engage lats |\n| Incline Dumbbell Press | 3 | 10–12 | Targets upper chest, use moderate weight |\n| Lat Pulldown | 3 | 10–12 | Use neutral grip, squeeze back muscles |\n| Push-Ups | 3 | AMAP | Variations to increase challenge |\n| Seated Cable Rows | 3 | 10–12 | Use full range of motion, scapular retraction |\n\n### Day 2 – Legs & Abs\n| Exercise | Sets | Reps/Duration | Notes |\n|---|---:|---:|---|\n| Squats | 4 | 8–12 | Maintain proper form, depth for glutes and quads |\n| Leg Press | 3 | 10–12 | Focus on controlled movement |\n| Romanian Deadlifts | 3 | 8–10 | Targets hamstrings, keep back straight |\n| Standing Calf Raises | 3 | 15–20 | Full stretch and contraction |\n| Plank | 3 | 30–45s | Core stabilization |\n| Le

In [30]:
# Re-save the records now that the 'agent_plan_tables_withoutrag' column has been added.
records_df.to_csv('./data/evaluation/fitness/records.csv', index=False)


In [31]:
# Judge every query pairwise: the RAG-free plan is passed first, the RAG plan second.
# Each verdict object is converted to a plain dict so the results can form a DataFrame.
judges = [
    judge(
        user['query'],
        user['agent_plan_tables_withoutrag'],
        user['agent_plan_tables'],
    ).model_dump()
    for _, user in records_df.iterrows()
]
judges_df = pd.DataFrame(judges)

In [32]:
# Inspect the judge verdicts for the RAG-free vs RAG comparison.
judges_df

Unnamed: 0,relevance,completeness,creativity,safety_and_evidence,clarity,overall,rationale
0,A,A,A,Tie,Tie,A,Plan A is more relevant as it directly targets...
1,B,A,A,B,B,B,Plan B is more relevant as it uses only dumbbe...
2,B,B,B,Tie,Tie,B,Plan B is more relevant as it focuses exclusiv...
3,B,A,A,B,Tie,B,Plan B is more relevant for a complete beginne...
4,Tie,A,A,Tie,Tie,A,Both plans are well-suited for a beginner aimi...
5,B,Tie,B,Tie,Tie,B,Plan B is more relevant for a beginner with no...
6,B,A,B,Tie,Tie,B,Plan B is more relevant as it specifically tar...
7,B,A,A,Tie,Tie,B,Plan B is more relevant as it specifically tar...
8,B,A,B,Tie,Tie,B,Plan B is more relevant as it specifically tar...
9,B,B,B,Tie,Tie,B,Plan B is more relevant as it specifically tar...


In [33]:
# Save the RAG-free vs RAG verdicts as CSV (index column omitted).
judges_df.to_csv('./data/evaluation/fitness/judges_tables_2.csv', index=False)

# Analysis

### raw baseline vs agent plan (texts)

In [None]:
from sklearn.metrics import cohen_kappa_score

# Summarise the judge verdicts in ./data/judges.csv: overall win rates (reported as
# A = Baseline, B = Agent), then for each criterion the verdict split and Cohen's kappa
# between the criterion verdict and the overall verdict.
judges_df = pd.read_csv('./data/judges.csv')

# Only decisive overall verdicts (A or B) count toward the win rate.
decisive_rows = judges_df.query("overall in ['A','B']")
overall_counts = decisive_rows['overall'].value_counts()
n_overall = overall_counts.sum()
win_rate_A, win_rate_B = (overall_counts.get(side, 0) / n_overall for side in ('A', 'B'))

print(f"Overall win-rate: Baseline = {win_rate_A:.2%}, Agent = {win_rate_B:.2%} (n={n_overall})")

overall_is_decisive = judges_df['overall'].isin(['A','B'])
for crit in ['relevance', 'completeness', 'creativity', 'safety_and_evidence', 'clarity']:
    # Keep rows where both the overall verdict and this criterion's verdict are A or B.
    decisive = judges_df[overall_is_decisive & judges_df[crit].isin(['A','B'])]
    y_overall, y_crit = decisive['overall'], decisive[crit]

    a = y_crit.eq('A').sum()
    b = y_crit.eq('B').sum()
    # Everything filtered out above: ties on the criterion or on the overall verdict.
    ties = len(judges_df) - len(decisive)

    print(f"\n--- {crit.capitalize()} Breakdown ---")
    print(f"{crit.capitalize()}: Baseline = {a}, Agent = {b}, tie/skipped = {ties}")

    # Kappa is only computed when the criterion has both A and B verdicts;
    # otherwise pd.NA is reported (it prints as <NA>).
    kappa = cohen_kappa_score(y_overall, y_crit, labels=['A', 'B']) if a and b else pd.NA
    print(f"Cohen's Kappa (overall vs {crit}): {kappa:.2f}")



Overall win-rate: Baseline = 0.00%, Agent = 100.00% (n=37)

--- Relevance Breakdown ---
Relevance: Baseline = 0, Agent = 33, tie/skipped = 4
Cohen's Kappa (overall vs relevance): <NA>

--- Completeness Breakdown ---
Completeness: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs completeness): <NA>

--- Creativity Breakdown ---
Creativity: Baseline = 16, Agent = 16, tie/skipped = 5
Cohen's Kappa (overall vs creativity): 0.00

--- Safety_and_evidence Breakdown ---
Safety_and_evidence: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs safety_and_evidence): <NA>

--- Clarity Breakdown ---
Clarity: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs clarity): <NA>


### improved baseline vs agent plan (texts)

In [14]:
from sklearn.metrics import cohen_kappa_score

# Summarise the judge verdicts in ./data/judges_2.csv: overall win rates (reported as
# A = Baseline, B = Agent), then for each criterion the verdict split and Cohen's kappa
# between the criterion verdict and the overall verdict.
judges_df = pd.read_csv('./data/judges_2.csv')

# Only decisive overall verdicts (A or B) count toward the win rate.
decisive_rows = judges_df.query("overall in ['A','B']")
overall_counts = decisive_rows['overall'].value_counts()
n_overall = overall_counts.sum()
win_rate_A, win_rate_B = (overall_counts.get(side, 0) / n_overall for side in ('A', 'B'))

print(f"Overall win-rate: Baseline = {win_rate_A:.2%}, Agent = {win_rate_B:.2%} (n={n_overall})")

overall_is_decisive = judges_df['overall'].isin(['A','B'])
for crit in ['relevance', 'completeness', 'creativity', 'safety_and_evidence', 'clarity']:
    # Keep rows where both the overall verdict and this criterion's verdict are A or B.
    decisive = judges_df[overall_is_decisive & judges_df[crit].isin(['A','B'])]
    y_overall, y_crit = decisive['overall'], decisive[crit]

    a = y_crit.eq('A').sum()
    b = y_crit.eq('B').sum()
    # Everything filtered out above: ties on the criterion or on the overall verdict.
    ties = len(judges_df) - len(decisive)

    print(f"\n--- {crit.capitalize()} Breakdown ---")
    print(f"{crit.capitalize()}: Baseline = {a}, Agent = {b}, tie/skipped = {ties}")

    # Kappa is only computed when the criterion has both A and B verdicts;
    # otherwise pd.NA is reported (it prints as <NA>).
    kappa = cohen_kappa_score(y_overall, y_crit, labels=['A', 'B']) if a and b else pd.NA
    print(f"Cohen's Kappa (overall vs {crit}): {kappa:.2f}")



Overall win-rate: Baseline = 0.00%, Agent = 100.00% (n=37)

--- Relevance Breakdown ---
Relevance: Baseline = 1, Agent = 29, tie/skipped = 7
Cohen's Kappa (overall vs relevance): 0.00

--- Completeness Breakdown ---
Completeness: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs completeness): <NA>

--- Creativity Breakdown ---
Creativity: Baseline = 15, Agent = 15, tie/skipped = 7
Cohen's Kappa (overall vs creativity): 0.00

--- Safety_and_evidence Breakdown ---
Safety_and_evidence: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs safety_and_evidence): <NA>

--- Clarity Breakdown ---
Clarity: Baseline = 0, Agent = 37, tie/skipped = 0
Cohen's Kappa (overall vs clarity): <NA>


### RAG-free Agent vs RAG Agent (texts)

In [20]:
from sklearn.metrics import cohen_kappa_score

# Summarise the judge verdicts in ./data/judges_3.csv (RAG-free agent vs RAG agent, text plans):
# overall win rates, then for each criterion the verdict split and Cohen's kappa between
# the criterion verdict and the overall verdict.
#
# The report previously printed "Baseline" / "Agent", which is wrong for this comparison:
# both sides are agents. The labels now name the two variants being compared.
# NOTE(review): A = RAG-free, B = RAG follows the section heading order; confirm against
# the judge call that produced judges_3.csv.
label_a, label_b = 'RAG-free agent', 'RAG agent'

judges_df = pd.read_csv('./data/judges_3.csv')

# Only decisive overall verdicts (A or B) count toward the win rate.
overall_counts = judges_df.query("overall in ['A','B']")['overall'].value_counts()
n_overall = overall_counts.sum()
win_rate_A = overall_counts.get('A', 0) / n_overall
win_rate_B = overall_counts.get('B', 0) / n_overall

print(f"Overall win-rate: {label_a} = {win_rate_A:.2%}, {label_b} = {win_rate_B:.2%} (n={n_overall})")

criteria = ['relevance', 'completeness', 'creativity', 'safety_and_evidence', 'clarity']
for crit in criteria:
    # Keep rows where both the overall verdict and this criterion's verdict are A or B.
    mask = judges_df['overall'].isin(['A','B']) & judges_df[crit].isin(['A','B'])
    y_overall = judges_df.loc[mask, 'overall']
    y_crit = judges_df.loc[mask, crit]

    a = (y_crit == 'A').sum()
    b = (y_crit == 'B').sum()
    # Everything filtered out by the mask: ties on the criterion or on the overall verdict.
    ties = len(judges_df) - len(y_crit)

    print(f"\n--- {crit.capitalize()} Breakdown ---")
    print(f"{crit.capitalize()}: {label_a} = {a}, {label_b} = {b}, tie/skipped = {ties}")

    # Kappa is only computed when the criterion has both A and B verdicts;
    # otherwise pd.NA is reported (it prints as <NA>).
    if a and b:
        kappa = cohen_kappa_score(y_overall, y_crit, labels=['A', 'B'])
    else:
        kappa = pd.NA
    print(f"Cohen's Kappa (overall vs {crit}): {kappa:.2f}")



Overall win-rate: Baseline = 48.65%, Agent = 51.35% (n=37)

--- Relevance Breakdown ---
Relevance: Baseline = 2, Agent = 8, tie/skipped = 27
Cohen's Kappa (overall vs relevance): 1.00

--- Completeness Breakdown ---
Completeness: Baseline = 16, Agent = 18, tie/skipped = 3
Cohen's Kappa (overall vs completeness): 0.82

--- Creativity Breakdown ---
Creativity: Baseline = 10, Agent = 15, tie/skipped = 12
Cohen's Kappa (overall vs creativity): 0.92

--- Safety_and_evidence Breakdown ---
Safety_and_evidence: Baseline = 4, Agent = 7, tie/skipped = 26
Cohen's Kappa (overall vs safety_and_evidence): 0.65

--- Clarity Breakdown ---
Clarity: Baseline = 12, Agent = 10, tie/skipped = 15
Cohen's Kappa (overall vs clarity): 0.91


### baseline vs agent plan (tables)

In [24]:
from sklearn.metrics import cohen_kappa_score

# Summarise the judge verdicts in judges_tables_1.csv: overall win rates (reported as
# A = Baseline, B = Agent), then for each criterion the verdict split and Cohen's kappa
# between the criterion verdict and the overall verdict.
judges_df = pd.read_csv('./data/evaluation/fitness/judges_tables_1.csv')

# Only decisive overall verdicts (A or B) count toward the win rate.
decisive_rows = judges_df.query("overall in ['A','B']")
overall_counts = decisive_rows['overall'].value_counts()
n_overall = overall_counts.sum()
win_rate_A, win_rate_B = (overall_counts.get(side, 0) / n_overall for side in ('A', 'B'))

print(f"Overall win-rate: Baseline = {win_rate_A:.2%}, Agent = {win_rate_B:.2%} (n={n_overall})")

overall_is_decisive = judges_df['overall'].isin(['A','B'])
for crit in ['relevance', 'completeness', 'creativity', 'safety_and_evidence', 'clarity']:
    # Keep rows where both the overall verdict and this criterion's verdict are A or B.
    decisive = judges_df[overall_is_decisive & judges_df[crit].isin(['A','B'])]
    y_overall, y_crit = decisive['overall'], decisive[crit]

    a = y_crit.eq('A').sum()
    b = y_crit.eq('B').sum()
    # Everything filtered out above: ties on the criterion or on the overall verdict.
    ties = len(judges_df) - len(decisive)

    print(f"\n--- {crit.capitalize()} Breakdown ---")
    print(f"{crit.capitalize()}: Baseline = {a}, Agent = {b}, tie/skipped = {ties}")

    # Kappa is only computed when the criterion has both A and B verdicts;
    # otherwise pd.NA is reported (it prints as <NA>).
    kappa = cohen_kappa_score(y_overall, y_crit, labels=['A', 'B']) if a and b else pd.NA
    print(f"Cohen's Kappa (overall vs {crit}): {kappa:.2f}")



Overall win-rate: Baseline = 13.51%, Agent = 86.49% (n=37)

--- Relevance Breakdown ---
Relevance: Baseline = 2, Agent = 30, tie/skipped = 5
Cohen's Kappa (overall vs relevance): 1.00

--- Completeness Breakdown ---
Completeness: Baseline = 9, Agent = 25, tie/skipped = 3
Cohen's Kappa (overall vs completeness): 0.54

--- Creativity Breakdown ---
Creativity: Baseline = 15, Agent = 21, tie/skipped = 1
Cohen's Kappa (overall vs creativity): 0.37

--- Safety_and_evidence Breakdown ---
Safety_and_evidence: Baseline = 0, Agent = 26, tie/skipped = 11
Cohen's Kappa (overall vs safety_and_evidence): <NA>

--- Clarity Breakdown ---
Clarity: Baseline = 2, Agent = 25, tie/skipped = 10
Cohen's Kappa (overall vs clarity): -0.11


### RAG-free Agent vs RAG Agent (tables)

In [34]:
from sklearn.metrics import cohen_kappa_score

# Summarise the judge verdicts in judges_tables_2.csv (RAG-free agent vs RAG agent, table plans):
# overall win rates, then for each criterion the verdict split and Cohen's kappa between
# the criterion verdict and the overall verdict.
#
# The report previously printed "Baseline" / "Agent", which is wrong for this comparison:
# these verdicts come from judging 'agent_plan_tables_withoutrag' (passed first) against
# 'agent_plan_tables' (passed second), so both sides are agents.
# NOTE(review): assumes judge() labels its first plan argument "A" and its second "B"; confirm.
label_a, label_b = 'RAG-free agent', 'RAG agent'

judges_df = pd.read_csv('./data/evaluation/fitness/judges_tables_2.csv')

# Only decisive overall verdicts (A or B) count toward the win rate.
overall_counts = judges_df.query("overall in ['A','B']")['overall'].value_counts()
n_overall = overall_counts.sum()
win_rate_A = overall_counts.get('A', 0) / n_overall
win_rate_B = overall_counts.get('B', 0) / n_overall

print(f"Overall win-rate: {label_a} = {win_rate_A:.2%}, {label_b} = {win_rate_B:.2%} (n={n_overall})")

criteria = ['relevance', 'completeness', 'creativity', 'safety_and_evidence', 'clarity']
for crit in criteria:
    # Keep rows where both the overall verdict and this criterion's verdict are A or B.
    mask = judges_df['overall'].isin(['A','B']) & judges_df[crit].isin(['A','B'])
    y_overall = judges_df.loc[mask, 'overall']
    y_crit = judges_df.loc[mask, crit]

    a = (y_crit == 'A').sum()
    b = (y_crit == 'B').sum()
    # Everything filtered out by the mask: ties on the criterion or on the overall verdict.
    ties = len(judges_df) - len(y_crit)

    print(f"\n--- {crit.capitalize()} Breakdown ---")
    print(f"{crit.capitalize()}: {label_a} = {a}, {label_b} = {b}, tie/skipped = {ties}")

    # Kappa is only computed when the criterion has both A and B verdicts;
    # otherwise pd.NA is reported (it prints as <NA>).
    if a and b:
        kappa = cohen_kappa_score(y_overall, y_crit, labels=['A', 'B'])
    else:
        kappa = pd.NA
    print(f"Cohen's Kappa (overall vs {crit}): {kappa:.2f}")



Overall win-rate: Baseline = 39.39%, Agent = 60.61% (n=33)

--- Relevance Breakdown ---
Relevance: Baseline = 9, Agent = 20, tie/skipped = 8
Cohen's Kappa (overall vs relevance): 0.92

--- Completeness Breakdown ---
Completeness: Baseline = 21, Agent = 10, tie/skipped = 6
Cohen's Kappa (overall vs completeness): 0.39

--- Creativity Breakdown ---
Creativity: Baseline = 17, Agent = 14, tie/skipped = 6
Cohen's Kappa (overall vs creativity): 0.49

--- Safety_and_evidence Breakdown ---
Safety_and_evidence: Baseline = 0, Agent = 2, tie/skipped = 35
Cohen's Kappa (overall vs safety_and_evidence): <NA>

--- Clarity Breakdown ---
Clarity: Baseline = 8, Agent = 1, tie/skipped = 28
Cohen's Kappa (overall vs clarity): 0.18


### A sample query with its baseline plan, RAG agent plan, and RAG-free agent plan

### query

In [35]:
# Print the query text of the first evaluation record (index label 0).
print(records_df.loc[0, 'query'])

I'm new to structured weight training and want to focus on building muscle, particularly in my chest and back. I'm looking for a beginner-friendly program that I can commit to a few days a week to get started on my fitness journey.


### baseline plan

In [36]:
# Render the 'improved_tables' text stored at index label 0 of master_df as Markdown.
# NOTE(review): assumes row 0 of master_df belongs to the same sample as row 0 of records_df; confirm.
display(Markdown(master_df.loc[0, 'improved_tables']))

### Monday – Back & Chest
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Deadlifts | 3 | 3 | |
| Bench Press | 3 | 5 | |
| Pendlay Rows | 3 | 8–10 | |
| Incline Dumbbell Bench Press | 3 | 8–10 | |
| One Arm Dumbbell Rows | 2 | 20–25 | |
| Dips | 3 | AMAP | (As Many As Possible) |
| Barbell Shrugs | 3 | 12–15 | |

### Thursday – Back & Chest
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Pull Ups or Inverted Rows | 6 | AMAP | Rest–pause, (As Many As Possible) |
| Machine Chest Press | 6 | 8–12 | Rest–pause |
| Lat Pull Downs | 6 | 8–12 | Rest–pause |
| Dumbbell Flye or Pec Dec | 6 | 8–12 | Rest–pause |
| Dumbbell Shrugs | 6 | 10 | Rest–pause |

### Saturday – Shoulders & Arms
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Seated Overhead Press | 4 | 8–12 | |
| Upright Row or Seated Dumbbell Press | 3 | 8–12 | |
| Superset: Side Lateral Raise + Bent Over Reverse Flye | 3 | 10–15 | |
| Dumbbell Curl | 3 | 8–12 | |
| Skullcrushers | 3 | 8–12 | |
| EZ Bar Preacher Curl | 3 | 10–12 | |
| Cable Tricep Extensions | 3 | 10–12 | |

### Friday – Legs
| Exercise | Sets | Reps | Notes |
|---|---:|---:|---|
| Squats | 4 | 6–12 | |
| Hack Squat or Dumbbell Lunges | 3 | 8–12 | |
| Leg Extensions or Leg Press | 3 | 12–15 | |
| Stiff Leg Deadlift | 3 | 8–10 | |
| Leg Curls | 4 | 12–15 | |
| Seated Calf Raise | 4 | 12–20 | |

### RAG agent plan

In [37]:
# Render the 'agent_plan_tables' plan of the first evaluation record as Markdown.
display(Markdown(records_df.loc[0, 'agent_plan_tables']))

### Week 1 – Building Muscle (Beginner Focus)
| Exercise | Sets | Reps/Duration | Notes |
|---|---:|---:|---|
| Joint Mobility Warm-Up | 1 | 5-10 min | Dynamic warm up, prepare joints and muscles |
| Bodyweight Squats | 3 | 8-10 reps | Focus on form, sit back, knees open |
| Modified Push-Up | 3 | 8-10 reps | Use knees or incline for easier variation |
| Door Frame Row | 3 | 8-10 reps | Squeeze back, keep elbows close |
| Glute Bridge | 3 | 10-12 reps | Lift hips fully, engage glutes |
| Penguin Crunch | 3 | 15/side | Controlled, focus on core |
---

### Week 2 – Building Muscle (Beginner Focus)
| Exercise | Sets | Reps/Duration | Notes |
|---|---:|---:|---|
| Joint Mobility Warm-Up | 1 | 5-10 min | Dynamic warm up to loosen muscles |
| Bodyweight Squats | 3 | 8-10 reps | Keep control, avoid rushing reps |
| Modified Push-Up | 3 | 8-10 reps | Maintain steady pace |
| Door Frame Row | 3 | 8-10 reps | Focus on squeezing back muscles |
| Split Squat | 3 | 6-8/leg | Use support if needed for balance |
| Penguin Crunch | 3 | 15/side | Engage core, controlled motion |
| Glute Bridge | 3 | 10-12 reps | Hips up fully, activate glutes |
---

### Notes:
- Perform these workouts 2x per week, spaced evenly (e.g., Monday and Thursday).
- Rest 30-45 seconds between sets.
- Focus on proper form and gradual increase in reps over time.
- Incorporate 2-3 light cardio sessions on off days for overall conditioning.
- As you progress, consider adding light weights or resistance bands to increase resistance gradually.

### RAG-free agent plan

In [38]:
# Render the 'agent_plan_tables_withoutrag' plan of the first evaluation record as Markdown.
display(Markdown(records_df.loc[0, 'agent_plan_tables_withoutrag']))

### Day 1 – Chest & Back
| Exercise | Sets | Reps/Duration | Notes |
|---|---:|---:|---|
| Barbell Bench Press | 4 | 8–12 | Focus on controlled movement, progressive overload |
| Bent-Over Rows | 4 | 8–12 | Keep back flat, engage lats |
| Incline Dumbbell Press | 3 | 10–12 | Targets upper chest, use moderate weight |
| Lat Pulldown | 3 | 10–12 | Use neutral grip, squeeze back muscles |
| Push-Ups | 3 | AMAP | Variations to increase challenge |
| Seated Cable Rows | 3 | 10–12 | Use full range of motion, scapular retraction |

### Day 2 – Legs & Abs
| Exercise | Sets | Reps/Duration | Notes |
|---|---:|---:|---|
| Squats | 4 | 8–12 | Maintain proper form, depth for glutes and quads |
| Leg Press | 3 | 10–12 | Focus on controlled movement |
| Romanian Deadlifts | 3 | 8–10 | Targets hamstrings, keep back straight |
| Standing Calf Raises | 3 | 15–20 | Full stretch and contraction |
| Plank | 3 | 30–45s | Core stabilization |
| Leg Raises | 3 | 15 | Engage lower abs |

### Day 3 – Rest or Active Recovery

### Day 4 – Chest & Back (Focus on Volume)
| Exercise | Sets | Reps/Duration | Notes |
|---|---:|---:|---|
| Dumbbell Flat Bench Press | 4 | 8–12 | Use moderate weights, focus on control |
| Pull-Ups | 3 | AMAP | Use assistance if needed, focus on full range |
| Chest Dips | 3 | 10–12 | Lean forward slightly to target chest |
| Single-Arm Dumbbell Rows | 3 | 10–12 each arm | Good for unilateral strength |
| Pec Deck Machine | 3 | 12–15 | Isolate chest, stretch at start |
| Cable Face Pulls | 3 | 12–15 | Targets rear delts and upper back |

### Day 5 – Optional Full Body or Focused Weak Area
| Exercise | Sets | Reps/Duration | Notes |
|---|---:|---:|---|
| Deadlifts | 3 | 8–10 | Compound for posterior chain |
| Dumbbell Shoulder Press | 3 | 10–12 | Strengthens shoulders |
| Lunges | 3 | 10–12 per leg | Improve balance, glutes, quads |
| Bicep Curls | 3 | 12–15 | Accessory arm work |
| Tricep Dips | 3 | 12–15 | Focus on triceps activation |

*Schedule this routine 3–4 times weekly, with at least one rest day between sessions. Adjust volume based on recovery and progress.*