# Data Extraction

In [34]:
import re, pandas as pd
from glob import glob
from PyPDF2 import PdfReader
import time


In [35]:
# Metadata labels that appear as "Label: value" pairs in the plan text.
_METADATA_LABELS = (
    "Main Goal|Training Level|Program Duration|"
    "Days Per Week|Time Per Workout|Equipment|Author|Target Gender"
)

# Everything from the start of the text up to the line before the site banner.
_TABLES_PATTERN = re.compile(
    r'(?s)\A(.+?)(?=\nMUSCLEANDSTRENGTH\.COM)',
)

# One "Label: value" pair; the value runs (lazily, never across a colon) until
# the next known label or the end of the text.
# NOTE(review): because of that, the last field on the page can absorb
# unrelated trailing text that follows it.
_METADATA_PATTERN = re.compile(
    rf"(?P<label>{_METADATA_LABELS}):\s*"
    r"(?P<value>[^:]+?)(?="
    rf"(?:{_METADATA_LABELS}):|$)"
)

# Preferred description layout: banner, anything up to "Tools" + newline,
# then the description, ending right before the "Main Goal:" line.
_DESC_AFTER_TOOLS_PATTERN = re.compile(
    r"""MUSCLEANDSTRENGTH\.COM[\s\S]*?Tools\s*\n
    (?P<desc>[\s\S]+?)
    (?=\nMain\s+Goal:)
    """,
    re.VERBOSE | re.MULTILINE
)

# Fallback layout: the description starts on the line right after the banner.
_DESC_AFTER_BANNER_PATTERN = re.compile(
    r"MUSCLEANDSTRENGTH\.COM\s*\r?\n"
    r"(?P<desc>.*?)"
    r"(?=\r?\nMain\s+Goal:)",
    re.DOTALL
)


def extract_workout_from_pdf(pdf_path):
    """Extract the workout tables, metadata and description from one plan PDF.

    Only the text of the first page is read.

    Args:
        pdf_path: Path of the PDF file; also stored in the ``pdf_path`` column.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``pdf_path``,
        ``tables``, one column per metadata label found in the text, and
        ``Description``. ``tables`` and ``Description`` are ``None`` when the
        corresponding pattern does not match.
    """
    reader = PdfReader(pdf_path)
    pages = reader.pages
    # A PDF without pages, or a first page that yields no text, is treated as
    # empty text so the file still produces a row instead of raising.
    raw_text = (pages[0].extract_text() if len(pages) > 0 else "") or ""

    data = {'pdf_path': pdf_path}

    tables_match = _TABLES_PATTERN.search(raw_text)
    data['tables'] = tables_match.group(1).strip() if tables_match else None

    for field in _METADATA_PATTERN.finditer(raw_text):
        data[field.group("label").strip()] = field.group("value").strip()

    # Try the "Tools" layout first, then the plain banner layout.
    desc_match = (
        _DESC_AFTER_TOOLS_PATTERN.search(raw_text)
        or _DESC_AFTER_BANNER_PATTERN.search(raw_text)
    )
    data['Description'] = desc_match.group("desc").strip() if desc_match else None

    return pd.DataFrame.from_dict(data, orient='index').T


In [36]:
# Collect one single-row frame per sample PDF.
# - recursive=True makes '**' match any directory depth under ./samples
#   (without it, '**' behaves like a single '*').
# - sorted() makes the row order, and therefore the CSV, reproducible
#   regardless of the order the filesystem returns entries in.
pdf_paths = sorted(glob('./samples/**/*.pdf', recursive=True))
if not pdf_paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No PDF files found under ./samples")

all_plans = [extract_workout_from_pdf(pdf) for pdf in pdf_paths]

master_df = pd.concat(all_plans, ignore_index=True)
master_df.to_csv('workouts_data.csv', index=False)

In [43]:
# Reload the extracted plans from the CSV written by the extraction cell, then
# display them as the cell output.
master_df = pd.read_csv('workouts_data.csv')
master_df

Unnamed: 0,pdf_path,tables,Main Goal,Training Level,Program Duration,Days Per Week,Time Per Workout,Equipment,Author,Description,Target Gender
0,./samples/back/8weekchestback.pdf,Monday - Back & Chest\nExercise Sets Reps\nWor...,Build Muscle,Beginner,8 Weeks,4 Days,45-60 Mins,"Barbell, Bodyweight, \nDumbbells, Machines",Steve Shaw,This back and chest specialization workout is ...,
1,./samples/back/backandshoulderwomen.pdf,Workout Routine for Women\nExercise Sets Reps\...,Build Muscle,Beginner,6 Weeks,2 Days,45-60 Mins,"Bodyweight, \nDumbbells",Holly Blumenberg,BACK & SHOULDER WORKOUT ROUTINE \nFOR WOMEN\nT...,
2,./samples/back/cobraworkout.pdf,Monday: Super Thick Workout\nExercise Sets Rep...,Build Muscle,Intermediate,8 Weeks,2 Days,60-90 Mins,"Barbell, Bodyweight,\nCables, Dumbbells",Coach Dustin Myers,THE COBRA WORKOUT: \nHEAVY HIGH VOLUME BACK PR...,
3,./samples/begineer/12weekfullbodyworkoutroutin...,Weeks 1-3: Total Body Circuit Workout\nDuring ...,Build Muscle,Beginner,12 Weeks,3 Day,30-45 Mins,"Barbell, Bodyweight, \nCables, Dumbbells, Mach...",Doug Lawrenson,12 WEEK FULL BODY WORKOUT \nROUTINE FOR BEGINN...,Male & Female
4,./samples/begineer/4dayupperlowerplantfitnessw...,Planet Fitness Upper Body Workout A\nExercise ...,Build Muscle,Beginner,8 Weeks,4 Days,45 - 70 Mins,"Bodyweight, Cables, \nDumbbells, EZ Bar, Machines",Josh England,4 DAY UPPER/LOWER PLANET FITNESS\nWORKOUT (MAC...,Male & Female
5,./samples/begineer/bodyweightbasics.pdf,Day 1: Upper Body\nExercise Sets Reps\nPull Up...,General Fitness,Beginner,4 Weeks,3 Days,15-30 Mins,Bodyweight,Roger “Rock” Lockridge,BODYWEIGHT BASICS: \n3 DAY BODYWEIGHT WORKOUT ...,
6,./samples/begineer/thebest15minutewarmups.pdf,Exercise Sets Reps/Time Rest\nFast-Paced Walk ...,General Fitness,Beginner,,,15 Mins,"Bands, Barbell, \nBodyweight, Foam Roll, Kettl...",Roger “Rock” Lockridge\nFull-Body Focused Warm...,THE BEST 15-MINUTE WARM-UPS\nMaximize your wor...,Male & Female
7,./samples/chest/12weekstoabiggerbenchpress.pdf,Monday\nExercise Sets Reps\nBench Press 3 See ...,Increase Strength,Beginner,12 Weeks,3 Days,45-60 Mins,"Barbell, Bodyweight, \nDumbbells, EZ Bar, Machine",Jonathan Byrd,How much do you bench? If you feel it’s never ...,
8,./samples/chest/bestchestworkout.pdf,The Stretch & Push Workout\nExercise Sets Reps...,Build Muscle,Intermediate,8 Weeks,1 Day,45-60 Mins,"Barbell, Bodyweight,\nCables, Dumbbells",Roger “Rock” Lockridge,BEST CHEST WORKOUT: THE PUSH & \nSTRETCH METHO...,
9,./samples/chest/pecpounder8weekchestworkouttos...,The Hybrid PRRS™ Method\nExercise Tempo Sets R...,Build Muscle,Intermediate,8 Weeks,1 Day,45-60 Mins,"Barbell, Cables, \nDumbbells, Machines",Eric Broser,PEC-POUNDER: 8 WEEK CHEST WORKOUT \nTO SHATTER...,Male


# Create Queries

In [38]:
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment.
load_dotenv()

# API key for the Gemini clients; None when GEMINI_API_KEY is not set.
gemini_api_key = os.getenv("GEMINI_API_KEY")

In [39]:
from langchain_google_genai import ChatGoogleGenerativeAI

MODEL_NAME = "gemini-2.5-flash-preview-05-20"

# Three clients for the same model and API key that differ only in sampling
# temperature: 0, 0.5 and 1 (in that order).
gemini_chat, gemini_chat_half, gemini_chat_1 = (
    ChatGoogleGenerativeAI(model=MODEL_NAME, temperature=temperature, api_key=gemini_api_key)
    for temperature in (0, 0.5, 1)
)


In [40]:
from langchain_core.messages import SystemMessage, HumanMessage

def _is_missing_value(value):
    """Return True for None and float NaN (how pandas marks empty cells)."""
    return value is None or (isinstance(value, float) and value != value)


def create_prompt(metadata, model):
    """Ask ``model`` for a realistic user query matching one workout plan.

    The user message is the instruction sentence followed by one
    ``- key: value`` line per metadata field, each on its own line. Fields
    whose value is missing (None or NaN) are left out so the model does not
    see entries such as ``- Target Gender: nan``.

    Args:
        metadata: Mapping of field name to value for a single plan.
        model: Chat model object exposing ``invoke(messages)``.

    Returns:
        The ``content`` attribute of the model's response.
    """
    system_message = SystemMessage(
        content="You are a fitness coach recruiter creating user personas."
    )

    instruction = (
        "Given this workout plan metadata and description generate a realistic user query "
        "describing their goals, background, and constraints in 2-3 sentences."
    )
    field_lines = [
        f"- {key}: {value}"
        for key, value in metadata.items()
        if not _is_missing_value(value)
    ]
    # The instruction is joined as its own first line. Placing the literal
    # directly in front of "\n".join(...) would instead turn the whole
    # instruction into the join separator.
    user_message = HumanMessage(content="\n".join([instruction, *field_lines]))

    response = model.invoke([system_message, user_message])

    return response.content

In [41]:
# One dict per plan, restricted to the fields that are passed to the
# query-generation prompt.
metadata = master_df.loc[
    :,
    [
        "Main Goal",
        "Training Level",
        "Program Duration",
        "Days Per Week",
        "Time Per Workout",
        "Equipment",
        "Target Gender",
        "Description",
    ],
].to_dict(orient='records')

In [None]:
# Queries from the temperature-0 client, one per plan, in `metadata` order.
prompts = []
for plan_metadata in metadata:
    generated_query = create_prompt(plan_metadata, gemini_chat)
    prompts.append(generated_query)
    time.sleep(10)  # pause 10 seconds after every request

In [44]:
# Queries from the temperature-0.5 client, one per plan, in `metadata` order.
prompts_half = []
for plan_metadata in metadata:
    generated_query = create_prompt(plan_metadata, gemini_chat_half)
    prompts_half.append(generated_query)
    time.sleep(10)  # pause 10 seconds after every request

In [46]:
# Queries from the temperature-1 client, one per plan, in `metadata` order.
prompts_1 = []
for plan_metadata in metadata:
    generated_query = create_prompt(plan_metadata, gemini_chat_1)
    prompts_1.append(generated_query)
    time.sleep(10)  # pause 10 seconds after every request

In [59]:
# One column per sampling temperature; row i holds the queries for plan i.
query_df = pd.DataFrame(
    dict(
        query_0=prompts,
        query_half=prompts_half,
        query_1=prompts_1,
    )
)
# Persist without the index so the CSV contains only the three query columns.
query_df.to_csv('queries.csv', index=False)

In [60]:
# Display the generated queries (one column per temperature).
query_df

Unnamed: 0,query_0,query_half,query_1
0,"""As a beginner, I'm looking for an 8-week prog...","""I'm a beginner looking for an 8-week program ...","""As a beginner, I'm looking for an 8-week prog..."
1,"""As a beginner woman, I'm looking for a 6-week...","""As a beginner woman, I'm looking for a 6-week...","I'm a beginner woman looking to build muscle, ..."
2,"""I'm an intermediate lifter seeking an 8-week ...","""I'm an intermediate lifter focused on buildin...","""I'm an intermediate lifter seeking an 8-week ..."
3,"""I'm a complete beginner looking to build musc...","""I'm a complete beginner aiming to build muscl...","""I'm a complete beginner looking to build musc..."
4,"""I'm a beginner looking to build muscle over t...","""As a beginner, I'm looking for an 8-week prog...","""I'm a beginner looking to build muscle over t..."
5,"""I'm a complete beginner looking to improve my...","""I'm a complete beginner looking to improve my...","""I'm a complete beginner looking to establish ..."
6,"""I'm a beginner looking for a quick and effect...","""I'm a beginner looking for quick and effectiv...","""I'm a fitness beginner looking to improve my ..."
7,"""As a beginner, I'm looking for a structured 1...",I'm a beginner lifter who feels my bench press...,"""I'm a beginner looking for a focused program ..."
8,"""I'm an intermediate lifter focused on buildin...","""As an intermediate lifter, I'm aiming to buil...","""I'm an intermediate lifter focused on buildin..."
9,"""As an intermediate male lifter, I'm really fr...","""I'm an intermediate male lifter who's hit a p...","""As an intermediate male lifter, I'm really fr..."


# LLM as a Judge

In [50]:
# Reload the saved queries from disk so this section can start from the CSV.
query_df = pd.read_csv('queries.csv')

In [30]:
from pydantic import BaseModel, Field
from typing import Literal

# The three verdicts allowed for every criterion.
Verdict = Literal["A", "B", "Tie"]


class PlanComparison(BaseModel):
    # Structured result of one pairwise comparison: a verdict per criterion,
    # an overall verdict, and a free-text rationale.
    # (Documented with comments rather than a docstring so the schema
    # generated from this class is left unchanged.)

    relevance: Verdict = Field(
        description="Which plan better aligns with the persona's stated goals",
    )
    completeness: Verdict = Field(
        description="Which plan provides more thorough information",
    )
    creativity: Verdict = Field(
        description="Which plan shows more novel or varied exercise choices",
    )
    safety_and_evidence: Verdict = Field(
        description="Which plan is safer and more grounded in exercise science",
    )
    clarity: Verdict = Field(
        description="Which plan is more clearly written and well-organized",
    )
    overall: Verdict = Field(
        description="Overall better plan considering all criteria",
    )
    rationale: str = Field(
        description="A brief, objective explanation of the judgment",
    )


In [31]:
from langchain_core.messages import SystemMessage, HumanMessage

# Judge model: the temperature-0 client wrapped so its output follows the
# PlanComparison schema.
structured_model = gemini_chat.with_structured_output(PlanComparison)

def judge(query, baseline_plan, agent_plan):
    """Have the judge model compare two plans for one user persona.

    Args:
        query: The user persona text placed at the top of the prompt.
        baseline_plan: Plan text presented to the judge as "Plan A".
        agent_plan: Plan text presented to the judge as "Plan B".

    Returns:
        The result of ``structured_model.invoke`` for the two-message prompt.
    """
    comparison_request = f"""Given the User Persona:
{query}

Plan A:
{baseline_plan}

Plan B:
{agent_plan}

Evaluate each plan on five dimensions: Relevance, Completeness, Creativity, Safety and Evidence-Grounding, and Clarity and Organization. without bias toward presentation order, response length, or plan names.

For each dimension, choose “A” if Plan A is stronger, “B” if Plan B is stronger, or “Tie” if they are equal. Then decide the overall winner in the same way. Finally, provide a concise rationale.

Output must conform to the PlanComparison schema."""

    # System role first, then the comparison request as the human turn.
    conversation = [
        SystemMessage(content="You are an expert fitness coach."),
        HumanMessage(content=comparison_request),
    ]
    return structured_model.invoke(conversation)

In [None]:
# NOTE(review): `agent` is not defined anywhere in this notebook and the import
# below is commented out — restore the correct import before running this cell.
# import agent

# Snapshot the query columns before the loop. This cell adds `plan_*` and
# `judge_*` columns to query_df, so iterating over `query_df.columns` directly
# would also treat those as queries when the cell is run a second time.
query_columns = [col for col in query_df.columns if str(col).startswith('query_')]

for col in query_columns:
    plans = []
    judges = []
    for i in range(len(query_df)):
        query = query_df[col][i]
        # The agent input is passed as a dict payload; a bare `"query": value`
        # inside the call parentheses is not valid Python.
        agent_plan = agent.invoke({"query": query})
        time.sleep(10)  # pause between API calls
        plans.append(agent_plan)
        # Plan A is the extracted table text of the same row, Plan B the agent's plan.
        judges.append(judge(query, master_df['tables'][i], agent_plan))
        time.sleep(10)
    query_df[f'plan_{col}'] = plans
    query_df[f'judge_{col}'] = judges

# After the loop, `judges` holds only the judgments for the last query column.

# Analysis

In [30]:
from sklearn.metrics import cohen_kappa_score

def _judgment_record(judgment):
    """Return one judgment as a plain dict (accepts pydantic models or dicts)."""
    if hasattr(judgment, "model_dump"):  # pydantic v2 model
        return judgment.model_dump()
    if hasattr(judgment, "dict"):  # pydantic v1 model
        return judgment.dict()
    return judgment


# Build the frame from plain records so the columns are the judgment fields.
df = pd.DataFrame([_judgment_record(j) for j in judges])

# Verdicts that name a winner; "Tie" is excluded from win counts and kappa.
decided = ['A', 'B']

overall_counts = df.loc[df['overall'].isin(decided), 'overall'].value_counts()
n_overall = overall_counts.sum()
# With no decided comparison the win rate is undefined; report NaN explicitly.
win_rate_A = overall_counts.get('A', 0) / n_overall if n_overall else float('nan')
win_rate_B = overall_counts.get('B', 0) / n_overall if n_overall else float('nan')

print(f"Overall win-rate: Baseline = {win_rate_A:.2%}, Agent = {win_rate_B:.2%} (n={n_overall})")

criteria = ['relevance', 'completeness', 'creativity', 'safety_and_evidence', 'clarity']
breakdown = {}
for crit in criteria:
    print(f"\n--- {crit.capitalize()} Breakdown ---")

    cnts = df.loc[df[crit].isin(decided), crit].value_counts()
    a = cnts.get('A', 0)
    b = cnts.get('B', 0)
    print(f"{crit.capitalize()}: Baseline = {a}, Agent = {b}, tie/skipped = {len(df) - (a+b)}")

    # Kappa needs paired observations: keep only the rows where BOTH the
    # overall verdict and this criterion name a winner. Dropping ties from
    # each column separately would pair up verdicts from different rows, or
    # produce inputs of different lengths.
    both_decided = df['overall'].isin(decided) & df[crit].isin(decided)
    if both_decided.any():
        kappa = cohen_kappa_score(
            df.loc[both_decided, 'overall'],
            df.loc[both_decided, crit],
        )
    else:
        kappa = float('nan')
    print(f"Cohen's Kappa (overall vs {crit}): {kappa:.2f}")

    breakdown[crit] = {'A_wins': a, 'B_wins': b, 'kappa': kappa}



Overall win-rate: Baseline = 50.00%, Agent = 50.00% (n=2)

--- Relevance Breakdown ---
Relevance: Baseline = 1, Agent = 1, tie/skipped = 0
Cohen's Kappa (overall vs relevance): 1.00

--- Completeness Breakdown ---
Completeness: Baseline = 1, Agent = 1, tie/skipped = 0
Cohen's Kappa (overall vs completeness): -1.00

--- Creativity Breakdown ---
Creativity: Baseline = 1, Agent = 1, tie/skipped = 0
Cohen's Kappa (overall vs creativity): 1.00

--- Safety_and_evidence Breakdown ---
Safety_and_evidence: Baseline = 1, Agent = 1, tie/skipped = 0
Cohen's Kappa (overall vs safety_and_evidence): -1.00

--- Clarity Breakdown ---
Clarity: Baseline = 1, Agent = 1, tie/skipped = 0
Cohen's Kappa (overall vs clarity): 1.00
