In [70]:
import pandas as pd

In [71]:
from openai import OpenAI

In [72]:
from dotenv import load_dotenv
import os

# load_dotenv() runs first so that a key kept in a local .env file is visible
# through the process environment (python-dotenv behaviour — confirm the .env
# location if the check below reports "No").
load_dotenv()
api_key = os.environ.get('OPENAI_API_KEY')
# Report only presence/absence; the key itself is never echoed into the output.
print("API Key loaded:", "Yes" if api_key else "No")

API Key loaded: Yes


In [73]:
from openai import OpenAI

# Shared API client used by every completion call in this notebook.
# No key is passed explicitly; presumably the SDK reads OPENAI_API_KEY from
# the environment — confirm against the installed openai version.
client = OpenAI()

In [74]:
# Read the source table, then convert it to a list with one plain dict per row
# so each record can be unpacked straight into the prompt template.
df = pd.read_csv('../data/data.csv')
documents = df.to_dict('records')

In [75]:
# Prompt used to generate ground-truth retrieval questions from one record.
# Single-brace fields are filled with str.format(**record); the doubled braces
# around the JSON example are escapes that render as literal "{" and "}".
# The wording is sent to the model verbatim, so spelling and grammar matter.
prompt_template = """
You emulate a user of our fitness assistant application.
Make the questions specific to this exercise.
Formulate 5 questions this user might ask based on the provided exercise record.
The record should contain the answer to the questions,
and the questions should be complete and not too short.
Use as few words as possible from the record.

The record:

exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [76]:
# Render the template for the first record only, as a one-off sample prompt.
# Keys of the record that the template does not reference are ignored by format().
prompt = prompt_template.format(**documents[0])

In [77]:
def llm(prompt, model='gpt-4o-mini'):
    """Send one user message to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.
        model: Name of the chat model to query. Defaults to ``'gpt-4o-mini'``,
            the value that was previously hard-coded, so existing calls of the
            form ``llm(prompt)`` behave as before.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first choice is read; no `n` is passed to request more.
    return response.choices[0].message.content

In [78]:
# Try the sample prompt on the model once before processing every document.
answers = llm(prompt)

In [79]:
import json

In [80]:
# Check that the sample reply parses as JSON; the parsed value is this cell's
# displayed output.
json.loads(answers)

{'questions': ['What is the correct starting position for push-ups?',
  'Which muscles are primarily targeted when performing push-ups?',
  'Do I need any equipment to do push-ups?',
  'How should I lower my body during a push-up?',
  'Can you explain the proper form for pushing back up in a push-up?']}

In [81]:
def generate_questions(doc):
    """Ask the model for user-style questions about a single record.

    Args:
        doc: Mapping that provides every field referenced by
            ``prompt_template``; additional keys are ignored by ``str.format``.

    Returns:
        The raw ``message.content`` of the first choice, i.e. a JSON string
        that the caller is expected to parse with ``json.loads``.

    Raises:
        KeyError: If ``doc`` is missing a field referenced by ``prompt_template``.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the reply is constrained to a single valid JSON object, so
        # a stray code fence or preamble can no longer break json.loads in the
        # caller. JSON mode requires the word "JSON" to appear in the messages;
        # prompt_template already asks for "parsable JSON".
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [82]:
from tqdm.auto import tqdm

In [83]:
# Maps a document id to its list of generated questions. Because it lives in
# its own cell, re-running the generation loop keeps the entries already
# collected, and the loop skips ids that are present here.
results = {}

In [84]:
# Fill `results` one record at a time. Ids already present are left alone, so
# this cell can be re-run after an interruption without repeating API calls.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        questions_raw = generate_questions(doc)
        questions = json.loads(questions_raw)
        # Keep only the list stored under the "questions" key of the reply.
        results[doc_id] = questions['questions']

  0%|          | 0/207 [00:00<?, ?it/s]

In [85]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) pair per
# question, following the iteration order of `results`.
final_results = [
    (doc_id, q)
    for doc_id, questions in results.items()
    for q in questions
]

In [86]:
# Spot-check the first (doc_id, question) pair.
final_results[0]

(0, 'What is the proper starting position for doing push-ups?')

In [92]:
# Tabulate the pairs: one row per question, with the source document's id
# in the 'id' column.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [95]:
# Save the ground-truth pairs as CSV; index=False leaves the DataFrame's row
# index out of the file, so only the 'id' and 'question' columns are written.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [96]:
# Peek at the first lines of the saved file (shell command run through "!").
!head ../data/ground-truth-retrieval.csv

id,question
0,What is the proper starting position for doing push-ups?
0,Which muscle groups are primarily worked during push-ups?
0,How do I perform a push-up correctly?
0,What type of exercise are push-ups classified as?
0,Do I need any equipment to do push-ups?
1,What are the primary muscle groups activated during squats?
1,Can I perform squats without any equipment?
1,What is the correct stance width for doing squats?
1,How should I position my chest while performing squats?
