In [9]:
import pandas as pd
import minsearch
import json
from tqdm.auto import tqdm

In [2]:
import os
from dotenv import load_dotenv
import openai
from openai import OpenAI

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI()

In [3]:
df = pd.read_csv("../data.csv")
documents = df.to_dict(orient='records')

In [4]:
prompt_template = """
    You emulate a user of our fitness assistant application.
    Formulate 5 questions this user might ask based on a provided exercise.
    Make the questions specific to this exercise.
    The record should contain the answer to the questions, and the questions should
    be complete and not too short. Use as fewer words as possible from the record. 
    
    The record:
    
    exercise_name: {exercise_name}
    type_of_activity: {type_of_activity}
    type_of_equipment: {type_of_equipment}
    body_part: {body_part}
    type: {type}
    muscle_groups_activated: {muscle_groups_activated}
    instructions: {instructions}
    
    Provide the output in parsable JSON without using code blocks:
    
    {{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [5]:
prompt = prompt_template.format(**documents[0])

In [6]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [14]:
questions = llm(prompt)

In [16]:
json.loads(questions)

{'questions': ['What is the starting position for performing push-ups?',
  'Which muscle groups are primarily activated during push-ups?',
  'Can push-ups be done without any equipment?',
  'What is the correct movement sequence for a push-up?',
  'Which body part is primarily targeted while doing push-ups?']}

In [8]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [11]:
results = {}
# Generate 5 question each FAQ (gonna be expensive)
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

  0%|          | 0/207 [00:00<?, ?it/s]

In [12]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [13]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [14]:
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [16]:
!head ../data/ground-truth-retrieval.csv

'head' is not recognized as an internal or external command,
operable program or batch file.
