In [30]:
import pandas as pd
import minsearch
from tqdm.auto import tqdm

## Ingestion

In [3]:
# Load the exercise dataset; the relative path is resolved against the kernel's working directory.
# NOTE(review): the ground-truth file later in this notebook is read from '../data/...';
# confirm that '../data.csv' (no 'data/' folder) is the intended location.
df = pd.read_csv("../data.csv")

In [24]:
# Preview the first rows to sanity-check what was loaded.
df.head()

Unnamed: 0,id,exercise_name,type_of_activity,type_of_equipment,body_part,type,muscle_groups_activated,instructions
0,0,Push-Ups,Strength,Bodyweight,Upper Body,Push,"Pectorals, Triceps, Deltoids",Start in a high plank position with your hands...
1,1,Squats,Strength,Bodyweight,Lower Body,Push,"Quadriceps, Glutes, Hamstrings",Stand with feet shoulder-width apart. Lower yo...
2,2,Plank,Strength/Mobility,Bodyweight,Core,Hold,"Rectus Abdominis, Transverse Abdominis",Start in a forearm plank position with your el...
3,3,Deadlift,Strength,Barbell,Lower Body,Pull,"Glutes, Hamstrings, Lower Back","Stand with feet hip-width apart, barbell in fr..."
4,4,Bicep Curls,Strength,Dumbbells,Upper Body,Pull,"Biceps, Forearms","Stand with a dumbbell in each hand, arms fully..."


In [5]:
# List the column names available for indexing.
df.columns

Index(['id', 'exercise_name', 'type_of_activity', 'type_of_equipment',
       'body_part', 'type', 'muscle_groups_activated', 'instructions'],
      dtype='object')

In [6]:
# Build the search index: every descriptive column is passed as a text field,
# while 'id' is passed as a keyword field.
index = minsearch.Index(
    text_fields=[
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    ],
    keyword_fields=['id'],
)

In [7]:
# Convert the frame into a list of per-row dicts (one dict per exercise record).
documents = df.to_dict(orient='records')

In [8]:
# Fit the index on the exercise records so it can be searched below.
index.fit(documents)

<minsearch.Index at 0x15aa3fb0260>

In [9]:
# Sample user question used to try out the RAG flow below.
query = 'give me exercises for hamstrings' 

## RAG Flow

In [10]:
import os
from dotenv import load_dotenv
import openai
from openai import OpenAI

# Load environment variables via python-dotenv; presumably a local .env file
# defines OPENAI_API_KEY — verify for your setup.
load_dotenv()
# Copy the key from the environment onto the module-level openai setting.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Client object used by the LLM call later in this notebook.
# NOTE(review): no api_key is passed here, so the client presumably reads
# OPENAI_API_KEY from the environment on its own — confirm for the SDK version in use.
client = OpenAI()

In [11]:
def search(query):
    """Return up to 10 index matches for `query`, with no filters and no field boosts."""
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [12]:
# Prompt sent to the LLM. `{question}` is the user query and `{context}` is the
# concatenation of the retrieved exercise entries. The body is written flush-left
# because `.strip()` only trims the ends of the string: any indentation inside the
# literal would otherwise be sent to the model verbatim.
prompt_template = """
You're a fitness coach. Answer the QUESTION based on the CONTEXT from the exercise database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# How a single retrieved exercise record is rendered inside the CONTEXT section.
# The placeholders match the text fields of the exercise records.
entry_template = """
exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}
""".strip()


def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved documents.

    Each document is formatted with `entry_template` and followed by a blank
    line; the joined text fills the `context` slot of `prompt_template`.
    Surrounding whitespace is stripped from the final prompt.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n"
        for doc in search_results
    ]
    context = "".join(rendered_entries)
    return prompt_template.format(question=query, context=context).strip()

In [13]:
def llm(prompt, model='gpt-4o'):
    """Send `prompt` as a single user message and return the model's reply text.

    Args:
        prompt: Full prompt string to send.
        model: Chat model name passed to the API. Defaults to 'gpt-4o', the
            value that used to be hard-coded, so existing calls behave the same.

    Returns:
        The `content` of the first choice in the chat completion response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [14]:
def rag(query):
    """Answer `query` end to end: retrieve documents, build the prompt, ask the LLM."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [29]:
# Run the full RAG pipeline on the sample query and show the generated text.
answer = rag(query)
print(answer)

Here are some exercises that target the hamstrings:

1. **Leg Curl**:
   - **Activity Type**: Strength
   - **Equipment**: Machine
   - **Instructions**: Lie face down on a leg curl machine and curl the pad towards your glutes, then return to the starting position.

2. **Hamstring Stretch**:
   - **Activity Type**: Mobility
   - **Equipment**: Bodyweight
   - **Instructions**: Stand upright and place one heel on a bench or step. Lean forward slightly to feel a stretch in the hamstring of the elevated leg.

3. **Machine Leg Curl**:
   - **Activity Type**: Strength
   - **Equipment**: Machine
   - **Instructions**: Sit on a leg curl machine and curl the pad down towards your glutes, then return to the starting position.

4. **Seated Leg Curl**:
   - **Activity Type**: Strength
   - **Equipment**: Machine
   - **Instructions**: Sit on a leg curl machine and curl the pad down towards your glutes, then return to the starting position.

5. **Prone Leg Curl**:
   - **Activity Type**: Strength

## Retrieval Evaluation

In [15]:
# Load the ground-truth questions; each row pairs a question with the id of a document.
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [17]:
# Preview the first ground-truth rows.
df_question.head()

Unnamed: 0,id,question
0,0,What is the correct starting position for doin...
1,0,Which muscle groups are primarily activated du...
2,0,How should I lower my body while performing a ...
3,0,What type of equipment do I need for push-ups?
4,0,Can you describe the motion involved in comple...


In [21]:
# One dict per ground-truth row, e.g. {'id': ..., 'question': ...}.
ground_truth = df_question.to_dict(orient='records')

In [23]:
# Inspect a single ground-truth record.
ground_truth[0]

{'id': 0,
 'question': 'What is the correct starting position for doing push-ups?'}

In [18]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: Sized collection with one entry per query; each entry
            is a sequence of booleans, True where the result at that position
            is the expected document.

    Returns:
        A float in [0, 1]. Returns 0.0 for an empty `relevance_total` instead
        of raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total
    
def mmr(relevance_total):
    """Mean reciprocal rank (MRR) of the first relevant result per query.

    The function keeps its historical name `mmr` so existing callers still work.

    Args:
        relevance_total: Sized collection with one entry per query; each entry
            is a sequence of booleans ordered by rank, True where the result
            at that rank is the expected document.

    Returns:
        The average over queries of 1 / rank of the first relevant result
        (0 for a query with no relevant result), a float in [0, 1]. Returns
        0.0 for an empty `relevance_total` instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:  # noqa: E712 - same equality test as `True in line`
                total_score += 1 / rank
                # Only the first relevant hit counts; summing later hits as
                # well would let a single query contribute more than 1.
                break

    return total_score / len(relevance_total)

In [26]:
def minsearch_search(query, boost=None):
    """Search the index for `query` and return up to 10 results.

    Args:
        query: Free-text query string.
        boost: Optional dict of per-field boost weights passed to the index as
            `boost_dict`. Defaults to no boosts, which matches the previous
            single-argument behaviour; accepting it here keeps the signature in
            line with the later redefinition that the parameter search uses.

    Returns:
        Whatever `index.search` returns for the query.
    """
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )
    return results

In [28]:
def evaluate(ground_truth, search_function):
    """Score `search_function` against the ground-truth records.

    For every record, `search_function(record)` is called and each returned
    document is marked True when its 'id' equals the record's 'id'. The
    per-record relevance lists are then summarised by `hit_rate` and `mmr`.

    Returns:
        A dict with the keys "hit_rate" and "MMR".
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        hits = search_function(record)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    scores = {}
    scores["hit_rate"] = hit_rate(relevance_total)
    scores["MMR"] = mmr(relevance_total)
    return scores

In [32]:
# Baseline retrieval quality: search with no boosts over the full ground-truth set.
evaluate(ground_truth, lambda q: minsearch_search(query=q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9497584541062802, 'MMR': 0.8445077064642286}

## Finding the best boost parameters

In [42]:
# Split the ground truth by position: the first 100 records are used for tuning
# the boosts, the remainder is set aside as a test split.
# NOTE(review): the split is not shuffled; the preview of the ground truth shows
# consecutive rows sharing the same id, so the first 100 rows likely cover only a
# small subset of documents — consider shuffling before slicing.
# NOTE(review): gt_test is not referenced in the cells visible here.
gt_val = ground_truth[:100]
gt_test = ground_truth[100:]

In [38]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes `objective_function` over `param_ranges`.

    Args:
        param_ranges: Dict mapping a parameter name to a `(min_val, max_val)`
            pair. When both bounds are ints the value is drawn with `randint`
            (inclusive on both ends); otherwise it is drawn with `uniform`.
        objective_function: Callable that takes a dict of sampled parameters
            and returns a score; higher is better.
        n_iterations: Number of random parameter sets to evaluate.
        seed: Optional seed for a private random generator, making the search
            reproducible. When None (the default) the module-level `random`
            functions are used, exactly as before.

    Returns:
        A `(best_params, best_score)` tuple. If no iteration yields a score
        greater than negative infinity (for example `n_iterations=0`), the
        result is `(None, float('-inf'))`.
    """
    # Use a dedicated generator only when a seed is requested, so that callers
    # relying on the global random state see unchanged behaviour.
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing: any real score beats -inf

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Strict comparison: on ties the earliest parameter set is kept.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [39]:
def minsearch_search(query, boost=None):
    """Search the index for `query`, optionally weighting fields with `boost`.

    `boost` is passed to the index as `boost_dict`; when it is None an empty
    dict (no boosts) is used. Up to 10 results are requested, with no filters.
    """
    boost_weights = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_weights,
        num_results=10,
    )

In [45]:
# Search space for the boost weights: every text field may take a weight
# anywhere between 0.0 and 3.0.
param_ranges = {
    field: (0.0, 3.0)
    for field in (
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    )
}

def objective(boost_params):
    """Score one set of boost weights: the 'MMR' value of `evaluate` on `gt_val`."""
    metrics = evaluate(
        gt_val,
        lambda record: minsearch_search(record['question'], boost_params),
    )
    return metrics['MMR']

In [47]:
# Try 20 random boost configurations and keep the one with the highest score.
# NOTE(review): the search draws from `random` and no seed is set in the cells
# visible here, so the reported best parameters may differ between runs.
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'exercise_name': 2.8664358956968776,
  'type_of_activity': 0.6639354730372857,
  'type_of_equipment': 0.5039263007577525,
  'body_part': 1.8790068047197304,
  'type': 0.40466074582783396,
  'muscle_groups_activated': 2.133852149769934,
  'instructions': 0.55384884006528},
 0.882)

In [50]:
def minsearch_improved(query):
    """Search the index for `query` using the tuned per-field boost weights.

    The weights are the values reported by the random search run earlier in
    this notebook. Up to 10 results are requested, with no filters.
    """
    tuned_boost = dict(
        exercise_name=2.8664358956968776,
        type_of_activity=0.6639354730372857,
        type_of_equipment=0.5039263007577525,
        body_part=1.8790068047197304,
        type=0.40466074582783396,
        muscle_groups_activated=2.133852149769934,
        instructions=0.55384884006528,
    )
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=tuned_boost,
        num_results=10,
    )

In [51]:
# Re-score retrieval with the tuned boosts.
# NOTE(review): this runs on all of ground_truth, which includes the gt_val records
# the boosts were tuned on; scoring gt_test instead would give a held-out estimate.
evaluate(ground_truth, lambda q: minsearch_improved(query=q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9545893719806763, 'MMR': 0.9264419906448894}