In [69]:
!python -m wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py


Saved under minsearch (1).py


In [1]:
import pandas as pd

## Ingestion

In [2]:
df = pd.read_csv('../data/data.csv')

In [3]:
df.insert(0, 'id', df.index)

ValueError: cannot insert id, already exists

In [None]:
del df['ID']

In [None]:
df.to_csv('../data/data.csv', index=False)

In [None]:
documents = df.to_dict(orient='records')

In [None]:
df.columns

In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [5]:
documents = df.to_dict(orient='records')

In [6]:
import minsearch

In [7]:
index = minsearch.Index(
    text_fields=['exercise_name', 'type_of_activity', 'type_of_equipment', 'body_part',
       'type', 'muscle_groups_activated', 'instructions'],
    keyword_fields=['id']
)

In [8]:
index.fit(documents)

<minsearch.Index at 0x1a67fdaa9d0>

## RAG flow

In [1]:
# import os
# os.environ['OPENAI_API_KEY'] = 'key'

In [11]:
from openai import OpenAI

client = OpenAI()

In [12]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [13]:
prompt_template = """
You're a fitness insrtuctor. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [14]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [15]:
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    #print(prompt)
    answer = llm(prompt, model=model)
    return answer

In [16]:
question = 'Is the Lat Pulldown considered a strength training activity, and if so, why?'
answer = rag(question)
print(answer)

Yes, the Lat Pulldown is considered a strength training activity because it targets major muscle groups, specifically the Latissimus Dorsi and Biceps, using resistance through a machine. This exercise involves pulling movements that help build muscular strength in the upper body.


## Retrieval evaluation

In [17]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [18]:
df_question.head()

Unnamed: 0,id,question
0,0,What equipment do I need to perform push-ups?
1,0,Which muscle groups are primarily worked durin...
2,0,Can you describe the starting position for pus...
3,0,What motion should I follow after lowering my ...
4,0,How should I position my hands when doing push...


In [19]:
ground_truth = df_question.to_dict(orient='records')

In [20]:
ground_truth[0]

{'id': 0, 'question': 'What equipment do I need to perform push-ups?'}

In [21]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [22]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [23]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [24]:
from tqdm.auto import tqdm

In [25]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9449275362318841, 'mrr': 0.8339897247143628}

## Finding the best parameters

In [26]:
df_validation = df_question[:100]
df_test = df_question[100:]

In [27]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [28]:
gt_val = df_validation.to_dict(orient='records')

In [29]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results


In [30]:
param_ranges = {
    'exercise_name': (0.0, 3.0),
    'type_of_activity': (0.0, 3.0),
    'type_of_equipment': (0.0, 3.0),
    'body_part': (0.0, 3.0),
    'type': (0.0, 3.0),
    'muscle_groups_activated': (0.0, 3.0),
    'instructions': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [31]:
simple_optimize(param_ranges, objective, n_iterations=10)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'exercise_name': 1.5000715012549362,
  'type_of_activity': 0.9016829174931699,
  'type_of_equipment': 0.4039822610747035,
  'body_part': 1.0686551463644418,
  'type': 0.396001907388713,
  'muscle_groups_activated': 1.5728626387190778,
  'instructions': 0.15899407644772012},
 0.8711111111111112)

In [32]:
def minsearch_improved(query):
    boost = {
        'exercise_name': 2.11,
        'type_of_activity': 1.46,
        'type_of_equipment': 0.65,
        'body_part': 2.65,
        'type': 1.31,
        'muscle_groups_activated': 2.54,
        'instructions': 0.74
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9400966183574879, 'mrr': 0.9050785982670041}

## RAG evaluation

In [33]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [34]:
len(ground_truth)

1035

In [38]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

In [39]:
print(answer_llm)

To perform standard push-ups, you need no equipment other than your bodyweight. For variations such as TRX Push-Ups, you would need a TRX system, and for Push-Up to Row, you would require dumbbells.


In [40]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What equipment do I need to perform push-ups?
Generated Answer: To perform standard push-ups, you need no equipment other than your bodyweight. For variations such as TRX Push-Ups, you would need a TRX system, and for Push-Up to Row, you would require dumbbells.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [41]:
llm(prompt)

'{\n  "Relevance": "RELEVANT",\n  "Explanation": "The generated answer addresses the question directly by stating that no equipment is needed for standard push-ups and provides additional information about equipment required for variations, which is relevant to the topic of performing push-ups."\n}'

In [42]:
import json

In [43]:
df_sample = df_question.sample(n=200, random_state=1)

In [44]:
sample = df_sample.to_dict(orient='records')

In [45]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [46]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [47]:
df_eval.relevance.value_counts(normalize=True)

RELEVANT           0.875
PARTLY_RELEVANT    0.120
NON_RELEVANT       0.005
Name: relevance, dtype: float64

In [48]:
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [49]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
52,"Yes, the Cable Lateral Raise is considered a p...",100,Is the Cable Lateral Raise considered a push e...,NON_RELEVANT,The Cable Lateral Raise is actually considered...
