In [2]:
import pandas as pd

In [3]:
# Results file of the gpt-4o-mini run, stored in the course repository on GitHub.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
# The "?raw=1" query is appended so the link points at the file contents.
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [4]:
# Keep only the first 300 rows for the evaluation below.
df = df.head(300)

In [6]:
df

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
295,An alternative way to load the data using the ...,Above users showed how to load the dataset dir...,8d209d6d,What is an alternative way to load the data us...,machine-learning-zoomcamp
296,You can directly download the dataset from Git...,Above users showed how to load the dataset dir...,8d209d6d,How can I directly download the dataset from G...,machine-learning-zoomcamp
297,You can fetch data for homework using the `req...,Above users showed how to load the dataset dir...,8d209d6d,Could you share a method to fetch data for hom...,machine-learning-zoomcamp
298,If the status code is 200 when downloading dat...,Above users showed how to load the dataset dir...,8d209d6d,What should I do if the status code is 200 whe...,machine-learning-zoomcamp


In [7]:
from sentence_transformers import SentenceTransformer 
# Name of the pretrained sentence-embedding checkpoint used for all similarity scores.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
# Take the LLM-generated answer of the first row as a sample to embed.
first_row = df.iloc[0]
answer_llm = first_row['answer_llm']
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [12]:
# Embed the sample answer and show the first component of the resulting vector.
answer_llm_encoded = embedding_model.encode(answer_llm)
answer_llm_encoded[0]

-0.42244673

In [22]:
# One dict per row (column name -> value). Despite its name, result_df is a
# plain list of dicts, not a DataFrame; the loops below iterate over it.
result_df = df.to_dict(orient='records')

In [13]:
def compute_similarity(record):
    """Return the raw dot-product similarity between a record's two answers.

    Parameters
    ----------
    record : mapping
        Must provide the keys ``'answer_orig'`` (reference answer) and
        ``'answer_llm'`` (LLM-generated answer).

    Returns
    -------
    The dot product of the two embeddings produced by ``embedding_model``.
    The vectors are not normalized, so the score is not bounded to [-1, 1].
    """
    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = embedding_model.encode(generated_text)
    reference_vec = embedding_model.encode(reference_text)
    return generated_vec.dot(reference_vec)

In [17]:
from tqdm.auto import tqdm

In [24]:
# Raw (un-normalized) dot-product similarity for every record, with a progress bar.
evaluations = [compute_similarity(record) for record in tqdm(result_df)]

100%|██████████| 300/300 [02:06<00:00,  2.38it/s]


In [32]:
import numpy as np

In [34]:
# Quartiles (25th, 50th and 75th percentiles) of the raw dot-product scores.
quartile_points = [25, 50, 75]
percentiles = np.percentile(evaluations, quartile_points)
percentiles

array([24.30784178, 28.33687401, 31.67430735])

In [35]:
result_df

[{'answer_llm': 'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).',
  'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
  'document': '0227b872',
  'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp'},
 {'answer_llm': 'You can sign up using the link provided in the course GitHub repository: [https://airtable.com/shryxwLd0COOEaqXo](https://airtable.com/shryxwLd0COOEaqXo).',
  'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our

In [36]:
def normalization(v):
    """Scale ``v`` to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Vector to normalize. Anything ``np.asarray`` accepts works, e.g. a
        NumPy array or a plain list of numbers.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by the square root of the sum of its squared elements.
        An all-zero (or empty) input comes back as zeros rather than NaNs,
        because such a vector has no direction to preserve.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    # Dividing by a zero norm would produce NaNs (plus a RuntimeWarning) that
    # silently poison every similarity score computed from the result.
    if norm == 0:
        norm = 1.0
    return v / norm


In [37]:
def compute_similarity_normalized(record):
    """Return the cosine similarity between a record's two answers.

    Both embeddings are passed through ``normalization`` before the dot
    product is taken, so the result is the dot product of unit-scaled vectors.

    Parameters
    ----------
    record : mapping
        Must provide the keys ``'answer_orig'`` (reference answer) and
        ``'answer_llm'`` (LLM-generated answer).

    Returns
    -------
    The dot product of the two normalized embeddings.
    """
    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_unit = normalization(embedding_model.encode(generated_text))
    reference_unit = normalization(embedding_model.encode(reference_text))
    return generated_unit.dot(reference_unit)

In [38]:
# Similarity on normalized embeddings for every record, with a progress bar.
evaluations_normalized = [
    compute_similarity_normalized(record) for record in tqdm(result_df)
]

100%|██████████| 300/300 [01:58<00:00,  2.52it/s]


In [39]:
# Quartiles (25th, 50th and 75th percentiles) of the normalized similarity scores.
normalized_quartile_points = [25, 50, 75]
percentiles_normalized = np.percentile(evaluations_normalized, normalized_quartile_points)
percentiles_normalized

array([0.65127344, 0.76376092, 0.83623482])

In [43]:
# Row at position 10, used below as the sample record for the ROUGE demo.
r = df.iloc[10]

In [44]:
from rouge import Rouge
rouge_scorer = Rouge()

# Score the generated answer (hypothesis) against the original answer (reference).
hypothesis, reference = r['answer_llm'], r['answer_orig']
# get_scores returns a list; a single pair was passed, so take its first entry.
scores = rouge_scorer.get_scores(hypothesis, reference)[0]

In [45]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [47]:
# Average the 'f' values across every metric in the sample's ROUGE scores.
f_scores = [metric['f'] for metric in scores.values()]
average_f_score = sum(f_scores) / len(f_scores)
average_f_score

0.35490034990035496

In [50]:
# Per-record ROUGE scores, flattened to one row per record with the columns
# 'rouge-1_r', 'rouge-1_p', 'rouge-1_f', 'rouge-2_r', ..., 'rouge-l_f'.
results = []

# Distinct names (record, record_scores, rouge_df) keep this cell from
# overwriting `r`, `scores` and `df` set by earlier cells, so those cells
# still work when re-run after this one.
for record in result_df:
    record_scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
    results.append({
        f'{metric}_{stat}': record_scores[metric][stat]
        for metric in ('rouge-1', 'rouge-2', 'rouge-l')
        for stat in ('r', 'p', 'f')
    })

# Build the frame once from the list of row dicts.
rouge_df = pd.DataFrame(results)

# Calculate the average ROUGE-2 F1-score
average_rouge_2_f = rouge_df['rouge-2_f'].mean()

print("Average ROUGE-2 F1-score:", average_rouge_2_f)

Average ROUGE-2 F1-score: 0.20696501983423318
