In [1]:
import pandas as pd

In [2]:
github_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/04-monitoring/data/results-gpt4o-mini.csv'
url = f'{github_url}?raw=1'
df = pd.read_csv(url)
df = df.iloc[:300]

In [3]:
from sentence_transformers import SentenceTransformer
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm
You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





## Q1

In [11]:
answer_llm = df.iloc[0].answer_llm
answer_embedding = embedding_model.encode(answer_llm)

In [12]:
answer_embedding[0]

-0.42244655

In [15]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


## Q2

In [19]:
evaluations = []
for i in tqdm(range(df.shape[0])):
    answer_llm = df.iloc[i].answer_llm
    answer_orig = df.iloc[i].answer_orig
    llm_embedding = embedding_model.encode(answer_llm)
    orig_embedding = embedding_model.encode(answer_orig)
    dot_score = llm_embedding.dot(orig_embedding)
    evaluations.append(dot_score)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [03:12<00:00,  1.56it/s]


In [10]:
import numpy as np

In [23]:
percentile_75th = np.percentile(evaluations, 75)

In [24]:
percentile_75th

31.67430877685547

## Q3

In [26]:
def get_norm(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

In [28]:
normalized_eval = []
for i in tqdm(range(df.shape[0])):
    answer_llm = df.iloc[i].answer_llm
    answer_orig = df.iloc[i].answer_orig
    llm_embedding = get_norm(embedding_model.encode(answer_llm))
    orig_embedding = get_norm(embedding_model.encode(answer_orig))
    dot_score = llm_embedding.dot(orig_embedding)
    normalized_eval.append(dot_score)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [03:07<00:00,  1.60it/s]


In [29]:
percentile_75th_normalized = np.percentile(normalized_eval, 75)

In [30]:
percentile_75th_normalized

0.8362348973751068

## Q4

In [7]:
from rouge import Rouge
rouge_scorer = Rouge()

r_answer_llm = df.iloc[10].answer_llm
r_answer_orig = df.iloc[10].answer_orig
scores = rouge_scorer.get_scores(r_answer_llm, r_answer_orig)[0]

In [8]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q5

In [13]:
avg_scores = np.mean([scores[key]['f'] for key in scores])

In [14]:
avg_scores

0.35490034990035496

## Q6

In [16]:
scores_ls = []

for i in tqdm(range(df.shape[0])):
    s1 = df.iloc[i].answer_llm
    s2 = df.iloc[i].answer_orig
    score = rouge_scorer.get_scores(r_answer_llm, r_answer_orig)[0]
    scores_ls.append(score['rouge-2']['f'])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 616.20it/s]


In [17]:
np.mean(scores_ls)

0.21621621121621634