In [15]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Location of the results CSV inside the DataTalksClub/llm-zoomcamp repository.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
# NOTE(review): `?raw=1` is presumably what makes GitHub return the file
# contents rather than the HTML "blob" page — confirm.
url = f'{github_url}?raw=1'
# Read the CSV directly from the URL into a DataFrame.
df = pd.read_csv(url)

In [9]:
# Work with only the first 300 rows from here on, then preview the frame.
df = df.head(300)
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


# Q1. Getting the embeddings model

In [8]:
from sentence_transformers import SentenceTransformer


# Name of the sentence-transformers model to load.
model_name = 'multi-qa-mpnet-base-dot-v1'
# Model instance used by the later cells to turn answer texts into vectors.
embedding_model = SentenceTransformer(model_name)

In [7]:
# Embed the LLM answer of the first row and show the vector's first component.
first_row = df.iloc[0]
answer_llm = first_row['answer_llm']
# encode() is given a one-element list, so the result is a batch with one row.
embedded_batch = embedding_model.encode([answer_llm])
v = embedded_batch[0]
v[0]

-0.42244673

In [11]:
# Show which container type holds a single embedding vector.
type(v)

numpy.ndarray

# Q2. Computing the dot product

In [17]:
# Dot-product score between each LLM answer and its original answer.
evaluations = []

answer_pairs = zip(df['answer_llm'], df['answer_orig'])
for llm_text, orig_text in tqdm(answer_pairs, total=len(df)):
    # Each text is encoded on its own; encode() returns a batch, so keep row 0.
    llm_v = embedding_model.encode([llm_text])[0]
    orig_v = embedding_model.encode([orig_text])[0]
    evaluations.append(llm_v.dot(orig_v))

100%|██████████| 300/300 [02:51<00:00,  1.75it/s]


In [21]:
import numpy as np

# 75th percentile of the dot-product scores collected above.
evaluations = np.array(evaluations)
p75 = np.percentile(evaluations, q=75)
p75

0.7949347198009491

# Q3. Computing the cosine similarity

In [19]:
def vector_normalization(v):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        v: Numeric NumPy array or array-like. The norm is computed over all
            of its elements.

    Returns:
        numpy.ndarray: ``v`` divided by its L2 norm. If the norm is zero
        (all-zero or empty input) the result is floating-point zeros rather
        than NaN, since such a vector has no direction to preserve.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Dividing by a zero norm would fill the result with NaN and emit a
        # RuntimeWarning. Dividing by 1.0 keeps the zeros and applies the same
        # int -> float promotion as the normal path.
        return v / 1.0
    return v / norm

In [23]:
# Same pairwise comparison as the dot-product cell, but each embedding is
# scaled to unit length first.
evaluations2 = []

answer_pairs = zip(df['answer_llm'], df['answer_orig'])
for llm_text, orig_text in tqdm(answer_pairs, total=len(df)):
    llm_unit = vector_normalization(embedding_model.encode([llm_text])[0])
    orig_unit = vector_normalization(embedding_model.encode([orig_text])[0])
    # The dot product of two unit-length vectors is their cosine similarity.
    evaluations2.append(llm_unit.dot(orig_unit))

100%|██████████| 300/300 [02:33<00:00,  1.96it/s]


In [24]:
import numpy as np

# 75th percentile of the normalized (cosine) scores.
# Note: this rebinds `p75`, replacing the dot-product percentile.
evaluations2 = np.array(evaluations2)
p75 = np.percentile(evaluations2, q=75)
p75

0.8362348973751068

# Q4. Rouge

In [25]:
%pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [31]:
from rouge import Rouge
# Scorer instance reused by the later ROUGE cells.
rouge_scorer = Rouge()

# Score one example: the row at position 10.
sample = df.iloc[10]
hypothesis, reference = sample['answer_llm'], sample['answer_orig']
# get_scores() returns a list; keep its first entry.
scores = rouge_scorer.get_scores(hypothesis, reference)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [32]:
# 'f' value of the 'rouge-1' entry in the `scores` dict.
scores['rouge-1']['f']

0.45454544954545456

# Q5. Average rouge score

In [34]:
# Mean of the 'f' values over the three entries of `scores`.
f_values = [metric['f'] for metric in scores.values()]
rouge_avg = sum(f_values) / 3
rouge_avg

0.35490034990035496

# Q6. Average rouge score for all the data points

In [36]:
# Per-row ROUGE 'f' values (rouge-1, rouge-2, rouge-l) plus their mean.
#
# The loop uses its own names (`row_scores`, `f_scores`) so it does not rebind
# the notebook-level `scores` and `rouge_avg` that earlier cells display.
# Otherwise, re-running those cells after this one would silently report the
# last row's values.
rouge_evaluations = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    # get_scores() returns a list; keep its first entry.
    row_scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    f_scores = {
        "rouge_1": row_scores['rouge-1']['f'],
        "rouge_2": row_scores['rouge-2']['f'],
        "rouge_l": row_scores['rouge-l']['f'],
    }
    # The right-hand side is evaluated before "rouge_avg" is inserted, so this
    # averages the three metrics only.
    f_scores["rouge_avg"] = sum(f_scores.values()) / len(f_scores)
    rouge_evaluations.append(f_scores)

100%|██████████| 300/300 [00:00<00:00, 444.32it/s]


In [38]:
# Convert the collected per-row score dicts into a DataFrame. This rebinds
# `rouge_evaluations` to the DataFrame.
rouge_evaluations = pd.DataFrame(rouge_evaluations)
# Preview the first rows.
rouge_evaluations.head()

0.20696501983423318

In [None]:
# Mean of the rouge_2 column across all scored rows.
rouge_evaluations['rouge_2'].mean()