# Load data

In [1]:
import pandas as pd

# Load the stored gpt-4o-mini evaluation results from disk.
df_gpt4o_mini = pd.read_csv('data/results-gpt4o-mini.csv')
# Work on the first 300 rows. The explicit .copy() makes `df` an independent
# frame, so the later cells that add score columns to it do not trigger
# pandas' SettingWithCopyWarning (a plain .iloc slice may be a view of the
# full frame).
df = df_gpt4o_mini.iloc[:300].copy()
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


# Q1. Getting the embeddings model

In [2]:
from sentence_transformers import SentenceTransformer

# Identifier of the embedding model handed to SentenceTransformer below.
model_name = 'multi-qa-mpnet-base-dot-v1'
# Shared model instance; later cells call model.encode() on the answer texts.
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [4]:
# LLM-generated answer of the first record (positional row 0).
answer_llm = df['answer_llm'].iloc[0]

## Q1 Solution

In [5]:
# Embed the first LLM answer; the cell displays the resulting vector.
model.encode(answer_llm)

array([-4.22446430e-01, -2.24856049e-01, -3.24058473e-01, -2.84758627e-01,
        7.25684036e-03,  1.01186633e-01,  1.03716515e-01, -1.89983338e-01,
       -2.80601084e-02,  2.71588773e-01, -1.15337394e-01,  1.14666238e-01,
       -8.49588290e-02,  3.32365125e-01,  5.52724749e-02, -2.22195625e-01,
       -1.42541066e-01,  1.02519169e-01, -1.52333736e-01, -2.02912390e-01,
        1.98425800e-02,  8.38148370e-02, -5.68631828e-01,  2.32843328e-02,
       -1.67293012e-01, -2.39256814e-01, -8.05461258e-02,  2.57084910e-02,
       -8.15466940e-02, -7.39289895e-02, -2.61550218e-01,  1.92574356e-02,
        3.22909087e-01,  1.90357015e-01, -9.34726413e-05, -2.13165745e-01,
        2.88944189e-02, -1.79530494e-02, -5.92763498e-02,  1.99918285e-01,
       -4.75169867e-02,  1.71633631e-01, -2.45911796e-02, -9.38056931e-02,
       -3.57002854e-01,  1.33263797e-01,  1.94046006e-01, -1.18530825e-01,
        4.56915170e-01,  1.47728220e-01,  3.35945159e-01, -1.86959490e-01,
        2.45954573e-01, -

# Q2. Computing the dot product

In [6]:
# Convert the frame to a list with one dict per row, so each record can be
# passed on its own to the per-record scoring functions used in later cells.
results_gpt4o_mini = df.to_dict(orient='records')

In [9]:
from tqdm.auto import tqdm

def compute_similarity(record):
    """Return the dot product of the embeddings of a record's two answers.

    ``record`` must provide the keys ``'answer_orig'`` and ``'answer_llm'``.
    Both texts are encoded with the module-level ``model``; the vectors are
    not normalized here, so the result is a raw dot product rather than a
    cosine similarity.
    """
    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    reference_vec = model.encode(reference_text)
    generated_vec = model.encode(generated_text)

    return generated_vec.dot(reference_vec)


# Holds one dot-product score per record of results_gpt4o_mini.
similarities = []

In [10]:
# Build the score list from scratch on every execution. Appending to a list
# created in a different cell meant that re-running this cell doubled its
# length, which then no longer matched the number of rows in `df`.
similarities = [compute_similarity(record) for record in tqdm(results_gpt4o_mini)]

  0%|          | 0/300 [00:00<?, ?it/s]

In [11]:
# NOTE: despite the column name, these values are the unnormalized dot
# products returned by compute_similarity; the normalized (true cosine)
# scores are stored separately under 'cosine_norm'.
df['cosine'] = similarities
df['cosine'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cosine'] = similarities


count    300.000000
mean      27.495996
std        6.384742
min        4.547927
25%       24.307843
50%       28.336865
75%       31.674307
max       39.476013
Name: cosine, dtype: float64

# Q3. Computing the cosine

In [17]:
import numpy as np

def compute_similarity_norm(record):
    """Return the cosine similarity of the embeddings of a record's answers.

    ``record`` must provide the keys ``'answer_orig'`` and ``'answer_llm'``.
    Each text is encoded with the module-level ``model``, scaled to unit
    Euclidean length, and the dot product of the two unit vectors is returned.
    """
    def _to_unit(vec):
        # Divide by the Euclidean norm: sqrt of the sum of squared components.
        return vec / np.sqrt((vec * vec).sum())

    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    reference_unit = _to_unit(model.encode(reference_text))
    generated_unit = _to_unit(model.encode(generated_text))

    return generated_unit.dot(reference_unit)

# Holds one normalized (cosine) score per record of results_gpt4o_mini.
similarities_norm = []

In [18]:
# Build the score list from scratch on every execution. Appending to a list
# created in a different cell meant that re-running this cell doubled its
# length, which then no longer matched the number of rows in `df`.
similarities_norm = [compute_similarity_norm(record) for record in tqdm(results_gpt4o_mini)]

  0%|          | 0/300 [00:00<?, ?it/s]

In [19]:
# Attach the normalized similarity scores as a column and summarize their
# distribution.
df['cosine_norm'] = similarities_norm
df['cosine_norm'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cosine_norm'] = similarities_norm


count    300.000000
mean       0.728392
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine_norm, dtype: float64

# Q4. Rouge

In [17]:
# Preview the first rows of the working frame before the ROUGE computations.
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [19]:
# Show the record at positional index 10 (document id '5170565b'); this is the
# answer pair that the following cell scores with ROUGE.
df.iloc[10]

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
Name: 10, dtype: object

In [21]:
from rouge import Rouge

# Score a single example pair: the record at positional index 10.
r = df.iloc[10]
rouge_scorer = Rouge()
# The LLM answer is passed first and the original answer second; [0] selects
# the score dict for this one pair (rouge-1 / rouge-2 / rouge-l, each holding
# r, p and f values).
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

# Q5. Average rouge score

In [22]:
# Mean of the three F-scores (rouge-1, rouge-2, rouge-l) of the example pair.
f_rouge_1, f_rouge_2, f_rouge_l = (
    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
)
avg = (f_rouge_1 + f_rouge_2 + f_rouge_l) / 3
avg

0.35490034990035496

# Q6. Average rouge score for all the data points

In [26]:
from tqdm.auto import tqdm

def compute_rogue_2(record):
    """Return the ROUGE-2 F-score of a record's LLM answer vs. its original.

    ``record`` must provide the keys ``'answer_orig'`` and ``'answer_llm'``.
    Scoring uses the module-level ``rouge_scorer``; the LLM answer is passed
    as the first argument and the original answer as the second.
    """
    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    # get_scores yields a list; the first entry holds the scores for this pair.
    pair_scores = rouge_scorer.get_scores(generated_text, reference_text)[0]
    return pair_scores['rouge-2']['f']


# Holds one ROUGE-2 F-score per record of results_gpt4o_mini.
rogue_2 = []

In [27]:
# Build the score list from scratch on every execution. Appending to a list
# created in a different cell meant that re-running this cell doubled its
# length, which then no longer matched the number of rows in `df`. The
# comprehension also avoids rebinding the global `r`, which an earlier cell
# uses for a DataFrame row.
rogue_2 = [compute_rogue_2(record) for record in tqdm(results_gpt4o_mini)]

  0%|          | 0/300 [00:00<?, ?it/s]

In [28]:
# Attach the per-record ROUGE-2 F-scores as a column and summarize them.
# NOTE(review): 'rogue_2' looks like a misspelling of 'rouge_2'; the column
# name is a runtime key, so it is left unchanged here.
df['rogue_2'] = rogue_2
df['rogue_2'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rogue_2'] = rogue_2


count    300.000000
mean       0.206965
std        0.153550
min        0.000000
25%        0.097809
50%        0.178671
75%        0.286181
max        0.739130
Name: rogue_2, dtype: float64