In [1]:
import pandas as pd 
from sentence_transformers import SentenceTransformer
import numpy as np 
from rouge import Rouge

# Source data: LLM-generated answers alongside the original answers.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'

# Appending ?raw=1 to the GitHub "blob" URL requests the raw CSV
# rather than the HTML page.
url = f'{github_url}?raw=1'

# Only the first N_ROWS records are evaluated.
N_ROWS = 300

# .copy() gives `df` its own data: later cells assign new columns to `df`,
# and an explicit copy keeps those assignments independent of the full frame.
df = pd.read_csv(url).iloc[:N_ROWS].copy()

# Sentence-embedding model used for every similarity computation below.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Q1: first component of the embedding of the first LLM answer.
first_record = df.iloc[0]
answer_llm = first_record['answer_llm']
answer_llm_embedding = embedding_model.encode(answer_llm)
first_component = answer_llm_embedding[0]
print(f"Q1: {first_component}")

Q1: -0.4224465489387512


In [3]:
# Q2
# Dot-product similarity between each LLM answer and its original answer.
#
# encode() is given the whole column as a list, so the model runs in batches
# instead of being invoked twice per row.
# NOTE(review): batched inference may change the trailing float32 digits
# compared with encoding one sentence at a time.
orig_embeddings = embedding_model.encode(df['answer_orig'].tolist())
llm_embeddings = embedding_model.encode(df['answer_llm'].tolist())

# Row-wise dot product: multiply element-wise, then sum over the embedding axis.
dot_similarities = (llm_embeddings * orig_embeddings).sum(axis=1)

print(f"Q2: {np.percentile(dot_similarities, 75)}")

Q2: 31.67430877685547


In [4]:
# Q3
# Cosine similarity between each LLM answer and its original answer:
# with normalize_embeddings=True the vectors have unit length, so their
# dot product is the cosine of the angle between them.
#
# Both columns are encoded as lists so the model runs in batches instead of
# being invoked twice per row.
# NOTE(review): batched inference may change the trailing float32 digits
# compared with encoding one sentence at a time.
orig_unit_embeddings = embedding_model.encode(
    df['answer_orig'].tolist(), normalize_embeddings=True
)
llm_unit_embeddings = embedding_model.encode(
    df['answer_llm'].tolist(), normalize_embeddings=True
)

# Row-wise dot product: multiply element-wise, then sum over the embedding axis.
# The (misspelled) global name is kept so any later use keeps working.
evalutions = pd.Series((llm_unit_embeddings * orig_unit_embeddings).sum(axis=1))
print(f"Q3: {evalutions.quantile(0.75)}")

Q3: 0.8362348824739456


In [5]:
# Q4
# ROUGE-1 F-score of the LLM answer against the original answer
# for the record at position 10.
rouge_scorer = Rouge()
df_d = df.iloc[10]
hypothesis = df_d['answer_llm']
reference = df_d['answer_orig']
# get_scores returns a list; its first element holds the score dict
# for this hypothesis/reference pair.
scores = rouge_scorer.get_scores(hypothesis, reference)[0]
rouge_1_metrics = scores['rouge-1']
rouge_1_f_score = rouge_1_metrics['f']
print(f"Q4: {rouge_1_f_score}")


Q4: 0.45454544954545456


In [6]:
# Q5
# Average of the ROUGE-1, ROUGE-2 and ROUGE-L F-scores for the record in `df_d`.
#
# The scores are recomputed from `df_d` here instead of being read from the
# global `scores`, so the result does not depend on whatever another cell
# last left in that variable (re-running this cell is then always safe).
q5_scores = rouge_scorer.get_scores(df_d.answer_llm, df_d.answer_orig)[0]
rouge_1 = q5_scores['rouge-1']['f']
rouge_2 = q5_scores['rouge-2']['f']
rouge_l = q5_scores['rouge-l']['f']
average_rouge = (rouge_1 + rouge_2 + rouge_l) / 3
print(f"Q5: {average_rouge}")

Q5: 0.35490034990035496


In [7]:
# Q6
# Compute ROUGE-1 / ROUGE-2 / ROUGE-L F-scores (and their mean) for every
# record, store them as new columns on `df`, and report the mean ROUGE-2.
rouge_scores = []
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []

# Loop temporaries carry a `row_` prefix so they do not overwrite the
# single-record globals (`scores`, `rouge_1`, `rouge_2`, `rouge_l`) that
# other cells read and print.
for llm_text, orig_text in zip(df['answer_llm'], df['answer_orig']):
    row_scores = rouge_scorer.get_scores(llm_text, orig_text)[0]
    row_rouge_1 = row_scores['rouge-1']['f']
    row_rouge_2 = row_scores['rouge-2']['f']
    row_rouge_l = row_scores['rouge-l']['f']

    # Collect the per-record scores; they become columns after the loop.
    rouge_scores.append((row_rouge_1 + row_rouge_2 + row_rouge_l) / 3)
    rouge_1_scores.append(row_rouge_1)
    rouge_2_scores.append(row_rouge_2)
    rouge_l_scores.append(row_rouge_l)

# The lists are in row order, so they line up with `df` positionally.
df['rouge_avg'] = rouge_scores
df['rouge_1'] = rouge_1_scores
df['rouge_2'] = rouge_2_scores
df['rouge_l'] = rouge_l_scores
print(f"Q6 {df['rouge_2'].mean()}")

Q6 0.20696501983423318
