In [23]:
import numpy as np
import pandas as pd

In [5]:
# CSV file (results-gpt4o-mini.csv) hosted under the DataTalksClub/llm-zoomcamp repository.
github_url = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/04-monitoring/data/results-gpt4o-mini.csv"

# Append the "raw=1" query parameter, then let pandas fetch and parse the file.
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [6]:
# Keep only the first 300 rows for the rest of the notebook.
df = df.head(300)

In [8]:
from sentence_transformers import SentenceTransformer

# Identifier of the sentence-transformers model used for every embedding below.
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# Take the LLM answer from the first row and embed it.
answer_llm = df["answer_llm"].iloc[0]
first_encode = embedding_model.encode(answer_llm)

In [15]:
# First component of the embedding, rounded to two decimals.
first_value = first_encode[0]
round(first_value, 2)

-0.42

## Q1. Getting the embeddings model
- What's the first value of the resulting vector? **-0.42**

In [20]:
answer_llm_encode = []
answer_orig_encode = []
evaluations = []

# Walk the two answer columns in lockstep: embed both texts of a row,
# keep the vectors, and score the pair with a plain dot product.
for llm_text, orig_text in zip(df["answer_llm"], df["answer_orig"]):
    llm_vec = embedding_model.encode(llm_text)
    orig_vec = embedding_model.encode(orig_text)

    answer_llm_encode.append(llm_vec)
    answer_orig_encode.append(orig_vec)
    evaluations.append(llm_vec.dot(orig_vec))
    

In [28]:
# 75th percentile of the dot-product scores, rounded to two decimals.
dot_p75 = np.percentile(evaluations, 75)
round(dot_p75, 2)

31.67

## Q2. Computing the dot product
- What's the 75th percentile of the score? **31.67**

In [29]:
def norm_vec(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array-like
        Numeric vector. Lists and tuples are accepted and converted with
        ``np.asarray``; ndarrays are used as-is.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. When the norm is exactly zero the
        result is an all-zero vector of the same shape (instead of the NaNs
        that a 0/0 division would produce).
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())

    if norm == 0:
        # A zero vector has no direction; dividing would give 0/0 -> NaN and a
        # RuntimeWarning. Return zeros in the dtype the division would yield.
        return np.zeros(v.shape, dtype=np.result_type(v.dtype, norm.dtype))

    return v / norm

# Normalise every stored embedding with norm_vec.
answer_llm_norm = list(map(norm_vec, answer_llm_encode))
answer_orig_norm = list(map(norm_vec, answer_orig_encode))

# Dot product of the normalised pair at each row position of df.
evaluations_norm = [
    answer_llm_norm[idx].dot(answer_orig_norm[idx])
    for idx in range(len(df))
]

In [32]:
# 75th percentile of the normalised (cosine) scores, rounded to three decimals.
cosine_p75 = np.percentile(evaluations_norm, 75)
round(cosine_p75, 3)

0.836

## Q3. Computing the cosine
- What's the 75th percentile of the cosine scores? **0.83** (computed: 0.836)

In [34]:
from rouge import Rouge

rouge_scorer = Rouge()

# Score the record at position 10: LLM answer against the original answer.
r = df.iloc[10]
record_scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)

# get_scores returns a list; keep its first element.
scores = record_scores[0]

In [35]:
# Display the ROUGE result held in `scores` (rouge-1 / rouge-2 / rouge-l, each with r, p, f).
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q4. Rouge
- What's the F score for rouge-1? **0.45**

In [42]:
# Collect the 'f' value of every ROUGE variant present in `scores`.
f1_scores = [metrics['f'] for metrics in scores.values()]

# Average of those F-scores, rounded to three decimals.
round(np.average(f1_scores), 3)

0.355

## Q5. Average rouge score
- What's the average F-score across rouge-1, rouge-2 and rouge-l for the same record from Q4? **0.35** (computed: 0.355)

In [43]:
# One ROUGE result per row: LLM answer scored against the original answer.
rouge_scores = [
    rouge_scorer.get_scores(llm_text, orig_text)[0]
    for llm_text, orig_text in zip(df["answer_llm"], df["answer_orig"])
]

In [52]:
# Flatten each nested ROUGE result into one flat row whose keys are
# "<rouge variant>_<metric>", e.g. "rouge-2_f".
rows = [
    {
        f'{rouge_key}_{metric_key}': value
        for rouge_key, metrics in item.items()
        for metric_key, value in metrics.items()
    }
    for item in rouge_scores
]

# Build the table in one go from the list of row dicts.
rouge_df = pd.DataFrame(rows)

# Preview the first rows.
rouge_df.head()

Unnamed: 0,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.061224,0.214286,0.095238,0.017544,0.071429,0.028169,0.061224,0.214286,0.095238
1,0.081633,0.266667,0.125,0.035088,0.133333,0.055556,0.061224,0.2,0.09375
2,0.326531,0.571429,0.415584,0.140351,0.242424,0.177778,0.306122,0.535714,0.38961
3,0.163265,0.32,0.216216,0.035088,0.071429,0.047059,0.142857,0.28,0.189189
4,0.265306,0.097015,0.142076,0.070175,0.022346,0.033898,0.22449,0.08209,0.120219


In [55]:
# Mean rouge-2 F-score over all rows, rounded to three decimals.
rouge2_f_mean = rouge_df["rouge-2_f"].mean()
round(rouge2_f_mean, 3)

0.207

## Q6. Average rouge score for all the data points

- What's the average rouge-2 F-score across all the records? **0.20** (computed: 0.207)