In [14]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [15]:
import requests 
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np
from rouge import Rouge

In [16]:


base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '04-monitoring/data/results-gpt4o-mini.csv'
docs_url = f'{base_url}/{relative_url}?raw=1'

df = pd.read_csv(docs_url)

# docs_response = requests.get(docs_url)
# documents = docs_response.json()

In [17]:
df = df.iloc[:300]
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [18]:
model_name='multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name)

llm = df.iloc[0].answer_llm
llm

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [23]:
encoded = embedding_model.encode(llm)


## Q1. Getting the embeddings model

Now, get the embeddings model `multi-qa-mpnet-base-dot-v1` from
[the Sentence Transformer library](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html#model-overview)

> Note: this is not the same model as in HW3

```bash
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name)
```

Create the embeddings for the first LLM answer:

```python
answer_llm = df.iloc[0].answer_llm
```


In [19]:
encoded[0]

-0.42244655

In [20]:
df.iloc[0]

answer_llm     You can sign up for the course by visiting the...
answer_orig    Machine Learning Zoomcamp FAQ\nThe purpose of ...
document                                                0227b872
question                     Where can I sign up for the course?
course                                 machine-learning-zoomcamp
Name: 0, dtype: object

In [21]:


evaluations=[]

# for elem in df.iterrows():
for _, elem in tqdm(df.iterrows(), total=len(df), desc="Processing answers"):
    answer_orig = elem['answer_orig'] 
    answer_llm = elem['answer_llm'] 
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)

    evaluations.append(v_llm.dot(v_orig))

evaluations

Processing answers:   0%|          | 1/300 [00:00<01:54,  2.61it/s]

Processing answers: 100%|██████████| 300/300 [02:22<00:00,  2.11it/s]


[17.515987,
 13.418402,
 25.313255,
 12.147415,
 18.747736,
 33.970406,
 30.251705,
 29.521576,
 35.272198,
 27.751772,
 32.34471,
 31.441843,
 36.38072,
 33.340504,
 30.606163,
 32.503044,
 29.67445,
 24.35346,
 20.13247,
 23.99548,
 30.88028,
 32.692436,
 30.049173,
 16.078167,
 31.79642,
 37.980003,
 20.839046,
 32.612865,
 38.894196,
 34.051826,
 28.263878,
 27.124832,
 23.975264,
 26.340149,
 18.658115,
 25.016403,
 21.101131,
 33.72679,
 29.340347,
 28.654505,
 29.608585,
 30.810738,
 33.331203,
 26.220482,
 26.550072,
 13.148597,
 12.962547,
 12.275609,
 9.9744425,
 10.883928,
 29.845068,
 32.36178,
 22.18718,
 30.268936,
 25.091877,
 32.742783,
 28.220984,
 27.274975,
 24.208645,
 22.568907,
 19.767456,
 18.679333,
 20.422321,
 22.051325,
 18.188013,
 28.455883,
 25.919708,
 23.33234,
 22.20594,
 28.296299,
 39.23055,
 36.758507,
 31.913895,
 31.202858,
 36.913048,
 30.514185,
 36.26145,
 27.397552,
 37.792786,
 23.29768,
 34.252586,
 34.550625,
 30.316462,
 35.703526,
 31.0125

## Q2. Computing the dot product

- Now for each answer pair, let's create embeddings and compute dot product between them
- We will put the results (scores) into the `evaluations` list
- What's the 75% percentile of the score?


In [24]:
# Calculate the 75th percentile
percentile_75 = np.percentile(evaluations, 75)

print(f"The 75th percentile of the scores is: {percentile_75:.4f}")

The 75th percentile of the scores is: 31.6743


In [22]:
def normalize(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

evaluations_norm=[]

# for elem in df.iterrows():
for _, elem in tqdm(df.iterrows(), total=len(df), desc="Processing answers"):
    answer_orig = elem['answer_orig'] 
    answer_llm = elem['answer_llm'] 
    v_llm = embedding_model.encode(answer_llm)
    v_llm_norm = normalize(v_llm)
    v_orig = embedding_model.encode(answer_orig)
    v_orig_norm =  normalize(v_orig)
    evaluations_norm.append(v_llm_norm.dot(v_orig_norm))

Processing answers: 100%|██████████| 300/300 [02:22<00:00,  2.11it/s]


In [25]:
evaluations_norm

[0.5067539,
 0.38854873,
 0.7185989,
 0.33726627,
 0.5217923,
 0.83053213,
 0.7462835,
 0.6944061,
 0.84688616,
 0.65590763,
 0.7779559,
 0.78356636,
 0.90468806,
 0.80630296,
 0.72759616,
 0.7751896,
 0.71516633,
 0.5890557,
 0.53322953,
 0.5857593,
 0.81232715,
 0.83714426,
 0.76611555,
 0.43333992,
 0.81558585,
 0.92667866,
 0.552616,
 0.7622108,
 0.9452982,
 0.8478371,
 0.7192839,
 0.6864791,
 0.6100939,
 0.64910805,
 0.48555,
 0.6549567,
 0.52971876,
 0.84890294,
 0.73956215,
 0.76096815,
 0.70153177,
 0.7140965,
 0.77817,
 0.6202106,
 0.62210196,
 0.33472955,
 0.3324926,
 0.31343076,
 0.25845352,
 0.27644622,
 0.77109647,
 0.89201,
 0.5712719,
 0.7779895,
 0.7033882,
 0.8988763,
 0.7822658,
 0.69761264,
 0.6318737,
 0.5829771,
 0.59635806,
 0.5221753,
 0.5993201,
 0.65132016,
 0.53131604,
 0.761606,
 0.6682948,
 0.6511333,
 0.66239053,
 0.75467545,
 0.89955723,
 0.87245953,
 0.75394404,
 0.7211681,
 0.8531313,
 0.74570763,
 0.85769904,
 0.6625385,
 0.91524327,
 0.55959284,
 0.827

## Q3. Computing the cosine

From Q2, we can see that the results are not within the [0, 1] range. It's because the vectors coming from this model are not normalized.

So we need to normalize them.

To do it, we 

* Compute the norm of a vector
* Divide each element by this norm

So, for vector `v`, it'll be `v / ||v||`

In numpy, this is how you do it:

```python
norm = np.sqrt((v * v).sum())
v_norm = v / norm
```

Let's put it into a function and then compute dot product 
between normalized vectors. This will give us cosine similarity

What's the 75% cosine in the scores?


In [26]:

percentile_75_norm = np.percentile(evaluations_norm, 75)

print(f"The 75th percentile of the scores is: {percentile_75_norm:.4f}")

The 75th percentile of the scores is: 0.8362


In [27]:

rouge_scorer = Rouge()


# try one

# Get the specific row from the dataframe
r = df.loc[df['document'] == '5170565b'].iloc[0]

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q4. Rouge

Now we will explore an alternative metric - the ROUGE score.  

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.

It can give a more nuanced view of text similarity than just cosine similarity alone.

We don't need to implement it ourselves, there's a python package for it:

```bash
pip install rouge
```

(The latest version at the moment of writing is `1.0.1`)

Let's compute the ROUGE score between the answers at the index 10 of our dataframe (`doc_id=5170565b`)

```
from rouge import Rouge
rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
```

There are three scores: `rouge-1`, `rouge-2` and `rouge-l`, and precision, recall and F1 score for each.

* `rouge-1` - the overlap of unigrams,
* `rouge-2` - bigrams,
* `rouge-l` - the longest common subsequence

What's the F score for `rouge-1`?

In [28]:
scores['rouge-1']['f']

0.45454544954545456


## Q5. Average rouge score

Let's compute the average F-score between `rouge-1`, `rouge-2` and `rouge-l` for the same record from Q4

In [31]:
# Extract F-scores for ROUGE-1, ROUGE-2, and ROUGE-L
f_score_1 = scores['rouge-1']['f']
f_score_2 = scores['rouge-2']['f']
f_score_l = scores['rouge-l']['f']

# Compute the average F-score
average_f_score = (f_score_1 + f_score_2 + f_score_l) / 3

print(f"F-scores:")
print(f"  ROUGE-1: {f_score_1:.4f}")
print(f"  ROUGE-2: {f_score_2:.4f}")
print(f"  ROUGE-L: {f_score_l:.4f}")
print(f"\nAverage F-score: {average_f_score:.4f}")

F-scores:
  ROUGE-1: 0.1795
  ROUGE-2: 0.0235
  ROUGE-L: 0.1538

Average F-score: 0.1190


## Q6. Average rouge score for all the data points

Now let's compute the F-score for all the records and create a dataframe from them.

What's the average F-score in `rouge_2` across all the records?

In [29]:
# Prepare lists to store the scores
rouge_scores = []

# Compute ROUGE scores for each record
for _, row in tqdm(df.iterrows(), total=len(df), desc="Computing ROUGE scores"):
    scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    
    rouge_scores.append({
        'document': row['document'],
        'rouge_1_f': scores['rouge-1']['f'],
        'rouge_2_f': scores['rouge-2']['f'],
        'rouge_l_f': scores['rouge-l']['f']
    })

# Create a new dataframe with the scores
rouge_df = pd.DataFrame(rouge_scores)

# Calculate the average ROUGE-2 score
average_rouge_2 = rouge_df['rouge_2_f'].mean()

print(f"The average ROUGE-2 F-score across all records is: {average_rouge_2:.4f}")

# Display the first few rows of the new dataframe
print("\nFirst few rows of the ROUGE scores dataframe:")
print(rouge_df.head())

Computing ROUGE scores:  35%|███▍      | 104/300 [00:00<00:00, 420.17it/s]

Computing ROUGE scores: 100%|██████████| 300/300 [00:01<00:00, 298.81it/s]

The average ROUGE-2 F-score across all records is: 0.2070

First few rows of the ROUGE scores dataframe:
   document  rouge_1_f  rouge_2_f  rouge_l_f
0  0227b872   0.095238   0.028169   0.095238
1  0227b872   0.125000   0.055556   0.093750
2  0227b872   0.415584   0.177778   0.389610
3  0227b872   0.216216   0.047059   0.189189
4  0227b872   0.142076   0.033898   0.120219





In [30]:
average_rouge_2

0.20696501983423318