# BERTScore evaluation metric

## Installing and importing the libraries

In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.0.0->bert_score)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3

In [None]:
import pandas as pd
from bert_score import score, BERTScorer

## Loading dataset

In [None]:
# Load the question/answer table from a CSV in the working directory.
# The scoring cells below read the 'reference_answer', 'chat_gpt_answer',
# 'google_gemini_answer' and 'fine_tuned_model_answer' columns from it.
data = pd.read_csv('q_and_a.csv')

## Calculating scores for each model

In [None]:
# Build a single BERTScorer for English text, shared by every scoring call
# below so the underlying model is only constructed once (the cell output
# reports a roberta-large checkpoint being loaded).
# rescale_with_baseline=True and use_fast_tokenizer=True are passed straight
# through to bert_score; no argument here configures caching.
scorer = BERTScorer(lang="en", rescale_with_baseline=True, use_fast_tokenizer=True)

def append_bertscores(data, model_column, ref_column='reference_answer'):
    """Score one model's answers against the references, row by row.

    Every row is scored on its own with the module-level ``scorer``. The
    results are written to three new columns of ``data``:
    ``<model_column>_precision``, ``<model_column>_recall`` and
    ``<model_column>_f1``.

    Args:
        data: DataFrame holding both answer columns. It is modified in place.
        model_column: Name of the column with the model's answers (the
            candidates being evaluated).
        ref_column: Name of the column with the reference answers.

    Returns:
        None. The scores are only available through the added columns.

    Any exception raised while scoring a row is reported on stdout and that
    row receives ``None`` for all three scores, so one bad row does not abort
    the whole run.
    """
    precisions = []
    recalls = []
    f1_scores = []

    for index, row in data.iterrows():
        try:
            # BERTScorer.score takes (candidates, references) in that order.
            # Passing the reference first would swap the reported precision
            # and recall, so the model answer must be the first argument.
            P, R, F1 = scorer.score([row[model_column]], [row[ref_column]])
            precisions.append(P.item())
            recalls.append(R.item())
            f1_scores.append(F1.item())
        except Exception as e:
            # Best-effort: keep the three lists aligned with the rows.
            print(f"Error processing row {index}: {e}")
            precisions.append(None)
            recalls.append(None)
            f1_scores.append(None)

    data[f'{model_column}_precision'] = precisions
    data[f'{model_column}_recall'] = recalls
    data[f'{model_column}_f1'] = f1_scores

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Score each model's answer column in turn; every call adds that model's
# precision, recall and F1 columns to `data`.
for answer_column in ('chat_gpt_answer', 'google_gemini_answer', 'fine_tuned_model_answer'):
    append_bertscores(data, answer_column)

## Saving and Downloading score values

In [None]:
# Save the extended dataframe (original columns plus the score columns added
# above) to a new CSV file. index=False leaves the DataFrame index out of it.
data.to_csv('output_file_1.csv', index=False)

In [None]:
# Colab-specific step: `google.colab` is presumably only importable inside a
# Google Colab runtime, so this cell is expected to fail elsewhere.
# Hands the CSV written by the previous cell to Colab's file-download helper.
from google.colab import files
files.download('output_file_1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>