### Overview
In this notebook, we will look at the average helpfulness reward across different models

In [2]:
import os
files = os.listdir("./data/")
all_files = []
for ele in files:
    if 'csv' in ele:
        all_files.append(ele)

In [10]:
all_files

['gpt2_large_0_safety.csv',
 'gpt2_large_1000_safety.csv',
 'gpt2_large_100_safety.csv',
 'gpt2_large_1500_safety.csv',
 'gpt2_large_2000_safety.csv',
 'gpt2_large_300_safety.csv',
 'gpt2_large_500_safety.csv']

In [12]:
import pandas as pd
for num in [0, 100, 300, 500, 1000, 1500, 2000]:
    file = f"gpt2_large_{num}_safety.csv"
    df = pd.read_csv("./data/" + file)
    print(file)
    print(df['gpt2_large_reward'].describe()['50%'])
    print(len(df))
    print("\n\n")

gpt2_large_0_safety.csv
0.837890625
952



gpt2_large_100_safety.csv
0.884765625
952



gpt2_large_300_safety.csv
0.88671875
952



gpt2_large_500_safety.csv
0.859375
952



gpt2_large_1000_safety.csv
0.8671875
952



gpt2_large_1500_safety.csv
0.8984375
952



gpt2_large_2000_safety.csv
0.94140625
952





### Autometrics

In [1]:
# automated metrics
import torch
from sacrebleu import corpus_bleu
import nltk
from nltk.translate.bleu_score import sentence_bleu
from bert_score import BERTScorer
from bert_score import score as bert_score
from bleurt import score
from rouge_score import rouge_scorer
from comet import download_model, load_from_checkpoint
from bert_score import BERTScorer

# BART Score
import sys
sys.path.append("/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Evaluation_Metrics/MedBART/BARTScore")
# from bart_score import BARTScorer
# dd = "/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/Evaluation_Metrics/MedBART/BARTScore/"
# # bart_score.pth
# bart_scorer = BARTScorer(device='cuda:0', checkpoint="facebook/bart-large-cnn")
# bart_scorer.load(path= dd + 'bart_score.pth')

bert_score = BERTScorer(model_type="microsoft/deberta-xlarge-mnli", lang="en", rescale_with_baseline=True, device='cuda')

2024-05-05 16:15:18.914824: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  return self.fget.__get__(instance, owner)()


In [2]:
class Evaluate:
    def __init__(self):
        self.scorer = None
        
class ScoreBleu(Evaluate):
    def __init__(self):
        super().__init__()
        self.scorer = sentence_bleu
        
    def score(self, ref, output):
        score = self.scorer([ref.split()], output.split())
        return score
    
class ScoreRouge(Evaluate):
    def __init__(self):
        super().__init__()
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        
    def score(self, ref, output):
        rouge_scores = self.scorer.score(ref, output)
        rouge1 = rouge_scores['rouge1'].fmeasure
        rouge2 = rouge_scores['rouge2'].fmeasure
        rougeL = rouge_scores['rougeL'].fmeasure
        return rouge1, rouge2, rougeL

In [3]:
def get_automated_metrics(input_data):
    prompts = input_data['instructions'].tolist()
    refs = input_data['golden_response'].to_list()
    outputs = input_data['outputs'].to_list()

    res = {}

    e = ScoreBleu()
    scores = [e.score(x,y) for x,y in zip(refs,outputs)]
    bleu  = sum(scores)/len(scores)

    res['bleu'] = bleu
    print("BLUE Score : ", bleu)
    # print("Bleu done")
    
    e = ScoreRouge()
    scores = [e.score(x,y) for x,y in zip(refs,outputs)]
    r1 = [x[0] for x in scores]
    r2 = [x[1] for x in scores]
    rl = [x[2] for x in scores]

    res['rouge-1'] = sum(r1)/len(r1)
    res['rouge-2'] = sum(r2)/len(r2)
    res['rouge-l'] = sum(rl)/len(rl)
    
    print("Rouge-1 : ", res['rouge-1'])
    print("Rouge-2 : ", res['rouge-2'])
    print("Rouge-L : ", res['rouge-l'])
    
    
    p,r,f = bert_score.score(refs, outputs)
    torch.cuda.empty_cache()
    res['bert_score'] = f.mean().item()
    print("BERT Score : ", f.mean().item())
    # print("Bert score done")
    
    return res

    # # bart score
    # bart = bart_scorer.score(refs, outputs, batch_size=4)
    # torch.cuda.empty_cache()
    # res['bart'] = sum(bart)/len(bart)
    
    # # save it
    # # Open the file in write mode
    # with open(self.output_dir + 'autometrics_eval.json', "w") as json_file:
    #     # Write the data to the file
    #     json.dump(res, json_file)
    # print("Saved at ", self.output_dir + 'autometrics_eval.json')

In [4]:
import json
import pandas as pd

with open("./data/alpaca_test.json") as f:
    test_data = json.load(f)
test_data = pd.DataFrame(test_data)

all_metrics = []
for num in [0, 100, 300, 500, 1000, 1500, 2000]:
    file = f"./data/responses_alpaca_test_{num}_safe.json"
    print(file)
    with open(file) as f:
        data = json.load(f)
    del data['parameters']
    data = pd.DataFrame(data)
    data['golden_response'] = data['instructions'].map(dict(zip(test_data['instructions'], test_data['outputs']))) 
    curr_res = get_automated_metrics(data)
    all_metrics.append(curr_res)
    print(curr_res)
    print("\n\n")

./data/responses_alpaca_test_0_safe.json


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.


BLUE Score :  0.03832360988694305
Rouge-1 :  0.33417670897998064
Rouge-2 :  0.12786351632409734
Rouge-L :  0.22673123381204507
BERT Score :  0.19510111212730408
{'bleu': 0.03832360988694305, 'rouge-1': 0.33417670897998064, 'rouge-2': 0.12786351632409734, 'rouge-l': 0.22673123381204507, 'bert_score': 0.19510111212730408}



./data/responses_alpaca_test_100_safe.json


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.


BLUE Score :  0.03681847235814658
Rouge-1 :  0.33399165110635654
Rouge-2 :  0.1273945083881824
Rouge-L :  0.22615412735870588
BERT Score :  0.19357430934906006
{'bleu': 0.03681847235814658, 'rouge-1': 0.33399165110635654, 'rouge-2': 0.1273945083881824, 'rouge-l': 0.22615412735870588, 'bert_score': 0.19357430934906006}



./data/responses_alpaca_test_300_safe.json


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.


BLUE Score :  0.03712921352207869
Rouge-1 :  0.33579427936565515
Rouge-2 :  0.1281203126892478
Rouge-L :  0.2275912230315672
BERT Score :  0.19800803065299988
{'bleu': 0.03712921352207869, 'rouge-1': 0.33579427936565515, 'rouge-2': 0.1281203126892478, 'rouge-l': 0.2275912230315672, 'bert_score': 0.19800803065299988}



./data/responses_alpaca_test_500_safe.json


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.


BLUE Score :  0.03586079439225947
Rouge-1 :  0.33164004682061704
Rouge-2 :  0.12593505093426155
Rouge-L :  0.2244506734314742
BERT Score :  0.1927282214164734
{'bleu': 0.03586079439225947, 'rouge-1': 0.33164004682061704, 'rouge-2': 0.12593505093426155, 'rouge-l': 0.2244506734314742, 'bert_score': 0.1927282214164734}



./data/responses_alpaca_test_1000_safe.json


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.


BLUE Score :  0.03691406493249383
Rouge-1 :  0.3330483300848055
Rouge-2 :  0.12758020789452273
Rouge-L :  0.2236626425215876
BERT Score :  0.19175806641578674
{'bleu': 0.03691406493249383, 'rouge-1': 0.3330483300848055, 'rouge-2': 0.12758020789452273, 'rouge-l': 0.2236626425215876, 'bert_score': 0.19175806641578674}



./data/responses_alpaca_test_1500_safe.json


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.


BLUE Score :  0.0365300183379189
Rouge-1 :  0.33405830482843824
Rouge-2 :  0.12706444057292765
Rouge-L :  0.2253591064876454
BERT Score :  0.19825227558612823
{'bleu': 0.0365300183379189, 'rouge-1': 0.33405830482843824, 'rouge-2': 0.12706444057292765, 'rouge-l': 0.2253591064876454, 'bert_score': 0.19825227558612823}



./data/responses_alpaca_test_2000_safe.json


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Using default tokenizer.


BLUE Score :  0.0350018649537188
Rouge-1 :  0.33204666644759817
Rouge-2 :  0.1240970516903776
Rouge-L :  0.22274105318560322
BERT Score :  0.1923235058784485
{'bleu': 0.0350018649537188, 'rouge-1': 0.33204666644759817, 'rouge-2': 0.1240970516903776, 'rouge-l': 0.22274105318560322, 'bert_score': 0.1923235058784485}



