In [2]:
from rouge_score import rouge_scorer
from tqdm import tqdm
import json
import bert_score

In [7]:
# Loading the summaries
token_size = "2k" # or 8k

good_summaries = json.load(open(f"summaries/{token_size}_good_summarized_wikipedia_articles.json"))
bad_summaries = json.load(open(f"summaries/{token_size}_bad_summarized_wikipedia_articles.json"))
dpo_summaries = json.load(open(f"summaries/{token_size}_finetuned_dpo_summarized_wikipedia_articles.json"))

if token_size == "2k":
    sft_summaries = json.load(open(f"summaries/{token_size}_finetuned_sft_summarized_wikipedia_articles.json"))

# Bert Score

### Computing the scores and saving them in a file

In [8]:
def compute_bertscore_fast(reference_summary, generated_summary, model_type="distilbert-base-uncased"):
    P, R, F1 = bert_score.score([generated_summary], [reference_summary], model_type=model_type)
    return {"Precision": P.item(), "Recall": R.item(), "F1-score": F1.item()}

good_scores = []
bad_scores = []
dpo_scores = []
sft_scores = []

for i in tqdm(range(len(good_summaries))):
    good_scores.append(compute_bertscore_fast(good_summaries[i], good_summaries[i]))
    bad_scores.append(compute_bertscore_fast(good_summaries[i], bad_summaries[i]))
    dpo_scores.append(compute_bertscore_fast(good_summaries[i], dpo_summaries[i]))
    if token_size == "2k":
        sft_scores.append(compute_bertscore_fast(good_summaries[i], sft_summaries[i]))

# Saving the scores
json.dump(good_scores, open(f"bert_scores/{token_size}_good_scores.json", "w"))
json.dump(bad_scores, open(f"bert_scores/{token_size}_bad_scores.json", "w"))
json.dump(dpo_scores, open(f"bert_scores/{token_size}_dpo_scores.json", "w"))
if token_size == "2k":
    json.dump(sft_scores, open(f"bert_scores/{token_size}_sft_scores.json", "w"))

### Scores for 8k tokens documents

In [13]:
good_scores = json.load(open('bert_scores/good_scores_8k.json', 'r'))
bad_scores = json.load(open('bert_scores/bad_scores_8k.json', 'r'))
dpo_scores = json.load(open('bert_scores/dpo_scores_8k.json', 'r'))

train_size = 0.8

In [16]:
# Compute the average F1-score for each model on the training set
good_scores_train = good_scores[:int(train_size * len(good_scores))]
bad_scores_train = bad_scores[:int(train_size * len(good_scores))]
dpo_scores_train = dpo_scores[:int(train_size * len(good_scores))]

F1_good_scores_train = [score['F1-score'] for score in good_scores_train]
F1_bad_scores_train = [score['F1-score'] for score in bad_scores_train]
F1_dpo_scores_train = [score['F1-score'] for score in dpo_scores_train]

print('F1-scores on the training set:')
print("--------------------")
print('Good scores mean:', sum(F1_good_scores_train) / len(F1_good_scores_train))
print('\nBad scores mean:', sum(F1_bad_scores_train) / len(F1_bad_scores_train))
print("\nDPO scores mean:", sum(F1_dpo_scores_train) / len(F1_dpo_scores_train))
print("\n")
# Compute the average F1-score for each model on the test set
good_scores_test = good_scores[int(train_size * len(good_scores)):]
bad_scores_test = bad_scores[int(train_size * len(good_scores)):]
dpo_scores_test = dpo_scores[int(train_size * len(good_scores)):]

F1_good_scores_test = [score['F1-score'] for score in good_scores_test]
F1_bad_scores_test = [score['F1-score'] for score in bad_scores_test]
F1_dpo_scores_test = [score['F1-score'] for score in dpo_scores_test]

print('F1-scores on the test set:')
print("--------------------")
print('Good scores mean:', sum(F1_good_scores_test) / len(F1_good_scores_test))
print('\nBad scores mean:', sum(F1_bad_scores_test) / len(F1_bad_scores_test))
print("\nDPO scores mean:", sum(F1_dpo_scores_test) / len(F1_dpo_scores_test))

F1-scores on the training set:
--------------------
Good scores mean: 0.6822357793154603

Bad scores mean: 0.6427095072441382

DPO scores mean: 0.7699586713280757


F1-scores on the test set:
--------------------
Good scores mean: 0.6810821079663214

Bad scores mean: 0.6411158994067139

DPO scores mean: 0.770066902359125


### Scores for 2k tokens documents

In [18]:
import json

good_scores = json.load(open('bert_scores/good_scores_2k.json', 'r'))
bad_scores = json.load(open('bert_scores/bad_scores_2k.json', 'r'))
sft_scores = json.load(open('bert_scores/sft_scores_2k.json', 'r'))
dpo_scores = json.load(open('bert_scores/dpo_scores_2k.json', 'r'))

In [20]:
# Compute the average F1-score for each model on the training set
good_scores_train = good_scores[:int(train_size * len(good_scores))]
bad_scores_train = bad_scores[:int(train_size * len(good_scores))]
dpo_scores_train = dpo_scores[:int(train_size * len(good_scores))]
sft_scores_train = sft_scores[:int(train_size * len(good_scores))]

F1_good_scores_train = [score['F1-score'] for score in good_scores_train]
F1_bad_scores_train = [score['F1-score'] for score in bad_scores_train]
F1_dpo_scores_train = [score['F1-score'] for score in dpo_scores_train]
F1_sft_scores_train = [score['F1-score'] for score in sft_scores_train]

print('F1-scores on the training set:')
print("--------------------")
print('Good scores mean:', sum(F1_good_scores_train) / len(F1_good_scores_train))
print('\nBad scores mean:', sum(F1_bad_scores_train) / len(F1_bad_scores_train))
print("\nDPO scores mean:", sum(F1_dpo_scores_train) / len(F1_dpo_scores_train))
print("\nSFT scores mean:", sum(F1_sft_scores_train) / len(F1_sft_scores_train))
print("\n")
# Compute the average F1-score for each model on the test set
good_scores_test = good_scores[int(train_size * len(good_scores)):]
bad_scores_test = bad_scores[int(train_size * len(good_scores)):]
dpo_scores_test = dpo_scores[int(train_size * len(good_scores)):]
sft_scores_test = sft_scores[int(train_size * len(good_scores)):]

F1_good_scores_test = [score['F1-score'] for score in good_scores_test]
F1_bad_scores_test = [score['F1-score'] for score in bad_scores_test]
F1_dpo_scores_test = [score['F1-score'] for score in dpo_scores_test]
F1_sft_scores_test = [score['F1-score'] for score in sft_scores_test]

print('F1-scores on the test set:')
print("--------------------")
print('Good scores mean:', sum(F1_good_scores_test) / len(F1_good_scores_test))
print('\nBad scores mean:', sum(F1_bad_scores_test) / len(F1_bad_scores_test))
print("\nDPO scores mean:", sum(F1_dpo_scores_test) / len(F1_dpo_scores_test))
print("\nSFT scores mean:", sum(F1_sft_scores_test) / len(F1_sft_scores_test))

F1-scores on the training set:
--------------------
Good scores mean: 0.8148783289185924

Bad scores mean: 0.7971492961424166

DPO scores mean: 0.7965072462694687

SFT scores mean: 0.8304813184599945


F1-scores on the test set:
--------------------
Good scores mean: 0.8128086552023888

Bad scores mean: 0.7985009086749902

DPO scores mean: 0.7946119556909564

SFT scores mean: 0.8290142194506084


# Rouge Score

In [6]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
good_scores = []
bad_scores = []
dpo_scores = []
sft_scores = []

for i in tqdm(range(len(good_summaries))):
    good_scores.append(scorer.score(good_summaries[i], good_summaries[i]))
    bad_scores.append(scorer.score(good_summaries[i], bad_summaries[i]))
    dpo_scores.append(scorer.score(good_summaries[i], dpo_summaries[i]))
    if token_size == "2k":
        sft_scores.append(scorer.score(good_summaries[i], sft_summaries[i]))

# Saving the scores
json.dump(good_scores, open(f"rouge_scores/{token_size}_good_scores.json", "w"))
json.dump(bad_scores, open(f"rouge_scores/{token_size}_bad_scores.json", "w"))
json.dump(dpo_scores, open(f"rouge_scores/{token_size}_dpo_scores.json", "w"))
if token_size == "2k":
    json.dump(sft_scores, open(f"rouge_scores/{token_size}_sft_scores.json", "w"))


100%|██████████| 6498/6498 [00:52<00:00, 124.40it/s]


### Scores for 8k tokens documents

In [21]:
good_scores = json.load(open('rouge_scores/good_scores_8k.json', 'r'))
bad_scores = json.load(open('rouge_scores/bad_scores_8k.json', 'r'))
dpo_scores = json.load(open('rouge_scores/dpo_scores_8k.json', 'r'))

train_size = 0.8

Rouge-1 F1 Scores

In [25]:
rouge1_good_scores = [score['rouge1'][0] for score in good_scores]
rouge1_bad_scores = [score['rouge1'][0] for score in bad_scores]
rouge1_dpo_scores = [score['rouge1'][0] for score in dpo_scores]
print('ROUGE-1 scores:')
print("Training scores:")
print("--------------------")
print('Good scores mean:', sum(rouge1_good_scores[:int(train_size * len(rouge1_good_scores))]) / len(rouge1_good_scores[:int(train_size * len(rouge1_good_scores))]))
print('\nBad scores mean:', sum(rouge1_bad_scores[:int(train_size * len(rouge1_bad_scores))]) / len(rouge1_bad_scores[:int(train_size * len(rouge1_bad_scores))]))
print("\nDPO scores mean:", sum(rouge1_dpo_scores[:int(train_size * len(rouge1_dpo_scores))]) / len(rouge1_dpo_scores[:int(train_size * len(rouge1_dpo_scores))]))
print("--------------------")
print("Test scores:")
print("--------------------")
print('Good scores mean:', sum(rouge1_good_scores[int(train_size * len(rouge1_good_scores)):]) / len(rouge1_good_scores[int(train_size * len(rouge1_good_scores)):]))
print('\nBad scores mean:', sum(rouge1_bad_scores[int(train_size * len(rouge1_bad_scores)):]) / len(rouge1_bad_scores[int(train_size * len(rouge1_bad_scores)):]))
print("\nDPO scores mean:", sum(rouge1_dpo_scores[int(train_size * len(rouge1_dpo_scores)):]) / len(rouge1_dpo_scores[int(train_size * len(rouge1_dpo_scores)):]))
print("--------------------")


ROUGE-1 scores:
Training scores:
--------------------
Good scores mean: 0.7664347888103062

Bad scores mean: 0.6004878160512062

DPO scores mean: 0.5845655220444853
--------------------
Test scores:
--------------------
Good scores mean: 0.7686514268790634

Bad scores mean: 0.5966565224470679

DPO scores mean: 0.5940162064000093
--------------------


Rouge-2 F1 Scores

In [26]:
rouge2_good_scores = [score['rouge2'][0] for score in good_scores]
rouge2_bad_scores = [score['rouge2'][0] for score in bad_scores]
rouge2_dpo_scores = [score['rouge2'][0] for score in dpo_scores]
print('ROUGE-2 scores:')
print("Training scores:")
print("--------------------")
print('Good scores mean:', sum(rouge2_good_scores[:int(train_size * len(rouge2_good_scores))]) / len(rouge2_good_scores[:int(train_size * len(rouge2_good_scores))]))
print('\nBad scores mean:', sum(rouge2_bad_scores[:int(train_size * len(rouge2_bad_scores))]) / len(rouge2_bad_scores[:int(train_size * len(rouge2_bad_scores))]))
print("\nDPO scores mean:", sum(rouge2_dpo_scores[:int(train_size * len(rouge2_dpo_scores))]) / len(rouge2_dpo_scores[:int(train_size * len(rouge2_dpo_scores))]))
print("--------------------")
print("Test scores:")
print("--------------------")
print('Good scores mean:', sum(rouge2_good_scores[int(train_size * len(rouge2_good_scores)):]) / len(rouge2_good_scores[int(train_size * len(rouge2_good_scores)):]))
print('\nBad scores mean:', sum(rouge2_bad_scores[int(train_size * len(rouge2_bad_scores)):]) / len(rouge2_bad_scores[int(train_size * len(rouge2_bad_scores)):]))
print("\nDPO scores mean:", sum(rouge2_dpo_scores[int(train_size * len(rouge2_dpo_scores)):]) / len(rouge2_dpo_scores[int(train_size * len(rouge2_dpo_scores)):]))
print("--------------------")

ROUGE-2 scores:
Training scores:
--------------------
Good scores mean: 0.4314135826130457

Bad scores mean: 0.3201015425891059

DPO scores mean: 0.21517255965759072
--------------------
Test scores:
--------------------
Good scores mean: 0.4281350415985045

Bad scores mean: 0.3121613461177329

DPO scores mean: 0.21478939464611416
--------------------


Rouge-L F1 Scores

In [27]:
rougeL_good_scores = [score['rougeL'][0] for score in good_scores]
rougeL_bad_scores = [score['rougeL'][0] for score in bad_scores]
rougeL_dpo_scores = [score['rougeL'][0] for score in dpo_scores]
print('ROUGE-L scores:')
print("Training scores:")
print("--------------------")
print('Good scores mean:', sum(rougeL_good_scores[:int(train_size * len(rougeL_good_scores))]) / len(rougeL_good_scores[:int(train_size * len(rougeL_good_scores))]))
print('\nBad scores mean:', sum(rougeL_bad_scores[:int(train_size * len(rougeL_bad_scores))]) / len(rougeL_bad_scores[:int(train_size * len(rougeL_bad_scores))]))
print("\nDPO scores mean:", sum(rougeL_dpo_scores[:int(train_size * len(rougeL_dpo_scores))]) / len(rougeL_dpo_scores[:int(train_size * len(rougeL_dpo_scores))]))
print("--------------------")
print("Test scores:")
print("--------------------")
print('Good scores mean:', sum(rougeL_good_scores[int(train_size * len(rougeL_good_scores)):]) / len(rougeL_good_scores[int(train_size * len(rougeL_good_scores)):]))
print('\nBad scores mean:', sum(rougeL_bad_scores[int(train_size * len(rougeL_bad_scores)):]) / len(rougeL_bad_scores[int(train_size * len(rougeL_bad_scores)):]))
print("\nDPO scores mean:", sum(rougeL_dpo_scores[int(train_size * len(rougeL_dpo_scores)):]) / len(rougeL_dpo_scores[int(train_size * len(rougeL_dpo_scores)):]))
print("--------------------")

ROUGE-L scores:
Training scores:
--------------------
Good scores mean: 0.5862384824429212

Bad scores mean: 0.43570175833313923

DPO scores mean: 0.4197006638988147
--------------------
Test scores:
--------------------
Good scores mean: 0.5829627142829823

Bad scores mean: 0.4297396676295664

DPO scores mean: 0.425424527155304
--------------------


### Scores for 2k tokens documents

In [22]:
good_scores = json.load(open('rouge_scores/good_scores_2k.json', 'r'))
bad_scores = json.load(open('rouge_scores/bad_scores_2k.json', 'r'))
dpo_scores = json.load(open('rouge_scores/dpo_scores_2k.json', 'r'))
sft_scores = json.load(open('rouge_scores/sft_scores_2k.json', 'r'))

train_size = 0.8

Rouge-1 F1 Scores

In [29]:
rouge1_good_scores = [score['rouge1'][0] for score in good_scores]
rouge1_bad_scores = [score['rouge1'][0] for score in bad_scores]
rouge1_dpo_scores = [score['rouge1'][0] for score in dpo_scores]
rouge1_sft_scores = [score['rouge1'][0] for score in sft_scores]
print('ROUGE-1 scores:')
print("Training scores:")
print("--------------------")
print('Good scores mean:', sum(rouge1_good_scores[:int(train_size * len(rouge1_good_scores))]) / len(rouge1_good_scores[:int(train_size * len(rouge1_good_scores))]))
print('\nBad scores mean:', sum(rouge1_bad_scores[:int(train_size * len(rouge1_bad_scores))]) / len(rouge1_bad_scores[:int(train_size * len(rouge1_bad_scores))]))
print("\nDPO scores mean:", sum(rouge1_dpo_scores[:int(train_size * len(rouge1_dpo_scores))]) / len(rouge1_dpo_scores[:int(train_size * len(rouge1_dpo_scores))]))
print("\nSFT scores mean:", sum(rouge1_sft_scores[:int(train_size * len(rouge1_sft_scores))]) / len(rouge1_sft_scores[:int(train_size * len(rouge1_sft_scores))]))
print("--------------------")
print("Test scores:")
print("--------------------")
print('Good scores mean:', sum(rouge1_good_scores[int(train_size * len(rouge1_good_scores)):]) / len(rouge1_good_scores[int(train_size * len(rouge1_good_scores)):]))
print('\nBad scores mean:', sum(rouge1_bad_scores[int(train_size * len(rouge1_bad_scores)):]) / len(rouge1_bad_scores[int(train_size * len(rouge1_bad_scores)):]))
print("\nDPO scores mean:", sum(rouge1_dpo_scores[int(train_size * len(rouge1_dpo_scores)):]) / len(rouge1_dpo_scores[int(train_size * len(rouge1_dpo_scores)):]))
print("\nSFT scores mean:", sum(rouge1_sft_scores[int(train_size * len(rouge1_sft_scores)):]) / len(rouge1_sft_scores[int(train_size * len(rouge1_sft_scores)):]))
print("--------------------")


ROUGE-1 scores:
Training scores:
--------------------
Good scores mean: 0.751753031874592

Bad scores mean: 0.6080961269545818

DPO scores mean: 0.5947431005397751

SFT scores mean: 0.7892345105566898
--------------------
Test scores:
--------------------
Good scores mean: 0.751103870514794

Bad scores mean: 0.6147772922826621

DPO scores mean: 0.5875321861362157

SFT scores mean: 0.7892884830368279
--------------------


Rouge-2 F1 Scores

In [30]:
rouge2_good_scores = [score['rouge2'][0] for score in good_scores]
rouge2_bad_scores = [score['rouge2'][0] for score in bad_scores]
rouge2_dpo_scores = [score['rouge2'][0] for score in dpo_scores]
rouge2_sft_scores = [score['rouge2'][0] for score in sft_scores]
print('ROUGE-2 scores:')
print("Training scores:")
print("--------------------")
print('Good scores mean:', sum(rouge2_good_scores[:int(train_size * len(rouge2_good_scores))]) / len(rouge2_good_scores[:int(train_size * len(rouge2_good_scores))]))
print('\nBad scores mean:', sum(rouge2_bad_scores[:int(train_size * len(rouge2_bad_scores))]) / len(rouge2_bad_scores[:int(train_size * len(rouge2_bad_scores))]))
print("\nDPO scores mean:", sum(rouge2_dpo_scores[:int(train_size * len(rouge2_dpo_scores))]) / len(rouge2_dpo_scores[:int(train_size * len(rouge2_dpo_scores))]))
print("\nSFT scores mean:", sum(rouge2_sft_scores[:int(train_size * len(rouge2_sft_scores))]) / len(rouge2_sft_scores[:int(train_size * len(rouge2_sft_scores))]))
print("--------------------")
print("Test scores:")
print("--------------------")
print('Good scores mean:', sum(rouge2_good_scores[int(train_size * len(rouge2_good_scores)):]) / len(rouge2_good_scores[int(train_size * len(rouge2_good_scores)):]))
print('\nBad scores mean:', sum(rouge2_bad_scores[int(train_size * len(rouge2_bad_scores)):]) / len(rouge2_bad_scores[int(train_size * len(rouge2_bad_scores)):]))
print("\nDPO scores mean:", sum(rouge2_dpo_scores[int(train_size * len(rouge2_dpo_scores)):]) / len(rouge2_dpo_scores[int(train_size * len(rouge2_dpo_scores)):]))
print("\nSFT scores mean:", sum(rouge2_sft_scores[int(train_size * len(rouge2_sft_scores)):]) / len(rouge2_sft_scores[int(train_size * len(rouge2_sft_scores)):]))
print("--------------------")

ROUGE-2 scores:
Training scores:
--------------------
Good scores mean: 0.39880191527739184

Bad scores mean: 0.3396875945166942

DPO scores mean: 0.3011038206105337

SFT scores mean: 0.5417822128371445
--------------------
Test scores:
--------------------
Good scores mean: 0.3956736961852618

Bad scores mean: 0.3438119033575253

DPO scores mean: 0.2950888306599134

SFT scores mean: 0.5359303784791478
--------------------


Rouge-L F1 Scores

In [31]:
rougeL_good_scores = [score['rougeL'][0] for score in good_scores]
rougeL_bad_scores = [score['rougeL'][0] for score in bad_scores]
rougeL_dpo_scores = [score['rougeL'][0] for score in dpo_scores]
rougeL_sft_scores = [score['rougeL'][0] for score in sft_scores]
print('ROUGE-L scores:')
print("Training scores:")
print("--------------------")
print('Good scores mean:', sum(rougeL_good_scores[:int(train_size * len(rougeL_good_scores))]) / len(rougeL_good_scores[:int(train_size * len(rougeL_good_scores))]))
print('\nBad scores mean:', sum(rougeL_bad_scores[:int(train_size * len(rougeL_bad_scores))]) / len(rougeL_bad_scores[:int(train_size * len(rougeL_bad_scores))]))
print("\nDPO scores mean:", sum(rougeL_dpo_scores[:int(train_size * len(rougeL_dpo_scores))]) / len(rougeL_dpo_scores[:int(train_size * len(rougeL_dpo_scores))]))
print("\nSFT scores mean:", sum(rougeL_sft_scores[:int(train_size * len(rougeL_sft_scores))]) / len(rougeL_sft_scores[:int(train_size * len(rougeL_sft_scores))]))
print("--------------------")
print("Test scores:")
print("--------------------")
print('Good scores mean:', sum(rougeL_good_scores[int(train_size * len(rougeL_good_scores)):]) / len(rougeL_good_scores[int(train_size * len(rougeL_good_scores)):]))
print('\nBad scores mean:', sum(rougeL_bad_scores[int(train_size * len(rougeL_bad_scores)):]) / len(rougeL_bad_scores[int(train_size * len(rougeL_bad_scores)):]))
print("\nDPO scores mean:", sum(rougeL_dpo_scores[int(train_size * len(rougeL_dpo_scores)):]) / len(rougeL_dpo_scores[int(train_size * len(rougeL_dpo_scores)):]))
print("\nSFT scores mean:", sum(rougeL_sft_scores[int(train_size * len(rougeL_sft_scores)):]) / len(rougeL_sft_scores[int(train_size * len(rougeL_sft_scores)):]))
print("--------------------")

ROUGE-L scores:
Training scores:
--------------------
Good scores mean: 0.5562012254199095

Bad scores mean: 0.44100793324710685

DPO scores mean: 0.4204164376359979

SFT scores mean: 0.6245999721312844
--------------------
Test scores:
--------------------
Good scores mean: 0.5546049449790932

Bad scores mean: 0.4424617051811035

DPO scores mean: 0.41436384195624537

SFT scores mean: 0.6191841510223639
--------------------
