In [12]:
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer, scoring
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModelForSeq2SeqLM
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# from transformers import AutoModelForCausalLM
from tqdm import tqdm
import json

# ROUGE helpers (pairwise scorer + bootstrap aggregation) used by `rouge` below.
# This repeats the import already made earlier in this cell.
from rouge_score import rouge_scorer, scoring
# Load the "bertscore" metric once through the `evaluate` package;
# `BertScore` below calls `bert_score.compute(...)` on this object.
from evaluate import load
bert_score = load("bertscore")


def rouge(refs, preds):
    """
    Compute corpus-level ROUGE F-measures in the `t5` style. Related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`, paired position-by-position with `refs`.
    :return:
        A `dict` mapping "rouge1", "rouge2" and "rougeLsum" to the `mid`
        F-measure produced by the bootstrap aggregator.
    """
    metric_names = ["rouge1", "rouge2", "rougeLsum"]
    pair_scorer = rouge_scorer.RougeScorer(metric_names)
    bootstrap = scoring.BootstrapAggregator()

    # `zip` stops at the shorter list, so unpaired trailing items are ignored.
    for reference, prediction in zip(refs, preds):
        # " . " is treated as a sentence boundary and turned into a newline,
        # which is what `rougeLsum` needs to see sentences separately.
        bootstrap.add_scores(
            pair_scorer.score(
                reference.replace(" . ", ".\n"),
                prediction.replace(" . ", ".\n"),
            )
        )

    aggregated = bootstrap.aggregate()
    f_measures = {}
    for name in metric_names:
        f_measures[name] = aggregated[name].mid.fmeasure
    return f_measures


def BertScore(refs, preds):
    """
    Score one prediction against one reference with BERTScore.

    :param refs:
        The reference `str`.
    :param preds:
        The predicted `str`.
    :return:
        Whatever `bert_score.compute(...)` returns; callers in this notebook
        read `["f1"][0]` from it.
    """
    # Each text is wrapped in a one-element list because `compute` is given
    # lists. The prediction goes to `predictions=` and the reference to
    # `references=`: swapping the two exchanges precision and recall.
    bert_score_res = bert_score.compute(
        predictions=[preds],
        references=[refs],
        model_type="microsoft/deberta-xlarge-mnli",
        lang="en",
    )

    return bert_score_res

def get_score(refs, preds):
    """
    Score a list of predictions against a list of references.

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`, aligned with `refs`.
    :return:
        A `dict` with "rouge1" and "rougeL" (the "rougeLsum" value from
        `rouge`) plus "bertscore_f1", the mean of the per-pair BERTScore F1.
    :raises ValueError:
        If the two lists differ in length or are empty.
    """
    # `rouge` zips the lists (silently truncating) while the BERTScore mean is
    # taken over `len(refs)`; reject mismatches so both metrics see the same pairs.
    if len(refs) != len(preds):
        raise ValueError(
            f"refs and preds must have the same length, got {len(refs)} and {len(preds)}"
        )
    if len(refs) == 0:
        raise ValueError("refs and preds must not be empty")

    rouge_res = rouge(refs, preds)

    # Deliberately not named `bert_score`, to avoid shadowing the metric object.
    bert_f1_total = 0
    for ref, pred in zip(refs, preds):
        bert_f1_total += BertScore(ref, pred)["f1"][0]

    total_res = {
            "rouge1": rouge_res["rouge1"],
            "rougeL": rouge_res["rougeLsum"],
            "bertscore_f1": bert_f1_total/len(refs)
        }

    return total_res

In [13]:
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_json("/home/xbr/LLM/benchmark_llm_summarization/pair_with_qwen.json")

print(data.shape)
# Keep the first row of every (article_id, writer_id) pair. Deduplicate once
# and reuse the frame so the three summary lists stay row-aligned.
unique_pairs = data.drop_duplicates(subset=["article_id","writer_id"])
writer = unique_pairs['writer_summary'].to_list()
davinci = unique_pairs['text-davinci-002_summary'].to_list()
qwen = unique_pairs['qwen_summary'].to_list()
# In each call the first list is passed as the references, the second as the predictions.
print(get_score(writer,davinci))
print(get_score(writer,qwen))
print(get_score(qwen,davinci))





(599, 9)


  return self.fget.__get__(instance, owner)()


{'rouge1': 0.36650528185885634, 'rougeL': 0.2513014265393698, 'bertscore_f1': 0.6738523904766355}
{'rouge1': 0.3747219478308891, 'rougeL': 0.24548436002357307, 'bertscore_f1': 0.6767936660242933}
{'rouge1': 0.45431033160176304, 'rougeL': 0.33080460741759454, 'bertscore_f1': 0.7225180704678807}


In [13]:
#get correlation
from scipy.stats import kendalltau, spearmanr
import numpy as np

def correlation_score(dict1, dict2):
    """
    Print and return rank correlations between two sets of per-system scores.

    :param dict1:
        A `dict` mapping a system name to a `list` of scores.
    :param dict2:
        A `dict` with (at least) the same keys; each list is paired
        item-by-item with the list under the same key in `dict1`.
    :return:
        A `dict` with the keys "system_kendalltau", "system_spearmanr",
        "summary_kendalltau" and "summary_spearmanr".
    :raises ValueError:
        If `dict1` has no systems.
    """
    systems = list(dict1.keys())
    if not systems:
        raise ValueError("correlation_score needs at least one system in dict1")

    #system level: correlate the per-system mean scores
    means1 = [np.mean(dict1[name]) for name in systems]
    means2 = [np.mean(dict2[name]) for name in systems]
    system_tau = kendalltau(means1, means2)[0]
    system_rho = spearmanr(means1, means2)[0]
    print("kendalltau correlation of system level is ", system_tau)
    print("spearmans correlation of system level is ", system_rho)

    #summary level: correlate the score lists of each system, then average over systems.
    # A nan from any single system makes the average nan as well.
    total_corr = sum(kendalltau(dict1[name], dict2[name])[0] for name in systems)
    total_corr2 = sum(spearmanr(dict1[name], dict2[name])[0] for name in systems)
    summary_tau = total_corr/len(systems)
    summary_rho = total_corr2/len(systems)
    print("kendalltau correlation of summary level is ", summary_tau)
    print("spearmans correlation of summary level is ", summary_rho)

    return {
        "system_kendalltau": system_tau,
        "system_spearmanr": system_rho,
        "summary_kendalltau": summary_tau,
        "summary_spearmanr": summary_rho,
    }
    
import json
# Load the saved per-system score lists (system name -> list of scores).
# `with` closes each file handle as soon as it has been parsed.
with open("./LLM_evaluation_correlation_with_human/qwen/Qwen1.5-72B-Chat_filter_annotations_summeval_expert_coherence_eva.json") as fp:
    qwen_eva = json.load(fp)

with open("./LLM_evaluation_correlation_with_human/llama2_70b/Llama-2-70b-chat-hf_filter_annotations_summeval_expert_coherence_eva.json") as fp:
    llama_eva = json.load(fp)

with open("./LLM_evaluation_correlation_with_human/qwen/human_score_filter_annotations_summeval_expert_coherence_eva.json") as fp:
    human_eva = json.load(fp)

new_dict = {}
for i in qwen_eva.keys():
    # (disabled) average the qwen and llama score of every item into `new_dict`:
    # tmp_list = []
    # for j in range(len(qwen_eva[i])):
    #     tmp_list.append((qwen_eva[i][j]+llama_eva[i][j])/2)
    # new_dict[i] = tmp_list

    # Mean score per system from each of the three sources.
    print(f'{i}, qwen score {np.mean(qwen_eva[i])}, llama score {np.mean(llama_eva[i])}, human scoe {np.mean(human_eva[i])}.')

# correlation_score(qwen_eva, human_eva)



M1, qwen score 4.77, llama score 3.26, human scoe 3.219999999999999.
M10, qwen score 3.98, llama score 3.07, human scoe 2.7266666666666657.
M11, qwen score 3.46, llama score 2.99, human scoe 2.2799999999999994.
M12, qwen score 4.77, llama score 3.28, human scoe 3.5966666666666667.
M13, qwen score 4.7, llama score 3.18, human scoe 3.4433333333333334.
M14, qwen score 4.61, llama score 3.12, human scoe 3.1966666666666668.
M15, qwen score 4.62, llama score 3.21, human scoe 3.3466666666666653.
M17, qwen score 4.9, llama score 3.36, human scoe 3.996666666666667.
M2, qwen score 4.89, llama score 3.27, human scoe 3.2766666666666664.
M20, qwen score 3.79, llama score 3.18, human scoe 3.6333333333333333.
M22, qwen score 4.92, llama score 3.26, human scoe 4.18.
M23, qwen score 4.83, llama score 3.33, human scoe 4.163333333333333.
M5, qwen score 4.98, llama score 3.47, human scoe 3.71.
M8, qwen score 4.54, llama score 3.18, human scoe 3.2900000000000005.
M9, qwen score 4.51, llama score 3.05, huma

In [3]:
#this notebook focuses on the evaluation of summary quality using rouge and bert score
from datasets import load_dataset
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import json
# Set OpenAI's API key and API base to use vLLM's API server.
from scipy.stats import kendalltau, spearmanr
from openai import OpenAI
import sacrebleu
#load bert score model
from rouge_score import rouge_scorer, scoring
# from evaluate import load
# bert_score = load("bertscore")



def rouge(refs, preds):
    """
    Compute corpus-level ROUGE F-measures in the `t5` style. Related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`, paired position-by-position with `refs`.
    :return:
        A `dict` mapping "rouge1", "rouge4" and "rougeLsum" to the `mid`
        F-measure produced by the bootstrap aggregator.
    """
    # NOTE(review): "rouge4" here vs "rouge2" in the first cell — confirm which is intended.
    metric_names = ["rouge1", "rouge4", "rougeLsum"]
    pair_scorer = rouge_scorer.RougeScorer(metric_names)
    bootstrap = scoring.BootstrapAggregator()

    # `zip` stops at the shorter list, so unpaired trailing items are ignored.
    for reference, prediction in zip(refs, preds):
        # " . " is treated as a sentence boundary and turned into a newline,
        # which is what `rougeLsum` needs to see sentences separately.
        bootstrap.add_scores(
            pair_scorer.score(
                reference.replace(" . ", ".\n"),
                prediction.replace(" . ", ".\n"),
            )
        )

    aggregated = bootstrap.aggregate()
    f_measures = {}
    for name in metric_names:
        f_measures[name] = aggregated[name].mid.fmeasure
    return f_measures


# def BertScore(refs, preds):
#     bert_score_res = bert_score.compute(predictions=[refs], references=[preds], model_type="microsoft/deberta-xlarge-mnli", lang="en")
    
#     return bert_score_res

def get_score(refs, preds):
    """
    Score a single prediction against a single reference.

    :param refs:
        The reference `str`.
    :param preds:
        The predicted `str`.
    :return:
        A `dict` with "rouge1", "rougeL" (the "rougeLsum" value from `rouge`)
        and "bertscore_f1", which is the constant 0 in this cell.
    """
    # `rouge` takes lists, so the single pair is wrapped on the way in.
    pair_rouge = rouge([refs], [preds])
    # BERTScore (and chrF) are switched off here; the key is kept with a 0 placeholder.
    return dict(
        rouge1=pair_rouge["rouge1"],
        rougeL=pair_rouge["rougeLsum"],
        bertscore_f1=0,
    )


def loop_score_api_chat(summary_list, reference_list, metric):
    """
    Score every summary against its references and average over the references.

    :param summary_list:
        A `list` of summary `strs`.
    :param reference_list:
        A `list` aligned with `summary_list`; each entry is either a `list`
        of reference `strs` or a single reference `str`.
    :param metric:
        Key to read from the `dict` returned by `get_score`.
    :return:
        A `list` with one averaged score per summary.
    :raises ValueError:
        If a summary has an empty list of references.
    """
    score_list = []

    for i, summary in enumerate(summary_list):
        references = reference_list[i]
        # A bare string is ONE reference; iterating it would score the
        # summary against every single character instead.
        if isinstance(references, str):
            references = [references]
        if len(references) == 0:
            raise ValueError(f"no references for the summary at index {i}")

        tmp_score = sum(get_score(reference, summary)[metric] for reference in references)
        score_list.append(tmp_score/len(references))
    return score_list

def correlation_score(dict1, dict2):
    """
    Print and return rank correlations between two sets of per-system scores.

    :param dict1:
        A `dict` mapping a system name to a `list` of scores.
    :param dict2:
        A `dict` with (at least) the same keys; each list is paired
        item-by-item with the list under the same key in `dict1`.
    :return:
        A `dict` with the keys "system_kendalltau", "system_spearmanr",
        "summary_kendalltau" and "summary_spearmanr".
    :raises ValueError:
        If `dict1` has no systems.
    """
    systems = list(dict1.keys())
    if not systems:
        raise ValueError("correlation_score needs at least one system in dict1")

    #system level: correlate the per-system mean scores
    means1 = [np.mean(dict1[name]) for name in systems]
    means2 = [np.mean(dict2[name]) for name in systems]
    system_tau = kendalltau(means1, means2)[0]
    system_rho = spearmanr(means1, means2)[0]
    print("kendalltau correlation of system level is ", system_tau)
    print("spearmans correlation of system level is ", system_rho)

    #summary level: correlate the score lists of each system, then average over systems.
    # A nan from any single system makes the average nan as well.
    total_corr = sum(kendalltau(dict1[name], dict2[name])[0] for name in systems)
    total_corr2 = sum(spearmanr(dict1[name], dict2[name])[0] for name in systems)
    summary_tau = total_corr/len(systems)
    summary_rho = total_corr2/len(systems)
    print("kendalltau correlation of summary level is ", summary_tau)
    print("spearmans correlation of summary level is ", summary_rho)

    return {
        "system_kendalltau": system_tau,
        "system_spearmanr": system_rho,
        "summary_kendalltau": summary_tau,
        "summary_spearmanr": summary_rho,
    }
    


def evaluate(path, aspect, metric = "rougeL", reference_model = 'M0'):
    """
    Correlate an automatic metric with human ratings for every system in a dataset.

    :param path:
        Path of a JSON file readable by `pd.read_json`, with the columns
        `model`, `summary`, `references` and the column named by `aspect`.
    :param aspect:
        Name of the column holding the human ratings.
    :param metric:
        Key of the `get_score` result to use (forwarded to `loop_score_api_chat`).
    :param reference_model:
        Value of the `model` column to leave out of the evaluation. It may be
        absent from the data, in which case nothing is excluded.
    """
    dataset_name = path.split("/")[-1].split(".")[0]
    print("evaluating dataset: ", dataset_name)

    target_dataset = pd.read_json(path)
    # Drop the reference system with a set difference: unlike `list.remove`,
    # this does not raise when the dataset has no such rows.
    model_list = sorted(set(target_dataset['model'].tolist()) - {reference_model})

    # Per-system metric scores and human ratings, keyed by system name.
    model_eva_dict= {}
    human_eva_dict = {}

    for m in model_list:
        print("evaluating model: ", m)
        tmp_dataset = target_dataset[(target_dataset['model']==m )]

        tmp_summary_list = tmp_dataset['summary'].tolist()
        tmp_score_list = tmp_dataset[aspect].tolist()
        tmp_reference_list = tmp_dataset['references'].tolist()

        score_list = loop_score_api_chat(tmp_summary_list, tmp_reference_list, metric)

        model_eva_dict[m] = score_list
        human_eva_dict[m] = tmp_score_list

    #save the result (disabled)

    # save_name = str(metric)+'_'+str(dataset_name)+'_'+str(aspect)+'_eva.json'
    # human_save = 'human_score_'+str(dataset_name)+'_'+str(aspect)+'_eva.json'

    # with open('./LLM_evaluation_correlation_with_human/'+save_name, 'w') as fp:
    #     json.dump(model_eva_dict, fp)
    # with open('./LLM_evaluation_correlation_with_human/'+human_save, 'w') as fp:
    #     json.dump(human_eva_dict, fp)

    correlation_score(model_eva_dict, human_eva_dict)
    
if __name__ == "__main__":

    # Column of human ratings to correlate the metric with.
    aspect = "expert_coherence"
    # Annotation file to evaluate.
    # Alternative: '/home/xbr/LLM/benchmark_llm_summarization/likert_evaluation_results_cnndm_average.json'
    p = './filter_annotations_summeval_reference.jsonl'
    evaluate(p, aspect, metric="rougeL")



evaluating dataset:  filter_annotations_summeval_reference
evaluating model:  M1
evaluating model:  M10
evaluating model:  M11
evaluating model:  M12
evaluating model:  M13
evaluating model:  M14
evaluating model:  M15
evaluating model:  M17
evaluating model:  M2
evaluating model:  M20
evaluating model:  M22
evaluating model:  M23
evaluating model:  M5
evaluating model:  M8
evaluating model:  M9
kendalltau correlation of system level is  0.12380952380952381
spearmans correlation of system level is  0.1607142857142857
kendalltau correlation of summary level is  0.13073467435394007
spearmans correlation of summary level is  0.18163187689654095


In [6]:
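#An alternative way to compute the correlation between rouge and human scores
#Compute the correlation between human scores and the rouge of the summaries generated by qwen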

#this notebook focuses on the evaluation of summary quality using rouge and bert score
from datasets import load_dataset
import pandas as pd
import re
from tqdm import tqdm
import numpy as np
import json
# Set OpenAI's API key and API base to use vLLM's API server.
from scipy.stats import kendalltau, spearmanr
from openai import OpenAI
import sacrebleu
# ROUGE helpers (pairwise scorer + bootstrap aggregation) used by `rouge` below.
from rouge_score import rouge_scorer, scoring
# Load the "bertscore" metric once through the `evaluate` package;
# `BertScore` below calls `bert_score.compute(...)` on this object.
from evaluate import load
bert_score = load("bertscore")



def rouge(refs, preds):
    """
    Compute corpus-level ROUGE F-measures in the `t5` style. Related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`, paired position-by-position with `refs`.
    :return:
        A `dict` mapping "rouge1", "rouge4" and "rougeLsum" to the `mid`
        F-measure produced by the bootstrap aggregator.
    """
    # NOTE(review): "rouge4" here vs "rouge2" in the first cell — confirm which is intended.
    metric_names = ["rouge1", "rouge4", "rougeLsum"]
    pair_scorer = rouge_scorer.RougeScorer(metric_names)
    bootstrap = scoring.BootstrapAggregator()

    # `zip` stops at the shorter list, so unpaired trailing items are ignored.
    for reference, prediction in zip(refs, preds):
        # " . " is treated as a sentence boundary and turned into a newline,
        # which is what `rougeLsum` needs to see sentences separately.
        bootstrap.add_scores(
            pair_scorer.score(
                reference.replace(" . ", ".\n"),
                prediction.replace(" . ", ".\n"),
            )
        )

    aggregated = bootstrap.aggregate()
    f_measures = {}
    for name in metric_names:
        f_measures[name] = aggregated[name].mid.fmeasure
    return f_measures


def BertScore(refs, preds):
    """
    Score one prediction against one reference with BERTScore.

    :param refs:
        The reference `str`.
    :param preds:
        The predicted `str`.
    :return:
        Whatever `bert_score.compute(...)` returns; callers in this notebook
        read `["f1"][0]` from it.
    """
    # Each text is wrapped in a one-element list because `compute` is given
    # lists. The prediction goes to `predictions=` and the reference to
    # `references=`: swapping the two exchanges precision and recall.
    bert_score_res = bert_score.compute(
        predictions=[preds],
        references=[refs],
        model_type="microsoft/deberta-xlarge-mnli",
        lang="en",
    )

    return bert_score_res

def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        Either a flat `list` of reference `str`s (one per prediction), or a
        `list` of `list`s of reference `str`s already shaped as reference
        streams for `sacrebleu.corpus_bleu`.
    :param preds:
        A `list` of predicted `str`s.
    :return:
        The `.score` of the corpus-level BLEU result.
    """
    # `corpus_bleu` expects one list per reference *stream*. A flat list of
    # strings is a single stream, so wrap it (as the t5 implementation does);
    # otherwise each string would itself be taken as a stream of references.
    if len(refs) > 0 and isinstance(refs[0], str):
        refs = [refs]
    score = sacrebleu.corpus_bleu(preds, refs, smooth_method="exp", smooth_value=0.0, force=False,
                                  lowercase=False, tokenize="intl", use_effective_order=False).score
    return score

def get_score(refs, preds,metric):
    """
    Score one prediction against one reference with the chosen metric.

    :param refs:
        The reference `str`.
    :param preds:
        The predicted `str`.
    :param metric:
        Any name starting with "rouge" (looked up in the `dict` returned by
        `rouge`), or one of "bertscore", "bleu", "chrf".
    :return:
        The score for the pair.
    :raises ValueError:
        If `metric` is none of the supported names.
    """
    if metric.startswith("rouge"):
        # `rouge` takes lists, so the single pair is wrapped.
        return rouge([refs], [preds])[metric]
    if metric == "bertscore":
        return BertScore(refs, preds)["f1"][0]
    if metric == "bleu":
        # One hypothesis, and one reference stream holding its one reference.
        return bleu([[refs]], [preds])
    if metric == "chrf":
        # An empty prediction is replaced by a single space, as before.
        hypothesis = preds if preds != "" else " "
        # Hypotheses are a list of strings and references a list of streams;
        # a bare string would be iterated character by character.
        return sacrebleu.corpus_chrf([hypothesis], [[refs]]).score
    # An unknown name used to fall through and score 0 silently.
    raise ValueError(f"unsupported metric: {metric!r}")


def loop_score_api_chat(summary_list, reference_list, metric):
    """
    Score every summary against the reference at the same position.

    :param summary_list:
        A `list` of summary `strs`.
    :param reference_list:
        A `list` of reference `strs`, indexed in step with `summary_list`.
    :param metric:
        Metric name forwarded to `get_score`.
    :return:
        A `list` with one `get_score` result per summary.
    """
    # Index `reference_list` by position so a too-short list still raises IndexError.
    return [
        get_score(reference_list[position], candidate, metric)
        for position, candidate in enumerate(summary_list)
    ]

def correlation_score(dict1, dict2):
    """
    Print and return rank correlations between two sets of per-system scores.

    :param dict1:
        A `dict` mapping a system name to a `list` of scores.
    :param dict2:
        A `dict` with (at least) the same keys; each list is paired
        item-by-item with the list under the same key in `dict1`.
    :return:
        A `dict` with the keys "system_kendalltau", "system_spearmanr",
        "summary_kendalltau" and "summary_spearmanr".
    :raises ValueError:
        If `dict1` has no systems.
    """
    systems = list(dict1.keys())
    if not systems:
        raise ValueError("correlation_score needs at least one system in dict1")

    #system level: correlate the per-system mean scores
    means1 = [np.mean(dict1[name]) for name in systems]
    means2 = [np.mean(dict2[name]) for name in systems]
    system_tau = kendalltau(means1, means2)[0]
    system_rho = spearmanr(means1, means2)[0]
    print("kendalltau correlation of system level is ", system_tau)
    print("spearmans correlation of system level is ", system_rho)

    #summary level: correlate the score lists of each system, then average over systems.
    # A nan from any single system makes the average nan as well.
    total_corr = sum(kendalltau(dict1[name], dict2[name])[0] for name in systems)
    total_corr2 = sum(spearmanr(dict1[name], dict2[name])[0] for name in systems)
    summary_tau = total_corr/len(systems)
    summary_rho = total_corr2/len(systems)
    print("kendalltau correlation of summary level is ", summary_tau)
    print("spearmans correlation of summary level is ", summary_rho)

    return {
        "system_kendalltau": system_tau,
        "system_spearmanr": system_rho,
        "summary_kendalltau": summary_tau,
        "summary_spearmanr": summary_rho,
    }
    


def evaluate(path, aspect, metric = "rougeLsum", reference_model = 'reference', few_shot = 0,
             reference_column = 'qwen_summary'):
    """
    Correlate an automatic metric with human ratings for every system in a dataset.

    :param path:
        Path of a JSON file readable by `pd.read_json`, with the columns
        `model`, `summary`, the column named by `reference_column` and the
        column named by `aspect`.
    :param aspect:
        Name of the column holding the human ratings.
    :param metric:
        Metric name forwarded to `get_score` through `loop_score_api_chat`.
    :param reference_model:
        Value of the `model` column to leave out of the evaluation. It may be
        absent from the data, in which case nothing is excluded.
    :param few_shot:
        Currently unused.
    :param reference_column:
        Name of the column whose text serves as the reference for each summary.
    """
    dataset_name = path.split("/")[-1].split(".")[0]
    print("evaluating dataset: ", dataset_name)
    model_name = metric

    target_dataset = pd.read_json(path)
    # Drop the reference system with a set difference: unlike `list.remove`,
    # this does not raise when the dataset has no such rows.
    model_list = sorted(set(target_dataset['model'].tolist()) - {reference_model})

    # Per-system metric scores and human ratings, keyed by system name.
    model_eva_dict= {}
    human_eva_dict = {}

    for m in model_list:
        print("evaluating model: ", m)
        tmp_dataset = target_dataset[(target_dataset['model']==m )]

        tmp_summary_list = tmp_dataset['summary'].tolist()
        tmp_score_list = tmp_dataset[aspect].tolist()

        # References come from a column of the same rows. (An earlier, disabled
        # variant used the reference model's own `summary` for the same article.)
        tmp_reference_list = tmp_dataset[reference_column].tolist()

        score_list = loop_score_api_chat(tmp_summary_list, tmp_reference_list, metric)

        model_eva_dict[m] = score_list
        human_eva_dict[m] = tmp_score_list

    #save the result (writing is disabled; only the file names are built)

    save_name = str(model_name)+'_'+str(dataset_name)+'_'+str(aspect)+'_eva.json'
    human_save = 'human_score_'+str(dataset_name)+'_'+str(aspect)+'_eva.json'

    # with open('./LLM_evaluation_correlation_with_human/'+save_name, 'w') as fp:
    #     json.dump(model_eva_dict, fp)
    # with open('./LLM_evaluation_correlation_with_human/'+human_save, 'w') as fp:
    #     json.dump(human_eva_dict, fp)

    correlation_score(model_eva_dict, human_eva_dict)
    
if __name__ == "__main__":

    # Column of human ratings to correlate the metric with.
    aspect = "expert_coherence"
    # Annotation file to evaluate. Alternatives:
    #   '/home/xbr/LLM/benchmark_llm_summarization/likert_evaluation_results_xsum_average_with_qwen.json'
    #   './filter_annotations_summeval.jsonl'
    p = './filter_annotations_summeval_llama2_summary.jsonl'
    evaluate(p, aspect, reference_model="M0")





evaluating dataset:  filter_annotations_summeval_llama2_summary
evaluating model:  M1
evaluating model:  M10
evaluating model:  M11
evaluating model:  M12
evaluating model:  M13
evaluating model:  M14
evaluating model:  M15
evaluating model:  M17
evaluating model:  M2
evaluating model:  M20
evaluating model:  M22
evaluating model:  M23


SignificanceResult(statistic=0.5270462766947298, pvalue=0.206507295485425)
