In [13]:
import pandas as pd
from scorer import *

In [14]:
def make_clusters(model_map):
    all_clusters=[]
    model_map_df=pd.DataFrame({'m_id':model_map.keys(),'c_id':model_map.values()})
    model_map_group=model_map_df.groupby('c_id')
    cluster_id_list=list(set(model_map_df['c_id']))
    for cluster_id in cluster_id_list:
        all_mentions_id=list(model_map_group.get_group(cluster_id)['m_id'])
        all_clusters.append(all_mentions_id)
    return all_clusters

def reshape_map(clusters):
    modified_map={}
    for cl in clusters:
        for m_id in cl:
            modified_map[m_id]=cl[0]
    return modified_map

def calculate_coref_metric(model_clusters,gold_clusters,gold_map,model_map):
    #muc
    m_pn,m_pd = muc(model_clusters, gold_map)
    m_rn, m_rd = muc(gold_clusters, model_map)
    m_f1=f1(m_pn, m_pd, m_rn, m_rd, beta=1)
    print("muc")
    print("Recall: {:.6f} Precision: {:.6f} F1 {:.6f}".format(m_rn/m_rd,m_pn/m_pd,m_f1))
    #b_cubed
    b_pn,b_pd = b_cubed(model_clusters, gold_map)
    b_rn, b_rd = b_cubed(gold_clusters, model_map)
    b_f1=f1(b_pn, b_pd, b_rn, b_rd, beta=1)
    print("b_cubed")
    print("Recall: {:.6f} Precision: {:.6f} F1 {:.6f}".format(b_rn/b_rd,b_pn/b_pd,b_f1)) 
    #ceafe
    c_pn, c_pd, c_rn, c_rd=ceafe(model_clusters,gold_clusters)
    c_f1=f1(c_pn, c_pd, c_rn, c_rd, beta=1)
    print('ceafe')
    print("Recall: {:.6f} Precision: {:.6f} F1 {:.6f} ".format(c_rn/c_rd,c_pn/c_pd,c_f1))
    #lea
    l_pn, l_pd = lea(model_clusters, gold_clusters,model_map)
    l_rn, l_rd = lea(gold_clusters, model_clusters,gold_map)
    l_f1=f1(l_pn, l_pd, l_rn, l_rd, beta=1)
    print('lea')
    print("Recall: {:.6f} Precision: {:.6f} F1 {:.6f} ".format(l_rn/l_rd,l_pn/l_pd,l_f1))
    #conll
    conll_f1=(1/3)*(m_f1+b_f1+c_f1)
    print('CoNLL F1 {:.6}'.format(conll_f1))

def llm_eval(model_map_path, gold_map_path):
    '''
    Parameters:
        model_map_path: the path to the mapping from the ECR system
        gold_map_path: the path to the mapping from the corpus
    Outputs:
        model_cluster: the predicted mention clustering result from ECR system
        gold_cluster: the golden mention clustering result from the corpus
    '''
    model_map = pd.read_pickle(model_map_path)
    model_cluster = make_clusters(model_map)
    reformat_model_map = reshape_map(model_cluster) # To align with the format of gold_map
    gold_map = pd.read_pickle(gold_map_path)
    gold_cluster = make_clusters(gold_map)
    calculate_coref_metric(model_cluster,gold_cluster,gold_map,reformat_model_map)
    return model_cluster, gold_cluster

In [19]:
# Eval zero_shot Claude-2
prompt_type = 'zero_shot'
model_name = 'claude2'
model_map_path =  f'./{prompt_type}/model_map/{model_name}_model_map'
gold_map_path = './golden_annotation/gold_map'
claude_model_cluster, claude_gold_cluster = llm_eval(model_map_path, gold_map_path)

muc
Recall: 0.389744 Precision: 0.617886 F1 0.477987
b_cubed
Recall: 0.516015 Precision: 0.762954 F1 0.615646
ceafe
Recall: 0.649777 Precision: 0.579901 F1 0.612853 
lea
Recall: 0.650000 Precision: 0.783125 F1 0.710379 
CoNLL F1 0.568829


In [20]:
# Eval zero_shot GPT-4
prompt_type = 'zero_shot'
model_name = 'gpt4'
model_map_path =  f'./{prompt_type}/model_map/{model_name}_model_map'
gold_map_path = './golden_annotation/gold_map'
gpt_model_cluster, gpt_gold_cluster = llm_eval(model_map_path, gold_map_path)

muc
Recall: 0.735385 Precision: 0.734631 F1 0.735008
b_cubed
Recall: 0.798464 Precision: 0.678220 F1 0.733446
ceafe
Recall: 0.624168 Precision: 0.637633 F1 0.630829 
lea
Recall: 0.650000 Precision: 0.863946 F1 0.741856 
CoNLL F1 0.699761
