In [1]:
import pandas as pd
import numpy as np
import os
import scipy.stats as stats
import torch
from helper import *
from coval.coval.conll.reader import get_coref_infos
from coval.coval.eval.evaluator import evaluate_documents as evaluate
from coval.coval.eval.evaluator import muc, b_cubed, ceafe, lea
# Turn off TF32 for CUDA matmuls and for cuDNN (both flags are set to False).
# NOTE(review): presumably done so float32 math keeps full precision and scores
# stay comparable across GPUs — confirm the intent.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False


In [10]:
def read(key, response):
    '''Load the coreference info for a gold/system keyfile pair.

        key:  path of the keyfile written for gold_map
        response: path of the keyfile written for model_map

        Returns whatever `get_coref_infos` returns for the two files.
    '''
    gold_keyfile = '%s' % key
    system_keyfile = '%s' % response
    # The three flags are passed positionally, exactly as before.
    # NOTE(review): presumably NP_only=False, remove_nested=False,
    # keep_singletons=True — confirm against coval's get_coref_infos signature.
    return get_coref_infos(gold_keyfile, system_keyfile, False, False, True)

def give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path):
    '''Get MUC, B-cubed, CEAFe, LEA and CoNLL F1 for one gold/model pair.

        gold_map_path: path of the pickled gold_map
        model_map_path: path of the pickled model_map
        keyfile_save_path: folder that receives the two generated keyfiles and
            'multi_metrics.txt'; it is created when it does not exist yet

        Prints the report, writes it to 'multi_metrics.txt' and returns a dict
        keyed by metric name. 'MUC', 'B-cubed', 'CEAFe' and 'LEA' each map to
        {'R': ..., 'P': ..., 'F1': ...}; 'CoNLL F1' maps to {'F1': ...}. All
        values are percentages rounded to one decimal.
    '''
    # NOTE(review): read_pickle unpickles arbitrary objects — only load maps
    # produced by trusted runs.
    gold_map = pd.read_pickle(gold_map_path)
    model_map = pd.read_pickle(model_map_path)
    save_folder = os.path.join(keyfile_save_path)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
        print(f"Folder '{keyfile_save_path}' created successfully")
    else:
        print('Testing')
    gold_map_conll_file = save_folder + '/evt_gold_test.keyfile'
    model_map_conll_file = save_folder + '/evt_model_test.keyfile'
    generate_key_file(gold_map.items(), 'evt', save_folder, gold_map_conll_file)
    generate_key_file(model_map.items(), 'evt', save_folder, model_map_conll_file)
    doc = read(gold_map_conll_file, model_map_conll_file)

    multi_metrics_dict = {}
    report_lines = []
    for label, metric in (('MUC', muc), ('B-cubed', b_cubed),
                          ('CEAFe', ceafe), ('LEA', lea)):
        # Round to 3 decimals, scale to percent, then round to 1 decimal; the
        # double rounding is kept so previously reported numbers are reproduced.
        recall, precision, f1 = np.round(np.round(evaluate(doc, metric), 3) * 100, 1)
        multi_metrics_dict[label] = {'R': recall, 'P': precision, 'F1': f1}
        report_lines.append(f"{label}: R {recall} P {precision} F1 {f1}\n")

    # CoNLL F1 is the mean of the MUC, B-cubed and CEAFe F1 scores (LEA excluded).
    conf = np.round((multi_metrics_dict['MUC']['F1']
                     + multi_metrics_dict['B-cubed']['F1']
                     + multi_metrics_dict['CEAFe']['F1']) / 3, 1)
    report_lines.append(f"CoNLL F1: {conf}\n")
    # Previously 'LEA' and 'CoNLL F1' were set literals (commas typed for
    # colons), so their values could not be looked up by key.
    multi_metrics_dict['CoNLL F1'] = {'F1': conf}

    multi_metrics_str = ''.join(report_lines)
    print(multi_metrics_str)
    with open(os.path.join(save_folder, 'multi_metrics.txt'), "w") as file:
        file.write(multi_metrics_str)
    return multi_metrics_dict

In [None]:
def give_evaluation_result_2(gold_map_path,model_map_path,save_path):
    '''Score one gold/model pair and return the numbers instead of a report.

        gold_map_path: path of the pickled gold_map
        model_map_path: path of the pickled model_map
        save_path: folder that receives the two generated keyfiles; this
            function does not create it

        Returns (mu, b_cu, cea, le, conf): the first four are (R, P, F1)
        tuples for MUC, B-cubed, CEAFe and LEA, and conf is the mean of the
        MUC, B-cubed and CEAFe F1 scores. All values are percentages rounded
        to one decimal.
    '''
    gold_clusters = pd.read_pickle(gold_map_path)
    predicted_clusters = pd.read_pickle(model_map_path)
    print('Testing')

    gold_keyfile = save_path + '/evt_gold_test.keyfile'
    model_keyfile = save_path + '/evt_model_test.keyfile'
    generate_key_file(gold_clusters.items(), 'evt', save_path, gold_keyfile)
    generate_key_file(predicted_clusters.items(), 'evt', save_path, model_keyfile)
    doc = read(gold_keyfile, model_keyfile)

    def percent_scores(metric):
        # Round to 3 decimals, scale to percent, round to 1 decimal.
        recall, precision, f1 = np.round(np.round(evaluate(doc, metric), 3) * 100, 1)
        return (recall, precision, f1)

    mu = percent_scores(muc)
    b_cu = percent_scores(b_cubed)
    cea = percent_scores(ceafe)
    le = percent_scores(lea)
    conf = np.round((mu[2] + b_cu[2] + cea[2]) / 3, 1)
    return mu, b_cu, cea, le, conf


def B_cubed_pairwise_t_test(test_programme_path, n_turns=10):
    '''Paired t-test on B-cubed F1 between the 'base' and 'aug' runs.

        test_programme_path: folder holding 'base' and 'aug' sub-folders, each
            with 'turn_0' ... 'turn_{n_turns-1}' folders that contain the
            pickled 'gold_map' and 'model_map' of that turn
        n_turns: number of paired turns to evaluate (default 10, the value
            that used to be hard-coded)

        Prints the scipy `ttest_rel` result (base first, aug second) and also
        returns it so callers can read the statistic and p-value.
    '''
    base_path = os.path.join(test_programme_path, 'base')
    aug_path = os.path.join(test_programme_path, 'aug')
    base_b_cubed_list = []
    aug_b_cubed_list = []
    for t in range(n_turns):
        base_turn_path = os.path.join(base_path, 'turn_{}'.format(t))
        aug_turn_path = os.path.join(aug_path, 'turn_{}'.format(t))
        # Only the B-cubed triple is needed; the other metrics are discarded.
        _, base_b_cu, _, _, _ = give_evaluation_result_2(
            os.path.join(base_turn_path, 'gold_map'),
            os.path.join(base_turn_path, 'model_map'),
            base_turn_path)
        _, aug_b_cu, _, _, _ = give_evaluation_result_2(
            os.path.join(aug_turn_path, 'gold_map'),
            os.path.join(aug_turn_path, 'model_map'),
            aug_turn_path)
        # The last element of the (R, P, F1) triple is the F1 score.
        base_b_cubed_list.append(base_b_cu[-1])
        aug_b_cubed_list.append(aug_b_cu[-1])
    result = stats.ttest_rel(base_b_cubed_list, aug_b_cubed_list)
    print(result)
    return result
# Paired t-test for ECB+ over 7 hand-entered score pairs.
# NOTE(review): presumably baseline scores first, enhanced second — confirm.
stats.ttest_rel(
    [84.1, 85.6, 83.5, 84.4, 84.3, 84.5, 82.8],
    [85.5, 86, 85.1, 84.9, 84.8, 84.9, 84.3],
)

# 1. Main Experiments

## 1.1 ECB+

In [12]:
# Main experiment, ECB+, baseline system: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is the
# baseline run; the name is kept because other cells may rely on it.
exp_type, dataset, ECRsystem = 'main', 'ecb', 'baseline'
save_path_prefix = f'../outputs/{exp_type}/{dataset}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/main/ecb/baseline/eval_results/multi_metrics' created successfully
MUC: R 82.5 P 88.6 F1 85.4
B-cubed: R 82.6 P 88.6 F1 85.5
CEAFe: R 85.1 P 78.5 F1 81.7
LEA: R 74.0 P 77.4 F1 75.6
CoNLL F1: 84.2



In [9]:
# Main experiment, ECB+, enhanced system: score the saved gold_map/model_map.
exp_type, dataset, ECRsystem = 'main', 'ecb', 'enhanced'
save_path_prefix = f'../outputs/{exp_type}/{dataset}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Testing
MUC: R 86.4 P 88.6 F1 87.5
B-cubed: R 85.7 P 88.4 F1 87.0
CEAFe: R 84.7 P 82.2 F1 83.4
LEA: R 77.4 P 79.6 F1 78.5
CoNLL F1: 86.0



## 1.2 FCC

In [13]:
# Main experiment, FCC, baseline system: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is the
# FCC baseline run; the name is kept because other cells may rely on it.
exp_type, dataset, ECRsystem = 'main', 'fcc', 'baseline'
save_path_prefix = f'../outputs/{exp_type}/{dataset}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/main/fcc/baseline/eval_results/multi_metrics' created successfully
MUC: R 79.2 P 88.9 F1 83.7
B-cubed: R 64.4 P 61.6 F1 63.0
CEAFe: R 73.3 P 46.0 F1 56.5
LEA: R 58.1 P 47.2 F1 52.1
CoNLL F1: 67.7



In [14]:
# Main experiment, FCC, enhanced system: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# FCC run; the name is kept because other cells may rely on it.
exp_type, dataset, ECRsystem = 'main', 'fcc', 'enhanced'
save_path_prefix = f'../outputs/{exp_type}/{dataset}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/main/fcc/enhanced/eval_results/multi_metrics' created successfully
MUC: R 79.2 P 88.2 F1 83.4
B-cubed: R 66.8 P 74.7 F1 70.5
CEAFe: R 72.7 P 46.7 F1 56.9
LEA: R 60.1 P 60.1 F1 60.1
CoNLL F1: 70.3



## 1.3 GVC

In [15]:
# Main experiment, GVC, baseline system: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is the
# GVC baseline run; the name is kept because other cells may rely on it.
exp_type, dataset, ECRsystem = 'main', 'gvc', 'baseline'
save_path_prefix = f'../outputs/{exp_type}/{dataset}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/main/gvc/baseline/eval_results/multi_metrics' created successfully
MUC: R 89.3 P 92.3 F1 90.8
B-cubed: R 82.1 P 85.7 F1 83.8
CEAFe: R 76.6 P 67.5 F1 71.7
LEA: R 76.9 P 78.8 F1 77.8
CoNLL F1: 82.1



In [17]:
# Main experiment, GVC, enhanced system: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is a
# GVC run; the name is kept because other cells may rely on it.
exp_type, dataset, ECRsystem = 'main', 'gvc', 'enhanced'
save_path_prefix = f'../outputs/{exp_type}/{dataset}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/main/gvc/enhanced/eval_results/multi_metrics' created successfully
MUC: R 90.4 P 92.1 F1 91.3
B-cubed: R 84.8 P 86.8 F1 85.8
CEAFe: R 78.9 P 73.2 F1 76.0
LEA: R 79.8 P 80.7 F1 80.2
CoNLL F1: 84.4



# 2. Ablation Study

## 2.1 CAD

In [18]:
# Ablation study, 'CAD' setting: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# ablation run; the name is kept because other cells may rely on it.
exp_type, data_type = 'ablation_study', 'CAD'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ablation_study/CAD/eval_results/multi_metrics' created successfully
MUC: R 87.2 P 86.5 F1 86.8
B-cubed: R 86.7 P 85.4 F1 86.1
CEAFe: R 81.8 P 82.7 F1 82.2
LEA: R 77.2 P 76.2 F1 76.7
CoNLL F1: 85.0



## 2.2 TIA

In [19]:
# Ablation study, 'TIA' setting: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# ablation run; the name is kept because other cells may rely on it.
exp_type, data_type = 'ablation_study', 'TIA'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ablation_study/TIA/eval_results/multi_metrics' created successfully
MUC: R 82.1 P 85.7 F1 83.9
B-cubed: R 83.3 P 85.2 F1 84.2
CEAFe: R 82.7 P 78.6 F1 80.6
LEA: R 73.9 P 73.7 F1 73.8
CoNLL F1: 82.9



## 2.3 CIA

In [20]:
# Ablation study, 'CIA' setting: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# ablation run; the name is kept because other cells may rely on it.
exp_type, data_type = 'ablation_study', 'CIA'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ablation_study/CIA/eval_results/multi_metrics' created successfully
MUC: R 84.1 P 86.2 F1 85.2
B-cubed: R 84.5 P 86.5 F1 85.5
CEAFe: R 82.4 P 80.0 F1 81.2
LEA: R 74.7 P 75.5 F1 75.1
CoNLL F1: 84.0



## 2.4 TAD

In [21]:
# Ablation study, 'TAD' setting: score the saved gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# ablation run; the name is kept because other cells may rely on it.
exp_type, data_type = 'ablation_study', 'TAD'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ablation_study/TAD/eval_results/multi_metrics' created successfully
MUC: R 75.7 P 89.0 F1 81.8
B-cubed: R 78.2 P 89.8 F1 83.6
CEAFe: R 86.3 P 73.0 F1 79.1
LEA: R 69.7 P 75.2 F1 72.4
CoNLL F1: 81.5



## 2.5 RMCT

In [30]:
# Ablation study, 'RMCT' setting: score the saved gold_map/model_map.
# NOTE(review): the saved output of this cell reports a folder under
# 'eval_results_2', which this code cannot produce — the output is stale and
# the cell should be re-run.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# ablation run; the name is kept because other cells may rely on it.
exp_type, data_type = 'ablation_study', 'RMCT'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ablation_study/RMCT/eval_results_2/multi_metrics' created successfully
MUC: R 88.7 P 80.4 F1 84.3
B-cubed: R 87.6 P 77.6 F1 82.3
CEAFe: R 71.8 P 82.1 F1 76.6
LEA: R 73.2 P 67.6 F1 70.3
CoNLL F1: 81.1



# 3. Out-of-Domain Test

## 3.1 ECB_Cross_FCC

In [24]:
# Out-of-domain test 'ecb_cross_fcc', baseline system: score the saved
# gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# out-of-domain baseline run; the name is kept because other cells may rely on it.
exp_type, data_type, ECRsystem = 'ood_test', 'ecb_cross_fcc', 'baseline'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ood_test/ecb_cross_fcc/baseline/eval_results/multi_metrics' created successfully
MUC: R 44.0 P 83.0 F1 57.5
B-cubed: R 24.7 P 87.1 F1 38.4
CEAFe: R 71.7 P 20.2 F1 31.5
LEA: R 16.0 P 47.3 F1 23.9
CoNLL F1: 42.5



In [25]:
# Out-of-domain test 'ecb_cross_fcc', enhanced system: score the saved
# gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# out-of-domain run; the name is kept because other cells may rely on it.
exp_type, data_type, ECRsystem = 'ood_test', 'ecb_cross_fcc', 'enhanced'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ood_test/ecb_cross_fcc/enhanced/eval_results/multi_metrics' created successfully
MUC: R 60.0 P 80.1 F1 68.6
B-cubed: R 33.0 P 75.9 F1 46.0
CEAFe: R 58.9 P 24.9 F1 35.0
LEA: R 22.9 P 48.9 F1 31.2
CoNLL F1: 49.9



## 3.2 FCC_Cross_ECB

In [27]:
# Out-of-domain test 'fcc_cross_ecb', baseline system: score the saved
# gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# out-of-domain baseline run; the name is kept because other cells may rely on it.
exp_type, data_type, ECRsystem = 'ood_test', 'fcc_cross_ecb', 'baseline'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ood_test/fcc_cross_ecb/baseline/eval_results/multi_metrics' created successfully
MUC: R 95.2 P 56.8 F1 71.2
B-cubed: R 93.8 P 16.5 F1 28.1
CEAFe: R 12.7 P 69.7 F1 21.5
LEA: R 61.6 P 12.7 F1 21.0
CoNLL F1: 40.3



In [28]:
# Out-of-domain test 'fcc_cross_ecb', enhanced system: score the saved
# gold_map/model_map.
# NOTE(review): the result is stored as `ecb_main_enhanced` although this is an
# out-of-domain run; the name is kept because other cells may rely on it.
exp_type, data_type, ECRsystem = 'ood_test', 'fcc_cross_ecb', 'enhanced'
save_path_prefix = f'../outputs/{exp_type}/{data_type}/{ECRsystem}/eval_results'
gold_map_path, model_map_path, keyfile_save_path = (
    os.path.join(save_path_prefix, leaf)
    for leaf in ('gold_map', 'model_map', 'multi_metrics')
)
ecb_main_enhanced = give_evaluation_result(gold_map_path, model_map_path, keyfile_save_path)

Folder '../outputs/ood_test/fcc_cross_ecb/enhanced/eval_results/multi_metrics' created successfully
MUC: R 93.1 P 58.7 F1 72.0
B-cubed: R 91.4 P 19.5 F1 32.1
CEAFe: R 20.6 P 71.5 F1 32.0
LEA: R 61.6 P 14.0 F1 22.9
CoNLL F1: 45.4



In [29]:
# Paired t-test for ECB+ over 7 hand-entered score pairs.
# NOTE(review): presumably baseline scores first, enhanced second — confirm.
# NOTE(review): the same call also appears after B_cubed_pairwise_t_test.
stats.ttest_rel(
    [84.1, 85.6, 83.5, 84.4, 84.3, 84.5, 82.8],
    [85.5, 86, 85.1, 84.9, 84.8, 84.9, 84.3],
)

TtestResult(statistic=-4.209364560120707, pvalue=0.00562710764296143, df=6)