## Read result files
These files have been generated with the [reference coreference scorer script](https://github.com/conll/reference-coreference-scorers).

In [21]:
import pandas as pd
import os 
import re
# Show every row when a DataFrame is displayed instead of truncating long tables.
pd.set_option('display.max_rows', None)
# Render floating-point cells with exactly one decimal place.
pd.options.display.float_format = '{:.1f}'.format

In [22]:
def extract_totals(fname):
    """Return the header and totals lines of a scorer output file.

    The file is read line by line with a keep/skip switch:

    * the switch starts in the "keep" state, so everything up to and
      including the first line starting with ``METRIC`` is kept;
    * a line starting with ``METRIC`` is kept (if the switch is on) and
      then turns the switch off;
    * a line starting with ``====== TOTALS =======`` turns the switch back
      on and is itself kept.

    Args:
        fname: Path of the text file to read.

    Returns:
        A list with the kept lines, in file order, line endings included.
    """
    kept = []
    keeping = True
    with open(fname) as handle:
        for row in handle:
            # A totals banner (re)opens the region that is kept.
            keeping = keeping or row.startswith('====== TOTALS =======')
            if keeping:
                kept.append(row)
                # A metric header is kept, but everything after it is
                # skipped until the next totals banner.
                keeping = not row.startswith('METRIC')
    return kept

In [23]:
# Matches an "Identification of Mentions" line and captures the three
# percentages (recall, precision, F1) as strings.
MENTION_RESULTS_REGEX = re.compile(
    r".*Identification of Mentions: Recall: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\s*Precision: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\s*F1: ([0-9.]+)%.*", re.DOTALL)

# Same shape as above, for a line introduced by "Coreference:" or "BLANC:".
COREF_RESULTS_REGEX = re.compile(
    r".*(?:Coreference|BLANC): Recall: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\s*Precision: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\s*F1: ([0-9.]+)%.*", re.DOTALL)

# Captures the metric name from a "METRIC <name>:" header line.
METRIC_REGEX = re.compile(
    r".*METRIC (.*):.*")



def get_results_dict(lines):
    """Collect per-metric scores from the given lines.

    A line matching ``METRIC_REGEX`` starts a new (empty) entry for that
    metric; later lines matching the mention or coreference pattern add
    their recall / precision / F1 to the entry of the most recent metric.

    Args:
        lines: Iterable of text lines.

    Returns:
        A dict mapping metric name to a dict with any of the keys
        ``mention_rec``, ``mention_prec``, ``mention_f1``, ``coref_rec``,
        ``coref_prec`` and ``coref_f1``. Values are the matched strings,
        not numbers.

    Raises:
        KeyError: If a score line is seen before any metric header.
    """
    # Each score pattern is paired with the keys its three groups fill.
    score_kinds = (
        (MENTION_RESULTS_REGEX.match,
         ('mention_rec', 'mention_prec', 'mention_f1')),
        (COREF_RESULTS_REGEX.match,
         ('coref_rec', 'coref_prec', 'coref_f1')),
    )

    parsed = {}
    current = ''
    for text in lines:
        header = METRIC_REGEX.match(text)
        if header:
            current = header.group(1)
            parsed[current] = {}

        for matcher, keys in score_kinds:
            hit = matcher(text)
            if hit:
                parsed[current].update(zip(keys, hit.groups()))

    return parsed

In [24]:
# Directory containing the scorer output files; the listing below shows names
# of the form <dataset>[_<pipeline>]_<tool>.txt.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
basepath = '/home/dafne/shared/FilterBubble/coref-resolution/pipeline/results_nosing'
# Displayed as the cell output to check which result files are available.
os.listdir(basepath)

['dev_coref.txt',
 'dev_e2e.txt',
 'dev_entity.txt',
 'news_coref.txt',
 'news_e2e.txt',
 'news_entity.txt',
 'news_goldmentions_coref.txt',
 'news_goldmentions_e2e.txt',
 'news_goldmentions_entity.txt',
 'news_goldparsementions_coref.txt',
 'news_goldparsementions_entity.txt']

In [25]:
# Parse every file in basepath; results maps (dataset, pipeline, tool) to the
# per-metric score dict produced by get_results_dict.
results = {}
for fname in os.listdir(basepath):
    # Text before the first '.' is split on '_': the first part is the
    # dataset, the last part is the tool.
    expname = fname.split('.')[0].split('_')
    dataset, tool = expname[0], expname[-1]
    # With more than two parts, the second one names the pipeline variant;
    # otherwise the run is labelled 'endtoend'.
    if len(expname) > 2:
        pipeline = expname[1]
    else:
        pipeline = 'endtoend'
    sout = extract_totals(os.path.join(basepath, fname))
    results[(dataset, pipeline, tool)] = get_results_dict(sout)

In [26]:
# Add a derived 'conll' entry per experiment: the unweighted mean of the muc,
# bcub and ceafe values, computed separately for recall, precision and F1.
for exp in results:
    exp_scores = results[exp]
    # Only experiments that have a 'ceafe' entry get a 'conll' entry.
    if 'ceafe' not in exp_scores:
        continue
    exp_scores['conll'] = {}
    for m in ['coref_rec', 'coref_prec', 'coref_f1']:
        # Scores are stored as strings, so convert before averaging.
        muc_val = float(exp_scores['muc'][m])
        bcub_val = float(exp_scores['bcub'][m])
        ceafe_val = float(exp_scores['ceafe'][m])
        exp_scores['conll'][m] = (muc_val + bcub_val + ceafe_val) / 3

In [27]:
# Flatten the nested results into one row per (dataset, pipeline, tool, metric);
# the inner score dicts become the columns.
df_results = pd.DataFrame.from_dict(
    {
        exp_key + (metric_name,): scores
        for exp_key, per_metric in results.items()
        for metric_name, scores in per_metric.items()
    },
    orient='index',
)

# Parsed scores are strings; convert each column to float.
for c in df_results.columns:
    df_results[c] = df_results[c].astype(float)

df_results = df_results.sort_index()
df_results

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,mention_rec,mention_prec,mention_f1,coref_rec,coref_prec,coref_f1
dev,endtoend,coref,bcub,31.8,35.3,33.5,9.8,11.8,10.7
dev,endtoend,coref,blanc,31.8,35.3,33.5,6.3,7.8,7.0
dev,endtoend,coref,ceafe,31.8,35.3,33.5,16.3,17.0,16.6
dev,endtoend,coref,ceafm,31.8,35.3,33.5,16.0,17.8,16.9
dev,endtoend,coref,conll,,,,11.4,12.6,11.9
dev,endtoend,coref,muc,31.8,35.3,33.5,8.0,9.1,8.5
dev,endtoend,e2e,bcub,25.1,37.3,30.0,16.3,26.9,20.3
dev,endtoend,e2e,blanc,25.1,37.3,30.0,12.7,19.9,15.4
dev,endtoend,e2e,ceafe,25.1,37.3,30.0,18.0,26.6,21.5
dev,endtoend,e2e,ceafm,25.1,37.3,30.0,21.3,31.6,25.4


In [28]:
# swaplevel() without arguments exchanges the two innermost index levels, so
# the index order becomes (dataset, pipeline, metric, tool); re-sort afterwards.
df_results_swapped = df_results.swaplevel().sort_index()

In [29]:
# Show mention identification scores, sorted on F1.
# Only the 'muc' rows are selected; the table above shows the same mention
# scores for every metric of an experiment.
# Rows are ordered by descending mention F1 first, then regrouped on the two
# outer index levels without re-sorting the remaining levels.
mention_scores = (
    df_results_swapped
    .loc[(slice(None), slice(None), 'muc', slice(None)),
         ['mention_rec', 'mention_prec', 'mention_f1']]
    .sort_values('mention_f1', ascending=False)
    .sort_index(level=[0, 1], sort_remaining=False)
)
mention_scores

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,mention_rec,mention_prec,mention_f1
dev,endtoend,muc,entity,50.3,66.1,57.1
dev,endtoend,muc,coref,31.8,35.3,33.5
dev,endtoend,muc,e2e,25.1,37.3,30.0
news,endtoend,muc,e2e,58.4,89.8,70.8
news,endtoend,muc,entity,48.5,76.8,59.4
news,endtoend,muc,coref,45.9,54.8,49.9
news,goldmentions,muc,e2e,90.1,100.0,94.8
news,goldmentions,muc,coref,57.4,68.1,62.3
news,goldmentions,muc,entity,48.7,77.4,59.8
news,goldparsementions,muc,coref,60.3,70.0,64.8


In [30]:
# Coreference scores: order by descending F1, regroup on the three outer index
# levels (without re-sorting the innermost one), then keep the coref columns.
sorted_scores = (
    df_results_swapped
    .sort_values('coref_f1', ascending=False)
    .sort_index(level=[0, 1, 2], sort_remaining=False)
    [['coref_rec', 'coref_prec', 'coref_f1']]
)
sorted_scores

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,coref_rec,coref_prec,coref_f1
dev,endtoend,bcub,entity,31.3,45.6,37.1
dev,endtoend,bcub,e2e,16.3,26.9,20.3
dev,endtoend,bcub,coref,9.8,11.8,10.7
dev,endtoend,blanc,entity,28.5,32.3,28.7
dev,endtoend,blanc,e2e,12.7,19.9,15.4
dev,endtoend,blanc,coref,6.3,7.8,7.0
dev,endtoend,ceafe,entity,31.9,44.8,37.2
dev,endtoend,ceafe,e2e,18.0,26.6,21.5
dev,endtoend,ceafe,coref,16.3,17.0,16.6
dev,endtoend,ceafm,entity,37.7,49.5,42.8
