## read result files
These files have been generated with the [reference coreference scorer script](https://github.com/conll/reference-coreference-scorers).

In [12]:
import pandas as pd
import os 
import re
# Notebook display settings: never truncate the rows of a printed DataFrame,
# and render every float with exactly one decimal place.
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:.1f}'.format)

In [13]:
def extract_totals(fname):
    """Return the header and totals lines of a scorer output file.

    The file is scanned line by line with a capture flag that starts out
    enabled. A line beginning with ``METRIC`` is the last one kept before
    capturing is switched off; a line beginning with
    ``====== TOTALS =======`` switches it back on (and is itself kept).

    Args:
        fname: Path of the file to read.

    Returns:
        The kept lines, in file order, each with its original line ending.
    """
    kept = []
    # Start enabled so that everything up to and including the first
    # "METRIC" line ends up in the result.
    capturing = True
    with open(fname) as fin:
        for line in fin:
            capturing = capturing or line.startswith('====== TOTALS =======')
            if not capturing:
                continue
            kept.append(line)
            # The METRIC line closes the captured section; whatever follows
            # is dropped until the next TOTALS marker.
            capturing = not line.startswith('METRIC')
    return kept

In [14]:
# Matches an "Identification of Mentions" line and captures the recall,
# precision and F1 percentages, in that order.
MENTION_RESULTS_REGEX = re.compile(
    r".*Identification of Mentions: Recall: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\s*Precision: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\s*F1: ([0-9.]+)%.*",
    flags=re.DOTALL,
)

# Matches a "Coreference" or "BLANC" line and captures the recall, precision
# and F1 percentages, in that order.
COREF_RESULTS_REGEX = re.compile(
    r".*(?:Coreference|BLANC): Recall: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\s*Precision: \([0-9.]+ / [0-9.]+\) ([0-9.]+)%\s*F1: ([0-9.]+)%.*",
    flags=re.DOTALL,
)

# Matches a "METRIC <name>:" header line and captures the metric name.
METRIC_REGEX = re.compile(r".*METRIC (.*):.*")


def get_results_dict(lines):
    """Collect per-metric scores from scorer output lines.

    Each ``METRIC <name>:`` line opens a fresh, empty entry for ``<name>``;
    score lines that follow are stored under the most recently seen metric.

    Args:
        lines: Iterable of text lines to scan.

    Returns:
        A dict mapping metric name to a dict with any of the keys
        ``mention_rec``, ``mention_prec``, ``mention_f1``, ``coref_rec``,
        ``coref_prec`` and ``coref_f1``. Values are the matched percentage
        strings; they are not converted to numbers here.

    Raises:
        KeyError: If a score line is encountered before any metric header.
    """
    # Each score pattern is paired with the keys its three groups fill.
    score_patterns = (
        (MENTION_RESULTS_REGEX, ('mention_rec', 'mention_prec', 'mention_f1')),
        (COREF_RESULTS_REGEX, ('coref_rec', 'coref_prec', 'coref_f1')),
    )

    results = {}
    current_metric = ''
    for text in lines:
        header = METRIC_REGEX.match(text)
        if header is not None:
            current_metric = header.group(1)
            results[current_metric] = {}

        for pattern, keys in score_patterns:
            found = pattern.match(text)
            if found is None:
                continue
            for key, value in zip(keys, found.groups()):
                results[current_metric][key] = value

    return results

In [15]:
# Directory holding the scorer output files that the following cells parse.
basepath = '/home/dafne/shared/FilterBubble/coref-resolution/pipeline/results'
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run and file system.
sorted(os.listdir(basepath))

['dev_coref.txt',
 'dev_e2e.txt',
 'dev_entity.txt',
 'news_coref.txt',
 'news_e2e.txt',
 'news_entity.txt',
 'news_goldmentions_coref.txt',
 'news_goldmentions_e2e.txt',
 'news_goldmentions_entity.txt',
 'news_goldparsementions_coref.txt',
 'news_goldparsementions_entity.txt']

In [16]:
# Parse every file in `basepath` and store its scores under the key
# (dataset, pipeline, tool). The key is derived from the file name stem split
# on '_': the first part is the dataset, the last part is the tool, and the
# second part is the pipeline when there are more than two parts
# (otherwise the run is labelled 'endtoend').
results = {}
for fname in os.listdir(basepath):
    name_parts = fname.split('.')[0].split('_')
    dataset, tool = name_parts[0], name_parts[-1]
    if len(name_parts) > 2:
        pipeline = name_parts[1]
    else:
        pipeline = 'endtoend'
    totals_lines = extract_totals(os.path.join(basepath, fname))
    results[(dataset, pipeline, tool)] = get_results_dict(totals_lines)

In [17]:
# Add a 'conll' entry to every experiment that has 'ceafe' scores: for each of
# recall, precision and F1 it holds the plain average of the 'muc', 'bcub' and
# 'ceafe' coreference values.
for exp, per_metric in results.items():
    if 'ceafe' not in per_metric:
        continue
    conll = per_metric['conll'] = {}
    for m in ('coref_rec', 'coref_prec', 'coref_f1'):
        # Scores are stored as strings, so convert before averaging.
        total = sum(float(per_metric[name][m]) for name in ('muc', 'bcub', 'ceafe'))
        conll[m] = total / 3

In [18]:
# Flatten the nested results into one row per (experiment key..., metric):
# the metric name is appended to the experiment tuple to form the row index,
# and the score dict of that metric supplies the columns.
flat_rows = {}
for outer_key, inner_dict in results.items():
    for inner_key, values in inner_dict.items():
        flat_rows[outer_key + (inner_key,)] = values

df_results = pd.DataFrame.from_dict(flat_rows, orient='index')
# Parsed scores are strings (and some cells are missing); make every column
# numeric, then order the rows by index.
df_results = df_results.astype(float).sort_index()
df_results

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,mention_rec,mention_prec,mention_f1,coref_rec,coref_prec,coref_f1
dev,endtoend,coref,bcub,81.8,81.4,81.6,64.9,67.6,66.2
dev,endtoend,coref,blanc,81.8,81.4,81.6,51.8,51.4,51.6
dev,endtoend,coref,ceafe,81.8,81.4,81.6,65.8,63.3,64.5
dev,endtoend,coref,ceafm,81.8,81.4,81.6,61.5,61.1,61.3
dev,endtoend,coref,conll,,,,58.1,59.4,58.7
dev,endtoend,coref,muc,81.8,81.4,81.6,43.5,47.5,45.4
dev,endtoend,e2e,bcub,88.4,88.2,88.3,71.9,81.8,76.5
dev,endtoend,e2e,blanc,88.4,88.2,88.3,61.9,72.5,66.2
dev,endtoend,e2e,ceafe,88.4,88.2,88.3,80.3,69.9,74.7
dev,endtoend,e2e,ceafm,88.4,88.2,88.3,72.6,72.5,72.6


In [19]:
# Exchange the two innermost index levels (the defaults of swaplevel, spelled
# out here), then re-sort the rows on the new level order.
df_results_swapped = df_results.swaplevel(i=-2, j=-1)
df_results_swapped = df_results_swapped.sort_index()

In [20]:
# Show mention identification scores, sorted on f1.
# Only rows whose third index level is 'muc' are selected, which leaves one
# row per experiment (presumably the mention scores do not depend on the
# coreference metric -- verify against the full results table).
mention_scores = df_results_swapped.loc[
    pd.IndexSlice[:, :, 'muc', :],
    ['mention_rec', 'mention_prec', 'mention_f1'],
]
# Rank by F1 first, then regroup on the first two index levels without
# re-sorting the remaining levels.
mention_scores = (
    mention_scores
    .sort_values('mention_f1', ascending=False)
    .sort_index(level=[0, 1], sort_remaining=False)
)
mention_scores

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,mention_rec,mention_prec,mention_f1
dev,endtoend,muc,e2e,88.4,88.2,88.3
dev,endtoend,muc,coref,81.8,81.4,81.6
dev,endtoend,muc,entity,81.8,81.4,81.6
news,endtoend,muc,e2e,91.1,91.6,91.3
news,endtoend,muc,coref,84.0,86.2,85.0
news,endtoend,muc,entity,84.0,86.2,85.0
news,goldmentions,muc,e2e,91.2,100.0,95.4
news,goldmentions,muc,coref,86.6,89.3,88.0
news,goldmentions,muc,entity,86.6,89.3,88.0
news,goldparsementions,muc,coref,94.5,95.8,95.2


In [22]:
# Coreference scores: rank by F1 (highest first), regroup on the first three
# index levels without re-sorting the last one, and keep only the
# coreference columns.
sorted_scores = (
    df_results_swapped
    .sort_values('coref_f1', ascending=False)
    .sort_index(level=[0, 1, 2], sort_remaining=False)
)
sorted_scores = sorted_scores[['coref_rec', 'coref_prec', 'coref_f1']]
sorted_scores

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,coref_rec,coref_prec,coref_f1
dev,endtoend,bcub,e2e,71.9,81.8,76.5
dev,endtoend,bcub,entity,65.7,68.5,67.0
dev,endtoend,bcub,coref,64.9,67.6,66.2
dev,endtoend,blanc,e2e,61.9,72.5,66.2
dev,endtoend,blanc,coref,51.8,51.4,51.6
dev,endtoend,blanc,entity,48.1,41.5,43.9
dev,endtoend,ceafe,e2e,80.3,69.9,74.7
dev,endtoend,ceafe,entity,66.5,63.4,64.9
dev,endtoend,ceafe,coref,65.8,63.3,64.5
dev,endtoend,ceafm,e2e,72.6,72.5,72.6
