In [1]:
import os
import sys
from collections import OrderedDict
from pathlib import Path

import pandas as pd
from yaml import safe_load

sys.path.append('..')
sys.path.append('../data/deft_corpus/evaluation/program')
from defx.util.official_evaluation import evaluate_subtasks

In [2]:
!hostname

serv-9209


In [3]:
# Root directory that holds one sub-directory per model configuration.
# The default is a path on the original author's machine; set the
# DEFX_MODELS_PATH environment variable to run the notebook elsewhere.
models_path = Path(os.environ.get('DEFX_MODELS_PATH',
                                  '/home/huebner/research/semeval_2020_task_6/search'))

In [4]:
!ls -l {models_path}

total 20
drwxr-xr-x 14 huebner deeplee 4096 26. Feb 17:55 repeated_joint_bert
drwxr-xr-x 12 huebner deeplee 4096 26. Feb 14:02 repeated_joint_bert__no_ner_embedder
drwxr-xr-x 12 huebner deeplee 4096 12. Feb 12:31 repeated_subtask2_changed_best_epoch
drwxr-xr-x 12 huebner deeplee 4096 14. Feb 11:38 repeated_subtask2_split_bert_coarse_f1
drwxr-xr-x 12 huebner deeplee 4096 14. Feb 11:39 repeated_subtask2_split_bert_modifier_f1


In [5]:
# `split` selects both the gold directory below and the `<split>_submission`
# prediction folder that the evaluation loop looks for in each run.
split = 'dev'
gold_dir = Path('../data/deft_split/raw/') / split

In [6]:
# Load the evaluation settings that are later handed to `evaluate_subtasks`
# as `eval_config`.
# NOTE(review): the file is named eval_test.yaml while `split` is 'dev' —
# confirm this is the intended config for the dev split.
config_path = Path('../data/deft_corpus/evaluation/program/configs/eval_test.yaml')
# `is_file()` is already False for a missing path, so one check covers both cases.
assert config_path.is_file(), 'Config not found'
config = safe_load(config_path.read_text())

In [8]:
def format_model_results(model_results):
    """Aggregate the subtask-2 scores of several runs into one table.

    Parameters
    ----------
    model_results : list of dict
        One entry per run. Each entry must have a ``'subtask2'`` key that maps
        a label to a dict of ``{metric: value}``.

    Returns
    -------
    pandas.DataFrame
        One row per label, in the order the labels appear in the last run,
        and the columns ``precision``, ``recall``, ``f1-score``, ``support``.
        Cells are strings of the form ``'<mean>+-<std>'``, where both numbers
        are multiplied by 100 and rounded to two decimals. The exception is
        ``support``: when it is identical in every run (std of 0) the plain
        mean is returned as a number instead.

    Raises
    ------
    ValueError
        If ``model_results`` is empty.

    Notes
    -----
    The standard deviation is the pandas default (sample std, ``ddof=1``), so
    a single run yields ``nan`` as the spread.
    """
    if not model_results:
        # pd.concat would raise a less helpful ValueError further down.
        raise ValueError('model_results is empty: there are no runs to aggregate')

    run_dfs = []
    for run in model_results:
        report = run['subtask2']
        # Flatten {label: {metric: value}} into {(label, metric): value}.
        flat = {(label, metric): value
                for label, metrics in report.items()
                for metric, value in metrics.items()}
        mux = pd.MultiIndex.from_tuples(list(flat.keys()))
        run_df = pd.DataFrame(list(flat.values()), index=mux)
        run_dfs.append(run_df)

    # One column per run, aligned on the (label, metric) index.
    model_df = pd.concat(run_dfs, axis=1)
    agg_model_df = pd.DataFrame({'mean': model_df.mean(axis=1),
                                 'std': model_df.std(axis=1)})

    def _format_cell(row):
        # `row.name` is the (label, metric) index entry of this row.
        metric = row.name[1]
        # Access by label: integer keys on a labelled Series are deprecated
        # as positional lookups and fail once they are treated as labels.
        if metric == 'support' and row['std'] == 0:
            return row['mean']
        # Every other cell is shown in percent, even when all runs agree, so
        # a column never mixes raw fractions with percent strings.
        return '+-'.join((row * 100).round(2).astype(str))

    joint_agg_model_df = agg_model_df.apply(_format_cell, axis=1)
    # unstack sorts the labels; remember the order of the last run to restore it.
    row_idx = list(run_df.index.get_level_values(0).unique())
    final_df = joint_agg_model_df.unstack(level=[1])
    final_df = (final_df
                .reindex(row_idx, axis=0)
                .reindex(['precision', 'recall', 'f1-score', 'support'], axis=1))
    return final_df

In [11]:
# Evaluate every run of every model directory and print one aggregated
# table per model.
results = []
# `iterdir` gives no ordering guarantee; sorting keeps the report order stable.
for model_dir in sorted(models_path.iterdir()):
    if not model_dir.is_dir():
        # A stray file next to the model directories cannot contain runs.
        continue
    model_name = model_dir.name.replace('repeated_', '').replace('subtask2_', '')
    model_results = []
    print('='*100)
    print(f'Evaluating {model_name}')
    for cwd, subdirs, curr_files in os.walk(model_dir):
        # Sorting in place makes os.walk descend in a fixed order.
        subdirs.sort()
        # A directory that contains `model.tar.gz` is treated as one run.
        if 'model.tar.gz' in curr_files:
            pred_dir = Path(cwd, f'{split}_submission')
            result = evaluate_subtasks([2],
                                       gold_dir=gold_dir,
                                       pred_dir=pred_dir,
                                       eval_config=config,
                                       quiet=True)
            model_results.append(result)
    if not model_results:
        # Aggregating zero runs raises inside format_model_results and would
        # abort the evaluation of all remaining models.
        print('No run with a model.tar.gz found, skipping')
        print('='*100)
        continue
    print(format_model_results(model_results))
    print('='*100)
    # NOTE(review): 'accuracy' is a hard-coded 0.0 placeholder, not a computed
    # score — confirm what should be stored here.
    results.append({'Model': model_name, 'accuracy': 0.0})

Evaluating changed_best_epoch
                             precision        recall      f1-score support
B-Term                     73.68+-1.49   71.52+-2.02   72.55+-0.87     640
I-Term                     71.83+-1.35   64.52+-2.42   67.94+-1.22    1040
B-Definition                65.0+-1.11   64.69+-1.69   64.83+-0.87     573
I-Definition               70.92+-1.51   72.87+-2.59   71.84+-0.92    8080
B-Alias-Term               63.47+-2.83   57.87+-5.24    60.4+-2.97      94
I-Alias-Term               45.66+-5.48   39.31+-6.58   41.61+-2.92      72
B-Referential-Definition  62.61+-11.64   47.22+-8.38   52.75+-4.92      18
I-Referential-Definition   64.59+-9.16   58.33+-8.78   60.49+-5.94      24
B-Referential-Term        39.57+-13.38  26.92+-11.61  31.25+-11.54      13
I-Referential-Term        38.75+-14.49   56.0+-12.65   44.6+-13.48       5
B-Qualifier               34.23+-11.64   38.75+-9.22    35.8+-9.68       8
I-Qualifier                37.13+-6.03   47.65+-7.69   41.57+-6.11    