## Create empty results file

```python
# Empty results table: four identifying columns followed by four metric columns.
result_columns = ['model', 'dataset', 'mode', 'filtered',
                  'precision', 'recall', 'f1', 'acc']
df = pd.DataFrame(columns=result_columns)
```

```python
# Write the (empty) frame to results.pkl, the file the "Latest Results" cell reads back.
# NOTE(review): this presumably replaces an existing results.pkl -- confirm before re-running.
df.to_pickle("results.pkl")
```

## Latest Results

In [1]:
import pandas as pd

# Load the stored results table. Unpickling is only safe for files you trust.
results = pd.read_pickle('results.pkl')
# Bare expression so the notebook renders the frame as the cell output.
results

Unnamed: 0,model,dataset,mode,filtered,precision,recall,f1,acc
0,flair/ner-english,DutchPolicyDocs,strict,False,0.547449,0.180047,0.270975,0.180047
1,flair/ner-english,DutchPolicyDocs,forgiving,False,0.646736,0.212624,0.320033,0.212701
2,flair/ner-english,TR-News,strict,False,0.885305,0.749052,0.811499,0.749052
3,flair/ner-english,TR-News,forgiving,False,0.912186,0.771797,0.836140,0.771797
4,flair/ner-english,LGL,strict,False,0.787641,0.671384,0.724881,0.671384
...,...,...,...,...,...,...,...,...
75,XLM-R-ner,TR-News,forgiving,False,0.906083,0.644158,0.752993,0.644158
76,XLM-R-ner,LGL,strict,False,0.756106,0.444182,0.559614,0.444182
77,XLM-R-ner,LGL,forgiving,False,0.879893,0.516903,0.651232,0.516903
78,XLM-R-ner,GWN,strict,True,0.833561,0.470362,0.601378,0.470362


## Table 1: f1-scores

In [3]:
# Forgiving-mode rows only, ordered by model and then by dataset.
is_forgiving = results['mode'] == 'forgiving'
results_f = results[is_forgiving].sort_values(by=['model', 'dataset'])

In [4]:
# Build one "strict (forgiving)" f1 label per (model, dataset) pair.
# groupby yields its groups in sorted key order, i.e. the same (model, dataset)
# order `results_f` is sorted in; the cell that assigns this list as a column
# depends on that alignment.
f_scores_combined = []

for key, group in results.groupby(['model', 'dataset']):
    # Look the two scores up by their `mode` value instead of by row position,
    # so the label cannot silently swap strict/forgiving when the stored rows
    # are ordered differently.
    f1_text = {}
    for mode in ('strict', 'forgiving'):
        f1_values = group.loc[group['mode'] == mode, 'f1']
        if f1_values.empty:
            raise ValueError("no '{}' row for model/dataset {}".format(mode, key))
        # First matching row, rounded to three decimals and rendered as text.
        f1_text[mode] = f1_values.iloc[0].round(3).astype(str)

    f_scores_combined.append(f1_text['strict'] + ' (' + f1_text['forgiving'] + ')')


In [5]:
# Positional assignment: the list must hold exactly one label per row of
# `results_f`, in the same (model, dataset) order the frame is sorted in.
results_f['f1-combined'] = f_scores_combined

In [6]:
def _join_labels(labels):
    """Join the labels that fall into one pivot cell, separated by spaces."""
    return ' '.join(labels)


# Wide layout: one row per model, one column per dataset, cells hold the
# combined f1 labels.
pd.pivot_table(
    results_f,
    index=['model'],
    columns=['dataset'],
    values='f1-combined',
    aggfunc=_join_labels,
)

dataset,DutchPolicyDocs,GWN,LGL,TR-News
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LaBSE,0.567 (0.615),0.689 (0.715),0.651 (0.724),0.745 (0.785)
XLM-R-ner,0.467 (0.539),0.601 (0.656),0.56 (0.651),0.669 (0.753)
en_core_web_lg,0.141 (0.192),0.56 (0.712),0.498 (0.699),0.697 (0.81)
en_core_web_trf,0.35 (0.41),0.587 (0.745),0.556 (0.77),0.718 (0.837)
flair/ner-dutch-large,0.645 (0.697),0.737 (0.75),0.69 (0.747),0.775 (0.803)
flair/ner-english,0.271 (0.32),0.725 (0.78),0.725 (0.78),0.811 (0.836)
flair/ner-english-large,0.652 (0.703),0.77 (0.781),0.745 (0.804),0.828 (0.86)
flair/ner-multi,0.584 (0.647),0.744 (0.759),0.696 (0.758),0.801 (0.833)
mBERT,0.543 (0.59),0.681 (0.699),0.642 (0.709),0.739 (0.774)
nl_core_news_lg,0.472 (0.557),0.366 (0.432),0.327 (0.389),0.504 (0.565)


### Add EUPEG results

In [7]:
# Read the EUPEG results spreadsheet.
eupeg_results = pd.read_excel('geoparsing-results-EUPEG.xlsx')

# Keep only the three datasets and three geoparsers listed below.
on_selected_dataset = eupeg_results['dataset'].isin(['LGL', 'TR-News', 'GeoWebNews'])
is_selected_geoparser = eupeg_results['geoparser'].isin(['StanfordNER', 'DM_NLP+Pop', 'UniMelb+Pop'])
eupeg_subset = eupeg_results[on_selected_dataset & is_selected_geoparser]

# f-score laid out with geoparsers as rows and datasets as columns.
pd.pivot_table(eupeg_subset, index='geoparser', columns='dataset', values='f-score')

dataset,GeoWebNews,LGL,TR-News
geoparser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DM_NLP+Pop,0.717,0.677,0.677
StanfordNER,0.739,0.677,0.803
UniMelb+Pop,0.722,0.673,0.715


## Table 2: averaged metrics on English corpora

In [8]:
# Every dataset except DutchPolicyDocs.
not_dutch_policy_docs = results['dataset'].ne('DutchPolicyDocs')
results_english_corpora = results[not_dutch_policy_docs]

In [9]:
# Split the English-corpora rows by evaluation mode.
english_mode = results_english_corpora['mode']
results_english_corpora_s = results_english_corpora[english_mode.eq('strict')]
results_english_corpora_f = results_english_corpora[english_mode.eq('forgiving')]

In [10]:
# Per-model mean of each metric over the English corpora, rounded to 3 decimals.
# The metric columns are selected explicitly: the frame also carries the text
# columns `dataset` and `mode`, and averaging those raises a TypeError on
# pandas >= 2.0 (older versions silently dropped them).
metric_columns = ['precision', 'recall', 'f1', 'acc']

# Strict-mode averages; columns get an `_s` suffix.
model_metrics_avg_s = (
    results_english_corpora_s
    .groupby('model')[metric_columns]
    .mean()
    .round(3)
    .rename(columns={name: name + '_s' for name in metric_columns})
)

# Forgiving-mode averages; columns get an `_f` suffix.
model_metrics_avg_f = (
    results_english_corpora_f
    .groupby('model')[metric_columns]
    .mean()
    .round(3)
    .rename(columns={name: name + '_f' for name in metric_columns})
)

In [11]:
# Strict and forgiving averages side by side, one row per model.
tmp = pd.concat([model_metrics_avg_s, model_metrics_avg_f], axis=1)

# For every metric, build an "avg_<metric>" text column of the form
# "<strict> (<forgiving>)".
for metric in ('precision', 'recall', 'f1', 'acc'):
    strict_text = tmp[metric + '_s'].astype(str)
    forgiving_text = tmp[metric + '_f'].astype(str)
    tmp['avg_' + metric] = strict_text + ' (' + forgiving_text + ')'

# Keep only the combined text columns and display them.
tmp = tmp[['avg_precision', 'avg_recall', 'avg_f1', 'avg_acc']]
tmp

# Optional export, left disabled:
# tmp.to_excel('models-avg-metrics.xlsx')

Unnamed: 0_level_0,avg_precision,avg_recall,avg_f1,avg_acc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LaBSE,0.715 (0.762),0.679 (0.725),0.695 (0.741),0.679 (0.725)
XLM-R-ner,0.798 (0.898),0.496 (0.558),0.61 (0.687),0.496 (0.558)
en_core_web_lg,0.725 (0.919),0.492 (0.621),0.585 (0.74),0.492 (0.621)
en_core_web_trf,0.759 (0.96),0.526 (0.664),0.62 (0.784),0.526 (0.664)
flair/ner-dutch-large,0.865 (0.903),0.639 (0.669),0.734 (0.767),0.639 (0.669)
flair/ner-english,0.82 (0.869),0.697 (0.739),0.754 (0.799),0.697 (0.739)
flair/ner-english-large,0.882 (0.919),0.704 (0.735),0.781 (0.815),0.704 (0.735)
flair/ner-multi,0.838 (0.878),0.677 (0.71),0.747 (0.783),0.677 (0.71)
mBERT,0.724 (0.765),0.658 (0.696),0.687 (0.727),0.658 (0.697)
nl_core_news_lg,0.637 (0.739),0.292 (0.338),0.399 (0.462),0.292 (0.338)


#### combine with EUPEG results

In [12]:
# Read the EUPEG results spreadsheet and keep the three datasets and three
# geoparsers listed below.
eupeg_results = pd.read_excel('geoparsing-results-EUPEG.xlsx')
eupeg_subset = eupeg_results[(eupeg_results['dataset'].isin(['LGL', 'TR-News', 'GeoWebNews'])) &
                             (eupeg_results['geoparser'].isin(['StanfordNER', 'DM_NLP+Pop', 'UniMelb+Pop']))]

# Per-geoparser mean of the numeric columns, rounded to 3 decimals.
# `numeric_only=True` keeps the text column `dataset` out of the mean, which
# would otherwise raise a TypeError on pandas >= 2.0.
eupeg_subset.groupby('geoparser').mean(numeric_only=True).round(3)

Unnamed: 0_level_0,precision,recall,f-score,accuracy
geoparser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DM_NLP+Pop,0.781,0.62,0.69,
StanfordNER,0.84,0.663,0.74,0.663
UniMelb+Pop,0.796,0.634,0.703,


## Table 3: model metrics on the DPD corpus

In [12]:
# Models kept for the DutchPolicyDocs table.
dpd_models = [
    'LaBSE',
    'XLM-R-ner',
    'flair/ner-multi',
    'flair/ner-dutch-large',
    'flair/ner-english-large',
    'mBERT',
    'nl_core_news_lg',
]

# DutchPolicyDocs rows for those models only.
on_dutch_policy_docs = results['dataset'] == 'DutchPolicyDocs'
is_dpd_model = results['model'].isin(dpd_models)
results_DPD = results[on_dutch_policy_docs & is_dpd_model]

In [14]:
# Split the DutchPolicyDocs rows by evaluation mode.
dpd_mode = results_DPD['mode']
results_DPD_s = results_DPD[dpd_mode.eq('strict')]
results_DPD_f = results_DPD[dpd_mode.eq('forgiving')]

In [16]:
# Per-model mean of each metric on the DutchPolicyDocs rows, rounded to 3 decimals.
# The metric columns are selected explicitly: the frame also carries the text
# columns `dataset` and `mode`, and averaging those raises a TypeError on
# pandas >= 2.0 (older versions silently dropped them).
metric_columns = ['precision', 'recall', 'f1', 'acc']

# Strict-mode values; columns get an `_s` suffix.
model_metrics_avg_s = (
    results_DPD_s
    .groupby('model')[metric_columns]
    .mean()
    .round(3)
    .rename(columns={name: name + '_s' for name in metric_columns})
)

# Forgiving-mode values; columns get an `_f` suffix.
model_metrics_avg_f = (
    results_DPD_f
    .groupby('model')[metric_columns]
    .mean()
    .round(3)
    .rename(columns={name: name + '_f' for name in metric_columns})
)

In [17]:
# Strict and forgiving values side by side, one row per model.
tmp = pd.concat([model_metrics_avg_s, model_metrics_avg_f], axis=1)

# For every metric, build a text column of the form "<strict> (<forgiving>)".
for metric in ('precision', 'recall', 'f1', 'acc'):
    strict_text = tmp[metric + '_s'].astype(str)
    forgiving_text = tmp[metric + '_f'].astype(str)
    tmp[metric] = strict_text + ' (' + forgiving_text + ')'

# Keep only the combined text columns and display them.
tmp = tmp[['precision', 'recall', 'f1', 'acc']]
tmp

# Optional export, left disabled:
# tmp.to_excel('models-avg-metrics.xlsx')

Unnamed: 0_level_0,precision,recall,f1,acc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LaBSE,0.535 (0.58),0.604 (0.654),0.567 (0.615),0.604 (0.654)
XLM-R-ner,0.572 (0.661),0.394 (0.455),0.467 (0.539),0.394 (0.455)
flair/ner-dutch-large,0.648 (0.7),0.642 (0.694),0.645 (0.697),0.642 (0.694)
flair/ner-english-large,0.65 (0.701),0.654 (0.705),0.652 (0.703),0.654 (0.705)
flair/ner-multi,0.62 (0.686),0.553 (0.611),0.584 (0.647),0.553 (0.611)
mBERT,0.517 (0.562),0.572 (0.621),0.543 (0.59),0.572 (0.621)
nl_core_news_lg,0.585 (0.691),0.396 (0.467),0.472 (0.557),0.396 (0.467)


## Table 4: average metrics per corpus

In [2]:
# One frame per evaluation mode, across all models and datasets.
# Note: `results_f` is also the name of the Table 1 frame; this rebinds it.
mode_column = results['mode']
results_s = results[mode_column.eq('strict')]
results_f = results[mode_column.eq('forgiving')]

In [3]:
# Per-dataset mean of each metric over all models, rounded to 3 decimals.
# The metric columns are selected explicitly: the frame also carries the text
# columns `model` and `mode`, and averaging those raises a TypeError on
# pandas >= 2.0 (older versions silently dropped them).
metric_columns = ['precision', 'recall', 'f1', 'acc']

# Strict-mode averages; columns get an `_s` suffix.
dataset_metrics_s = (
    results_s
    .groupby('dataset')[metric_columns]
    .mean()
    .round(3)
    .rename(columns={name: name + '_s' for name in metric_columns})
)

# Forgiving-mode averages keep the plain metric names.
dataset_metrics_f = results_f.groupby('dataset')[metric_columns].mean().round(3)



In [4]:
# Forgiving (plain names) and strict (`_s` suffix) averages side by side,
# one row per dataset.
tmp = pd.concat([dataset_metrics_f, dataset_metrics_s], axis=1)

# For every metric, build an "avg_<metric>" text column of the form
# "<strict> (<forgiving>)".
for metric in ('precision', 'recall', 'f1', 'acc'):
    strict_text = tmp[metric + '_s'].astype(str)
    forgiving_text = tmp[metric].astype(str)
    tmp['avg_' + metric] = strict_text + ' (' + forgiving_text + ')'

# Keep only the combined text columns and display them.
tmp = tmp[['avg_precision', 'avg_recall', 'avg_f1', 'avg_acc']]
tmp

# Optional export, left disabled:
# tmp.to_excel('dataset-avg-metrics.xlsx')

Unnamed: 0_level_0,avg_precision,avg_recall,avg_f1,avg_acc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DutchPolicyDocs,0.569 (0.65),0.433 (0.483),0.469 (0.527),0.433 (0.483)
GWN,0.805 (0.88),0.547 (0.593),0.646 (0.703),0.547 (0.593)
LGL,0.701 (0.815),0.55 (0.632),0.609 (0.703),0.55 (0.632)
TR-News,0.823 (0.889),0.661 (0.711),0.729 (0.786),0.661 (0.711)
