## Create empty results file

```python
import pandas as pd

# Start from an empty frame that only carries the result column headers.
result_columns = [
    'model', 'dataset', 'mode', 'filtered',
    'precision', 'recall', 'f1', 'acc',
]
df = pd.DataFrame(columns=result_columns)
```

```python
# Write the (empty) frame to results.pkl in the working directory.
# NOTE: running this when results.pkl already holds results replaces that file.
df.to_pickle("results.pkl")
```

## Latest Results

In [1]:
import pandas as pd

# Load the stored results table from results.pkl.
# NOTE(review): unpickling can execute code embedded in the file, so only load
# a results.pkl that comes from a trusted source.
results = pd.read_pickle('results.pkl')
# Bare expression: rendered as the cell output when run in a notebook.
results

Unnamed: 0,model,dataset,mode,filtered,precision,recall,f1,acc
0,flair/ner-english,DutchPolicyDocs,strict,False,0.738226,0.243190,0.365857,0.243190
1,flair/ner-english,DutchPolicyDocs,forgiving,False,0.891566,0.292806,0.440834,0.293704
2,flair/ner-english,TR-News,strict,False,0.885305,0.749052,0.811499,0.749052
3,flair/ner-english,TR-News,forgiving,False,0.924731,0.781818,0.847291,0.782411
4,flair/ner-english,LGL,strict,False,0.787641,0.671384,0.724881,0.671384
...,...,...,...,...,...,...,...,...
75,XLM-R-ner,TR-News,forgiving,False,0.958378,0.676205,0.792936,0.681335
76,XLM-R-ner,LGL,strict,False,0.756106,0.444182,0.559614,0.444182
77,XLM-R-ner,LGL,forgiving,False,0.946136,0.551267,0.696638,0.555818
78,XLM-R-ner,GWN,strict,True,0.833561,0.470362,0.601378,0.470362


## Table 1: F1-scores per model and dataset, shown as strict (forgiving)

In [2]:
# Keep only the 'forgiving' rows, ordered by model and then by dataset.
is_forgiving = results['mode'] == 'forgiving'
results_f = results[is_forgiving].sort_values(['model', 'dataset'])

In [3]:
# One "<strict f1> (<forgiving f1>)" label per (model, dataset) pair.
f_scores_combined = []

# groupby yields the groups in sorted (model, dataset) key order, which is the
# order results_f was sorted into; the label list is attached to results_f
# afterwards, so that order must be kept.
for _, group in results.groupby(['model', 'dataset']):
    # Pick each score by its mode value rather than by row position, so the
    # label stays correct even if the rows in results.pkl are reordered.
    strict_f1 = group.loc[group['mode'] == 'strict', 'f1'].iloc[0].round(3)
    forgiving_f1 = group.loc[group['mode'] == 'forgiving', 'f1'].iloc[0].round(3)

    f_scores_combined.append(str(strict_f1) + ' (' + str(forgiving_f1) + ')')


In [4]:
# Attach the combined labels as a new column. The list is assigned positionally,
# so it must have one entry per row of results_f, in the same row order.
results_f['f1-combined'] = f_scores_combined

In [5]:
# Lay the combined F1 labels out as a model x dataset grid. The values are
# strings, so the aggregation joins them with a space instead of averaging.
pd.pivot_table(
    results_f,
    values='f1-combined',
    index=['model'],
    columns=['dataset'],
    aggfunc=lambda labels: ' '.join(labels),
)

dataset,DutchPolicyDocs,GWN,LGL,TR-News
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LaBSE,0.766 (0.951),0.689 (0.82),0.651 (0.807),0.745 (0.871)
XLM-R-ner,0.624 (0.765),0.601 (0.689),0.56 (0.697),0.669 (0.793)
en_core_web_lg,0.187 (0.262),0.56 (0.719),0.498 (0.706),0.697 (0.813)
en_core_web_trf,0.477 (0.572),0.587 (0.75),0.556 (0.774),0.718 (0.842)
flair/ner-dutch-large,0.866 (0.948),0.737 (0.758),0.69 (0.759),0.775 (0.816)
flair/ner-english,0.366 (0.441),0.752 (0.771),0.725 (0.793),0.811 (0.847)
flair/ner-english-large,0.88 (0.959),0.77 (0.788),0.745 (0.815),0.828 (0.873)
flair/ner-multi,0.788 (0.886),0.744 (0.77),0.696 (0.769),0.801 (0.844)
mBERT,0.728 (0.937),0.681 (0.799),0.642 (0.793),0.739 (0.857)
nl_core_news_lg,0.64 (0.773),0.366 (0.454),0.327 (0.428),0.504 (0.584)


### Add EUPEG results

In [6]:
# EUPEG scores, read from geoparsing-results-EUPEG.xlsx.
eupeg_results = pd.read_excel('geoparsing-results-EUPEG.xlsx')

# Restrict to the three datasets and the three geoparsers listed below.
dataset_mask = eupeg_results['dataset'].isin(['LGL', 'TR-News', 'GeoWebNews'])
geoparser_mask = eupeg_results['geoparser'].isin(['StanfordNER', 'DM_NLP+Pop', 'UniMelb+Pop'])
eupeg_subset = eupeg_results[dataset_mask & geoparser_mask]

# f-score with geoparsers as rows and datasets as columns.
pd.pivot_table(eupeg_subset, values='f-score', index='geoparser', columns='dataset')

dataset,GeoWebNews,LGL,TR-News
geoparser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DM_NLP+Pop,0.717,0.677,0.677
StanfordNER,0.739,0.677,0.803
UniMelb+Pop,0.722,0.673,0.715


## Table 2: averaged metrics on English corpora

In [7]:
# Every row except those of the DutchPolicyDocs dataset.
not_dutch_policy_docs = results['dataset'] != 'DutchPolicyDocs'
results_english_corpora = results[not_dutch_policy_docs]

In [8]:
# Split the English-corpora rows by evaluation mode.
english_mode = results_english_corpora['mode']
results_english_corpora_s = results_english_corpora[english_mode == 'strict']
results_english_corpora_f = results_english_corpora[english_mode == 'forgiving']

In [9]:
# Per-model mean of the numeric columns, rounded to 3 decimals.
# numeric_only=True keeps the text columns ('dataset', 'mode') out of the mean;
# without it pandas >= 2.0 raises a TypeError instead of dropping them.
# The metric columns get an '_s' (strict) or '_f' (forgiving) suffix so both
# frames can later sit side by side without name clashes.
model_metrics_avg_s = (
    results_english_corpora_s
    .groupby('model')
    .mean(numeric_only=True)
    .round(3)
    .rename(columns={'precision': 'precision_s', 'recall': 'recall_s', 'f1': 'f1_s', 'acc': 'acc_s'})
)

model_metrics_avg_f = (
    results_english_corpora_f
    .groupby('model')
    .mean(numeric_only=True)
    .round(3)
    .rename(columns={'precision': 'precision_f', 'recall': 'recall_f', 'f1': 'f1_f', 'acc': 'acc_f'})
)

In [10]:
# Strict and forgiving averages side by side, one row per model.
tmp = pd.concat([model_metrics_avg_s, model_metrics_avg_f], axis=1)

# Render every metric as "<strict> (<forgiving>)".
for metric in ('precision', 'recall', 'f1', 'acc'):
    strict_text = tmp[metric + '_s'].astype(str)
    forgiving_text = tmp[metric + '_f'].astype(str)
    tmp['avg_' + metric] = strict_text + ' (' + forgiving_text + ')'

# Keep only the rendered columns for display.
tmp = tmp[['avg_precision', 'avg_recall', 'avg_f1', 'avg_acc']]
tmp

# tmp.to_excel('models-avg-metrics.xlsx')

Unnamed: 0_level_0,avg_precision,avg_recall,avg_f1,avg_acc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LaBSE,0.715 (0.897),0.679 (0.78),0.695 (0.833),0.679 (0.853)
XLM-R-ner,0.798 (0.955),0.496 (0.588),0.61 (0.726),0.496 (0.593)
en_core_web_lg,0.725 (0.927),0.492 (0.626),0.585 (0.746),0.492 (0.627)
en_core_web_trf,0.759 (0.967),0.526 (0.668),0.62 (0.789),0.526 (0.669)
flair/ner-dutch-large,0.865 (0.917),0.639 (0.678),0.734 (0.778),0.639 (0.679)
flair/ner-english,0.862 (0.909),0.686 (0.723),0.763 (0.804),0.686 (0.724)
flair/ner-english-large,0.882 (0.932),0.704 (0.744),0.781 (0.825),0.704 (0.744)
flair/ner-multi,0.838 (0.892),0.677 (0.72),0.747 (0.795),0.677 (0.722)
mBERT,0.724 (0.898),0.658 (0.752),0.687 (0.816),0.658 (0.817)
nl_core_news_lg,0.637 (0.784),0.292 (0.357),0.399 (0.489),0.292 (0.357)


#### combine with EUPEG results

In [11]:
# Load the EUPEG scores and keep the same datasets / geoparsers used for the
# per-dataset f-score comparison.
eupeg_results = pd.read_excel('geoparsing-results-EUPEG.xlsx')
eupeg_subset = eupeg_results[(eupeg_results['dataset'].isin(['LGL', 'TR-News', 'GeoWebNews'])) &
                             (eupeg_results['geoparser'].isin(['StanfordNER', 'DM_NLP+Pop', 'UniMelb+Pop']))]

# Average the numeric columns per geoparser. numeric_only=True leaves the text
# 'dataset' column out of the mean; without it pandas >= 2.0 raises a TypeError.
eupeg_subset.groupby('geoparser').mean(numeric_only=True).round(3)

Unnamed: 0_level_0,precision,recall,f-score,accuracy
geoparser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DM_NLP+Pop,0.781,0.62,0.69,
StanfordNER,0.84,0.663,0.74,0.663
UniMelb+Pop,0.796,0.634,0.703,


## Table 3: model metrics on the DPD (DutchPolicyDocs) corpus

In [12]:
# Models that are reported for the DutchPolicyDocs dataset.
dpd_models = [
    'LaBSE',
    'XLM-R-ner',
    'flair/ner-multi',
    'flair/ner-dutch-large',
    'flair/ner-english-large',
    'mBERT',
    'nl_core_news_lg',
]

# Rows of those models on the DutchPolicyDocs dataset.
on_dpd = results['dataset'] == 'DutchPolicyDocs'
is_dpd_model = results['model'].isin(dpd_models)
results_DPD = results[on_dpd & is_dpd_model]

In [13]:
# Split the DutchPolicyDocs rows by evaluation mode.
dpd_mode = results_DPD['mode']
results_DPD_s = results_DPD[dpd_mode == 'strict']
results_DPD_f = results_DPD[dpd_mode == 'forgiving']

In [14]:
# Per-model mean of the numeric columns on DutchPolicyDocs, rounded to 3 decimals.
# numeric_only=True keeps the text columns ('dataset', 'mode') out of the mean;
# without it pandas >= 2.0 raises a TypeError instead of dropping them.
# The metric columns get an '_s' (strict) or '_f' (forgiving) suffix so both
# frames can later sit side by side without name clashes.
model_metrics_avg_s = (
    results_DPD_s
    .groupby('model')
    .mean(numeric_only=True)
    .round(3)
    .rename(columns={'precision': 'precision_s', 'recall': 'recall_s', 'f1': 'f1_s', 'acc': 'acc_s'})
)

model_metrics_avg_f = (
    results_DPD_f
    .groupby('model')
    .mean(numeric_only=True)
    .round(3)
    .rename(columns={'precision': 'precision_f', 'recall': 'recall_f', 'f1': 'f1_f', 'acc': 'acc_f'})
)

In [15]:
# Strict and forgiving values side by side, one row per model.
tmp = pd.concat([model_metrics_avg_s, model_metrics_avg_f], axis=1)

# Render every metric as "<strict> (<forgiving>)" under its plain name.
for metric in ('precision', 'recall', 'f1', 'acc'):
    strict_text = tmp[metric + '_s'].astype(str)
    forgiving_text = tmp[metric + '_f'].astype(str)
    tmp[metric] = strict_text + ' (' + forgiving_text + ')'

# Keep only the rendered columns for display.
tmp = tmp[['precision', 'recall', 'f1', 'acc']]
tmp

# tmp.to_excel('models-avg-metrics.xlsx')

Unnamed: 0_level_0,precision,recall,f1,acc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LaBSE,0.723 (0.946),0.815 (0.956),0.766 (0.951),0.815 (1.067)
XLM-R-ner,0.765 (0.951),0.527 (0.64),0.624 (0.765),0.527 (0.655)
flair/ner-dutch-large,0.87 (0.958),0.861 (0.938),0.866 (0.948),0.861 (0.948)
flair/ner-english-large,0.878 (0.96),0.883 (0.958),0.88 (0.959),0.883 (0.966)
flair/ner-multi,0.837 (0.948),0.745 (0.831),0.788 (0.886),0.745 (0.844)
mBERT,0.693 (0.942),0.766 (0.932),0.728 (0.937),0.766 (1.042)
nl_core_news_lg,0.794 (0.963),0.537 (0.645),0.64 (0.773),0.537 (0.651)


## Table 4: average metrics per corpus

In [16]:
# Split the full results table by evaluation mode.
all_modes = results['mode']
results_s = results[all_modes == 'strict']
results_f = results[all_modes == 'forgiving']

In [17]:
# Per-dataset mean of the numeric columns, rounded to 3 decimals.
# numeric_only=True keeps the text columns ('model', 'mode') out of the mean;
# without it pandas >= 2.0 raises a TypeError instead of dropping them.
# Only the strict frame is renamed ('_s' suffix); the forgiving frame keeps the
# plain metric names.
dataset_metrics_s = (
    results_s
    .groupby('dataset')
    .mean(numeric_only=True)
    .round(3)
    .rename(columns={'precision': 'precision_s', 'recall': 'recall_s', 'f1': 'f1_s', 'acc': 'acc_s'})
)

dataset_metrics_f = results_f.groupby('dataset').mean(numeric_only=True).round(3)



In [18]:
# Forgiving (plain names) and strict ('_s' suffix) averages side by side,
# one row per dataset.
tmp = pd.concat([dataset_metrics_f, dataset_metrics_s], axis=1)

# Render every metric as "<strict> (<forgiving>)".
for metric in ('precision', 'recall', 'f1', 'acc'):
    strict_text = tmp[metric + '_s'].astype(str)
    forgiving_text = tmp[metric].astype(str)
    tmp['avg_' + metric] = strict_text + ' (' + forgiving_text + ')'

# Keep only the rendered columns for display.
tmp = tmp[['avg_precision', 'avg_recall', 'avg_f1', 'avg_acc']]
tmp

# tmp.to_excel('dataset-avg-metrics.xlsx')

Unnamed: 0_level_0,avg_precision,avg_recall,avg_f1,avg_acc
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DutchPolicyDocs,0.767 (0.932),0.583 (0.676),0.632 (0.749),0.583 (0.703)
GWN,0.818 (0.936),0.543 (0.606),0.649 (0.732),0.543 (0.621)
LGL,0.701 (0.859),0.55 (0.654),0.609 (0.734),0.55 (0.668)
TR-News,0.823 (0.929),0.661 (0.73),0.729 (0.814),0.661 (0.746)
