In [1]:
import sys

# Folder that is put on the import path; presumably it holds the project
# modules imported by the cells below. The path is relative, so it is
# resolved against the kernel's current working directory.
UTILS_DIR = '../utils'

# Insert utils folder into path (only once, so that re-running this cell
# does not keep adding duplicate entries to sys.path).
if UTILS_DIR not in sys.path:
    sys.path.insert(1, UTILS_DIR)

## Load the default large spaCy model (`nl_core_news_lg`)

In [2]:
import spacy

# Notebook-wide settings read by the dataset sections below:
#   split    -> passed to prepare_data for TR-News, LGL and GeoWebNews
#   filtered -> passed to prepare_data and evaluate.evaluate
split = True
filtered = False

# Name of the spaCy pipeline package to load; also passed on to the
# evaluation and result-storing cells.
model_name = 'nl_core_news_lg'

# Only enable the ner tagger.
# "morphologizer" is disabled too: the Dutch pipelines ship one, and it is not
# covered by the other names in this list, so it would otherwise still run for
# every document even though the tok2vec component is switched off.
# The NER predictions themselves are not affected by this.
ner_pipeline = spacy.load(
    model_name,
    disable=["tok2vec", "morphologizer", "tagger", "parser", "attribute_ruler", "lemmatizer"],
)

## DutchPolicyDocs

In [3]:
import loading_functions

# Location of the DutchPolicyDocs file and the dataset label that is passed
# to the evaluation and result-storing cells.
file_path = '../../data/DutchPolicyDocs/DutchPolicyDocs.json'
dataset = 'DutchPolicyDocs'

# Load the data with the notebook-level `filtered` setting.
# NOTE(review): `split` is hard-coded to False here, while the other dataset
# sections pass the notebook-level `split` -- confirm this is intended.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=False,
)

### Make Predictions

In [4]:
import spaCy_predictions

# Run the NER-only spaCy pipeline over the data loaded in the
# DutchPolicyDocs section and keep the result for the evaluation cell.
processed_results = spaCy_predictions.make_predictions(
    ner_pipeline,
    data_all_toponyms,
)

1044it [00:02, 356.44it/s]


### Evaluation DPD

In [5]:
import evaluate

# All toponyms: compare the loaded annotations with the predictions.
# The call returns two results, unpacked as (strict, forgiving).
# In the recorded run it asked "Do you want to overwrite results? (y/n)",
# so this cell may wait for keyboard input.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Do you want to overwrite results? (y/n)y
Evaluation mode: strict
fp: 686 | tp: 3062 | fn: 2481
precision: 0.817 | recall: 0.552 | f-score: 0.659 | accuracy: 0.552
------------------------------------------------------------------------

Do you want to overwrite results? (y/n)y
Evaluation mode: forgiving
fp: 140 | tp: 3608 | fn: 1985
precision: 0.963 | recall: 0.645 | f-score: 0.773 | accuracy: 0.651
------------------------------------------------------------------------



In [6]:
import store_outcomes

# Store outcomes: pass the model name, the dataset label and both
# evaluation results (strict and forgiving) to store_outcome.
store_outcomes.store_outcome(
    model_name,
    dataset,
    strict,
    forgiving,
)

## TR-News

### Loading the dataset

In [7]:
import loading_functions

# File path of the TR-News dataset and the dataset label that is passed
# to the evaluation and result-storing cells.
file_path = '../../data/TR-News/TR-News.xml'
dataset = 'TR-News'

# Load the data with the notebook-level `filtered` and `split` settings.
# This rebinds `data_all_toponyms`, replacing the previous dataset.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Make Predictions

In [8]:
import spaCy_predictions

# Run the NER-only spaCy pipeline over the data loaded in the TR-News
# section and keep the result for the evaluation cell.
processed_results = spaCy_predictions.make_predictions(
    ner_pipeline,
    data_all_toponyms,
)

174it [00:01, 100.63it/s]


### Evaluation TR-News

In [9]:
import evaluate

# All toponyms: compare the TR-News annotations with the predictions.
# The call returns two results, unpacked as (strict, forgiving).
# In the recorded run it asked "Do you want to overwrite results? (y/n)",
# so this cell may wait for keyboard input.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Do you want to overwrite results? (y/n)y
Evaluation mode: strict
fp: 163 | tp: 534 | fn: 784
precision: 0.766 | recall: 0.405 | f-score: 0.530 | accuracy: 0.405
------------------------------------------------------------------------

Do you want to overwrite results? (y/n)y
Evaluation mode: forgiving
fp: 103 | tp: 594 | fn: 726
precision: 0.852 | recall: 0.450 | f-score: 0.589 | accuracy: 0.451
------------------------------------------------------------------------



In [10]:
import store_outcomes

# Store outcomes: pass the model name, the dataset label and both
# evaluation results (strict and forgiving) to store_outcome.
store_outcomes.store_outcome(
    model_name,
    dataset,
    strict,
    forgiving,
)

## LGL

### Loading the dataset

In [11]:
import loading_functions

# File path of the LGL dataset and the dataset label that is passed
# to the evaluation and result-storing cells.
file_path = '../../data/LGL/LGL.xml'
dataset = 'LGL'

# Load the data with the notebook-level `filtered` and `split` settings.
# This rebinds `data_all_toponyms`, replacing the previous dataset.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Make Predictions

In [12]:
import spaCy_predictions

# Run the NER-only spaCy pipeline over the data loaded in the LGL
# section and keep the result for the evaluation cell.
processed_results = spaCy_predictions.make_predictions(
    ner_pipeline,
    data_all_toponyms,
)

887it [00:07, 117.59it/s]


### Evaluation LGL

In [13]:
import evaluate

# All toponyms: compare the LGL annotations with the predictions.
# The call returns two results, unpacked as (strict, forgiving).
# In the recorded run it asked "Do you want to overwrite results? (y/n)",
# so this cell may wait for keyboard input.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Do you want to overwrite results? (y/n)y
Evaluation mode: strict
fp: 870 | tp: 1278 | fn: 3810
precision: 0.595 | recall: 0.251 | f-score: 0.353 | accuracy: 0.251
------------------------------------------------------------------------

Do you want to overwrite results? (y/n)y
Evaluation mode: forgiving
fp: 564 | tp: 1584 | fn: 3518
precision: 0.737 | recall: 0.310 | f-score: 0.437 | accuracy: 0.311
------------------------------------------------------------------------



In [14]:
import store_outcomes

# Store outcomes: pass the model name, the dataset label and both
# evaluation results (strict and forgiving) to store_outcome.
store_outcomes.store_outcome(
    model_name,
    dataset,
    strict,
    forgiving,
)

## GeoWebNews

### Loading the dataset

In [15]:
import loading_functions

# File path of the GWN (GeoWebNews) dataset and the dataset label that is
# passed to the evaluation and result-storing cells.
file_path = '../../data/GeoWebNews/GeoWebNews.xml'
dataset = 'GWN'

# NOTE: this rebinds the notebook-level `filtered` flag (False in the setup
# cell) to True, so every cell executed after this one sees True as well.
filtered = True

# Load the data with filtering switched on and the notebook-level `split`.
data_filtered_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Make Predictions

In [16]:
import spaCy_predictions

# Run the NER-only spaCy pipeline over the filtered GeoWebNews data and
# keep the result for the evaluation cell.
processed_predictions_filtered = spaCy_predictions.make_predictions(
    ner_pipeline,
    data_filtered_toponyms,
)

346it [00:03, 106.36it/s]


### Evaluation GWN

In [17]:
import evaluate

# Filtered toponyms: compare the GeoWebNews annotations with the predictions.
# The call returns two results, unpacked as (strict, forgiving).
# In the recorded run it asked "Do you want to overwrite results? (y/n)",
# so this cell may wait for keyboard input.
strict, forgiving = evaluate.evaluate(
    data_filtered_toponyms,
    processed_predictions_filtered,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Do you want to overwrite results? (y/n)y
Evaluation mode: strict
fp: 335 | tp: 726 | fn: 1872
precision: 0.684 | recall: 0.279 | f-score: 0.397 | accuracy: 0.279
------------------------------------------------------------------------

Do you want to overwrite results? (y/n)y
Evaluation mode: forgiving
fp: 213 | tp: 848 | fn: 1760
precision: 0.799 | recall: 0.325 | f-score: 0.462 | accuracy: 0.326
------------------------------------------------------------------------



In [18]:
import store_outcomes

# Store outcomes: pass the model name, the dataset label and both
# evaluation results (strict and forgiving) to store_outcome.
store_outcomes.store_outcome(
    model_name,
    dataset,
    strict,
    forgiving,
)