In [1]:
import sys

# Insert utils folder into path so modules stored there can be imported by
# the cells below. The membership check keeps this cell idempotent:
# re-running it does not stack duplicate entries on sys.path.
if '../utils' not in sys.path:
    sys.path.insert(1, '../utils')

## Loading Fine-tuned LaBSE model

In [2]:
from transformers import AutoModelForTokenClassification

# Identifiers for this run: the checkpoint directory and the short model name
# that is handed to the evaluation and result-storing helpers further down.
model_name = 'LaBSE'
model_path = '../models/LaBSE-fine-tuned-conll-2003'

# Flags forwarded to loading_functions.prepare_data; `filtered` is also
# passed on to evaluate.evaluate.
split = True
filtered = False

# Tag inventory used to size the classification head and to process predictions.
# NOTE(review): presumably index i must match class id i of the fine-tuned
# checkpoint — confirm against the training configuration.
label_list = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
    'B-MISC', 'I-MISC',
]

# Load the checkpoint with one output class per entry in label_list.
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(label_list),
)

## DutchPolicyDocs

### Loading the dataset

In [3]:
import loading_functions

# Source file and the dataset label used when evaluating and storing results.
dataset = 'DutchPolicyDocs'
file_path = '../../../data/DutchPolicyDocs/DutchPolicyDocs.json'

# Load the data with the notebook-level `filtered` / `split` settings.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer from the same directory as the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
import preparing_dataset

# Build the dataset object that is later handed to Trainer.predict.
DPD = preparing_dataset.prepare_dataset(
    data_all_toponyms,
    tokenizer,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [6]:
from transformers import DataCollatorForTokenClassification, Trainer

# Token-classification collator built from the tokenizer; the Trainer uses it
# to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer with default arguments; in this notebook it is only used to call
# predict().
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [7]:
import numpy as np

# Trainer.predict returns a named tuple (predictions, label_ids, metrics).
# Read the raw model outputs by field name instead of unpacking into `_`:
# binding `_` in a notebook shadows IPython's automatic last-output variable
# for the rest of the session.
raw_pred = test_trainer.predict(DPD).predictions

# Pick the highest-scoring class id along axis 2
# (presumably the label dimension of the logits — TODO confirm).
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [8]:
import process_predictions

# Post-process the predicted class ids using the dataset, the label names and
# the tokenizer.
processed_results = process_predictions.process_predictions(
    predictions,
    DPD,
    label_list,
    tokenizer,
)

### Evaluation DPD

In [9]:
# NOTE(review): presumably the project helper from ../utils rather than the
# Hugging Face `evaluate` package — confirm which one the import resolves to.
import evaluate

# All toponyms: score the processed predictions against the loaded data.
# The call returns two results, one for strict and one for forgiving mode.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 2904 | tp: 3346 | fn: 2197
precision: 0.535 | recall: 0.604 | f-score: 0.567 | accuracy: 0.604
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 2625 | tp: 3625 | fn: 1919
precision: 0.580 | recall: 0.654 | f-score: 0.615 | accuracy: 0.654
------------------------------------------------------------------------



In [11]:
import store_outcomes

# Store the strict and forgiving results for this model/dataset pair.
store_outcomes.store_outcome(
    model_name,
    dataset,
    strict,
    forgiving,
)

## TR-News

### Loading the dataset

In [12]:
import loading_functions

# TR-News source file and the dataset label used when evaluating and storing
# results.
dataset = 'TR-News'
file_path = '../../../data/TR-News/TR-News.xml'

# Load the data with the notebook-level `filtered` / `split` settings.
# This rebinds data_all_toponyms, replacing the previous section's data.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [13]:
from transformers import AutoTokenizer

# Loads the tokenizer from model_path again (same call as in the
# DutchPolicyDocs section).
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [14]:
import preparing_dataset

# Build the TR-News dataset object that is later handed to Trainer.predict.
TRN = preparing_dataset.prepare_dataset(
    data_all_toponyms,
    tokenizer,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [15]:
from transformers import DataCollatorForTokenClassification, Trainer

# Token-classification collator built from the tokenizer; the Trainer uses it
# to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Fresh Trainer with default arguments around the same model; only used to
# call predict() on the TR-News data.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [16]:
import numpy as np

# Trainer.predict returns a named tuple (predictions, label_ids, metrics).
# Read the raw model outputs by field name instead of unpacking into `_`:
# binding `_` in a notebook shadows IPython's automatic last-output variable
# for the rest of the session.
raw_pred = test_trainer.predict(TRN).predictions

# Pick the highest-scoring class id along axis 2
# (presumably the label dimension of the logits — TODO confirm).
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [17]:
import process_predictions

# Post-process the predicted class ids using the TR-News dataset, the label
# names and the tokenizer.
processed_results = process_predictions.process_predictions(
    predictions,
    TRN,
    label_list,
    tokenizer,
)

### Evaluation TR-News

In [18]:
import evaluate

# All toponyms: score the processed TR-News predictions against the loaded
# data. The call returns two results, one for strict and one for forgiving mode.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 328 | tp: 977 | fn: 341
precision: 0.749 | recall: 0.741 | f-score: 0.745 | accuracy: 0.741
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 275 | tp: 1030 | fn: 288
precision: 0.789 | recall: 0.781 | f-score: 0.785 | accuracy: 0.781
------------------------------------------------------------------------



In [19]:
import store_outcomes

# Store the strict and forgiving TR-News results for this model.
store_outcomes.store_outcome(
    model_name,
    dataset,
    strict,
    forgiving,
)

## LGL

### Loading the dataset

In [20]:
import loading_functions

# LGL source file and the dataset label used when evaluating and storing
# results.
dataset = 'LGL'
file_path = '../../../data/LGL/LGL.xml'

# Load the data with the notebook-level `filtered` / `split` settings.
# This rebinds data_all_toponyms, replacing the previous section's data.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [21]:
from transformers import AutoTokenizer

# Loads the tokenizer from model_path again (same call as in the earlier
# dataset sections).
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [22]:
import preparing_dataset

# Build the LGL dataset object that is later handed to Trainer.predict.
LGL = preparing_dataset.prepare_dataset(
    data_all_toponyms,
    tokenizer,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [23]:
from transformers import DataCollatorForTokenClassification, Trainer

# Token-classification collator built from the tokenizer; the Trainer uses it
# to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Fresh Trainer with default arguments around the same model; only used to
# call predict() on the LGL data.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [24]:
import numpy as np

# Trainer.predict returns a named tuple (predictions, label_ids, metrics).
# Read the raw model outputs by field name instead of unpacking into `_`:
# binding `_` in a notebook shadows IPython's automatic last-output variable
# for the rest of the session.
raw_pred = test_trainer.predict(LGL).predictions

# Pick the highest-scoring class id along axis 2
# (presumably the label dimension of the logits — TODO confirm).
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [25]:
import process_predictions

# Post-process the predicted class ids using the LGL dataset, the label names
# and the tokenizer.
processed_results = process_predictions.process_predictions(
    predictions,
    LGL,
    label_list,
    tokenizer,
)

### Evaluation LGL

In [26]:
import evaluate

# All toponyms: score the processed LGL predictions against the loaded data.
# The call returns two results, one for strict and one for forgiving mode.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 1915 | tp: 3381 | fn: 1707
precision: 0.638 | recall: 0.665 | f-score: 0.651 | accuracy: 0.665
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 1534 | tp: 3762 | fn: 1333
precision: 0.710 | recall: 0.738 | f-score: 0.724 | accuracy: 0.739
------------------------------------------------------------------------



In [27]:
import store_outcomes

# Store the strict and forgiving LGL results for this model.
store_outcomes.store_outcome(
    model_name,
    dataset,
    strict,
    forgiving,
)

## GeoWebNews

### Loading the dataset

In [28]:
import loading_functions

# GeoWebNews source file and the short dataset label used when evaluating and
# storing results.
dataset = 'GWN'
file_path = '../../../data/GeoWebNews/GeoWebNews.xml'

# NOTE(review): this rebinds the notebook-level `filtered` flag, which the
# configuration cell sets to False. Re-running an earlier dataset section
# after this cell would silently pick up True.
filtered = True

# Load the data with filtering switched on and the notebook-level `split`.
data_filtered_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [29]:
from transformers import AutoTokenizer

# Loads the tokenizer from model_path again (same call as in the earlier
# dataset sections).
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [30]:
import preparing_dataset

# Build the filtered GeoWebNews dataset object that is later handed to
# Trainer.predict.
GWN_filtered = preparing_dataset.prepare_dataset(
    data_filtered_toponyms,
    tokenizer,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [31]:
from transformers import DataCollatorForTokenClassification, Trainer

# Token-classification collator built from the tokenizer; the Trainer uses it
# to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Fresh Trainer with default arguments around the same model; only used to
# call predict() on the GeoWebNews data.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [32]:
import numpy as np

# Trainer.predict returns a named tuple (predictions, label_ids, metrics).
# Read the raw model outputs by field name instead of unpacking into `_`:
# binding `_` in a notebook shadows IPython's automatic last-output variable
# for the rest of the session.
raw_pred_filtered = test_trainer.predict(GWN_filtered).predictions

# Pick the highest-scoring class id along axis 2
# (presumably the label dimension of the logits — TODO confirm).
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [33]:
import process_predictions

# Post-process the predicted class ids using the filtered GeoWebNews dataset,
# the label names and the tokenizer.
processed_predictions_filtered = process_predictions.process_predictions(
    predictions_filtered,
    GWN_filtered,
    label_list,
    tokenizer,
)

### Evaluation GWN

In [34]:
import evaluate

# Filtered toponyms: score the processed GeoWebNews predictions against the
# filtered data. The call returns two results, one for strict and one for
# forgiving mode.
strict, forgiving = evaluate.evaluate(
    data_filtered_toponyms,
    processed_predictions_filtered,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 523 | tp: 1640 | fn: 958
precision: 0.758 | recall: 0.631 | f-score: 0.689 | accuracy: 0.631
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 462 | tp: 1701 | fn: 897
precision: 0.786 | recall: 0.655 | f-score: 0.715 | accuracy: 0.655
------------------------------------------------------------------------



In [35]:
import store_outcomes

# Store the strict and forgiving GeoWebNews results for this model.
store_outcomes.store_outcome(
    model_name,
    dataset,
    strict,
    forgiving,
)