In [1]:
import sys

# Insert utils folder into path so the project helper modules imported by
# later cells can be resolved (position 1 keeps the entry at index 0 first)
sys.path.insert(1, '../utils')

## Loading Fine-tuned LaBSE model

In [2]:
from transformers import AutoModelForTokenClassification

# Fine-tuned checkpoint and the tag set handed to the model / post-processing
model_path = '../models/LaBSE-fine-tuned-conll-2003'
label_list = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
    'B-MISC', 'I-MISC',
]

# Notebook-wide options forwarded to the loading and evaluation helpers below
split = True
filtered = False
model_name = 'LaBSE'

# num_labels is taken from the label list above
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(label_list),
)

## DutchPolicyDocs

### Loading the dataset

In [3]:
import loading_functions

dataset = 'DutchPolicyDocs'
file_path = '../../../data/DutchPolicyDocs/DutchPolicyDocs.json'

# Load the corpus using the notebook-wide `filtered` / `split` settings
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [4]:
from transformers import AutoTokenizer

# Tokenizer is read from the same checkpoint directory as the model
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
import preparing_dataset

# Convert the loaded DutchPolicyDocs data into the dataset object that is
# later passed to Trainer.predict (details live in the project helper)
DPD = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

  0%|          | 0/2 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [6]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer for batching
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer is used for inference only here; no training arguments are supplied
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [7]:
import numpy as np

# Trainer.predict returns a (predictions, label_ids, metrics) named tuple.
# Only the raw predictions are needed; reading the field by name avoids
# rebinding `_`, which IPython reserves for the previous cell's output.
raw_pred = test_trainer.predict(DPD).predictions

# Pick the highest-scoring label index along the last (third) axis
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [8]:
import process_predictions

# Post-process the argmax label indices with the project helper
processed_results = process_predictions.process_predictions(
    predictions,
    DPD,
    label_list,
    tokenizer,
)

### Evaluation DPD

In [9]:
import evaluate

# All toponyms: the helper prints the strict and forgiving metrics and
# returns one result object per evaluation mode
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 2809 | tp: 3346 | fn: 1659
precision: 0.544 | recall: 0.669 | f-score: 0.600 | accuracy: 0.604
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 2534 | tp: 3625 | fn: 1353
precision: 0.589 | recall: 0.728 | f-score: 0.651 | accuracy: 0.654
------------------------------------------------------------------------



In [10]:
# Display the strict-mode result returned by evaluate.evaluate (DutchPolicyDocs)
# NOTE(review): this prints the whole structure; consider showing a slice
strict

(['Noordholland',
  'Kanaal',
  'IJ',
  'Kanaal',
  'Stadsrivier',
  'Metropool',
  'Amsterdam',
  'A9',
  'Metro',
  'Amsterdam',
  'Alkmaar',
  '-',
  'Noord',
  'Smart Town Alkmaar',
  'Alkmaar',
  'Noord',
  '- Holland',
  'Metropool',
  'Amsterdam',
  'Noord',
  '-',
  'Holland',
  '- Noord',
  'Noord',
  'Alkmaar',
  'Metropoolregio',
  'Amsterdam',
  'Alkmaar',
  'Alkmaar',
  'Metropoolregio',
  'Noord',
  '-',
  'Holland - Noord',
  'PEN',
  'Alkmaar Noord',
  'Sportstad Alkmaar',
  'De Schermer',
  'De Schermer',
  'Woonboulevard',
  'Zgt',
  'XL Businesspark Twente',
  'Bedrijvenpark',
  'Twente',
  'Bedrijventerrein',
  'Twentepoort',
  'Bornsestraat',
  'Turfkade',
  'Dollegoor',
  'Noordbroek',
  'Buitenhaven',
  'A35',
  'Aadorp',
  'Het Gravenkwartier',
  'De',
  'Kop van Zuid',
  'De',
  'Almelose',
  'Exoosche Aa',
  'Lateraal',
  'Almelo',
  '-',
  'de Haandrik',
  'Almelo',
  'Koornmarkt',
  'Grotestraat',
  '-',
  'Kerkstraat',
  'Molenstraat',
  'Bi',
  '(',
  'e',

In [11]:
# Display the forgiving-mode result returned by evaluate.evaluate (DutchPolicyDocs)
# NOTE(review): this prints the whole structure; consider showing a slice
forgiving

(['Noordholland',
  'Kanaal',
  'Stadsrivier',
  'Metropool',
  'A9',
  'Metro',
  'Alkmaar',
  '-',
  'Noord',
  'Smart Town Alkmaar',
  'Alkmaar',
  'Noord',
  'Metropool',
  'Noord',
  '-',
  'Holland',
  '- Noord',
  'Noord',
  'Alkmaar',
  'Metropoolregio',
  'Alkmaar',
  'Metropoolregio',
  'Noord',
  '-',
  'Holland - Noord',
  'PEN',
  'Alkmaar Noord',
  'De Schermer',
  'De Schermer',
  'Woonboulevard',
  'Zgt',
  'XL Businesspark Twente',
  'Bedrijvenpark',
  'Twente',
  'Bedrijventerrein',
  'Twentepoort',
  'Bornsestraat',
  'Turfkade',
  'Dollegoor',
  'Noordbroek',
  'Buitenhaven',
  'A35',
  'Aadorp',
  'Het Gravenkwartier',
  'De',
  'De',
  'Almelose',
  'Lateraal',
  'Almelo',
  '-',
  'de Haandrik',
  'Almelo',
  'Grotestraat',
  '-',
  'Kerkstraat',
  'Molenstraat',
  'Bi',
  '(',
  'e',
  ') den Broecke',
  'Bloom',
  "'",
  'n Ven',
  'Almelo',
  '-',
  'De Haandrik',
  'De Woesten',
  'Bleskolk',
  'Almelo',
  'Almelo',
  'Almelos',
  'Kerkelanden',
  'Schelfhors

## TR-News

### Loading the dataset

In [12]:
import loading_functions

# Location and display name of the TR-News dataset
dataset = 'TR-News'
file_path = '../../../data/TR-News/TR-News.xml'

# Load the corpus using the notebook-wide `filtered` / `split` settings
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [13]:
from transformers import AutoTokenizer

# Reloaded from the same checkpoint path so this section does not rely on
# an earlier tokenizer cell having been run
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [14]:
import preparing_dataset

# Convert the loaded TR-News data into the dataset object that is later
# passed to Trainer.predict (details live in the project helper)
TRN = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [15]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer for batching
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer is used for inference only here; no training arguments are supplied
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [16]:
import numpy as np

# Trainer.predict returns a (predictions, label_ids, metrics) named tuple.
# Only the raw predictions are needed; reading the field by name avoids
# rebinding `_`, which IPython reserves for the previous cell's output.
raw_pred = test_trainer.predict(TRN).predictions

# Pick the highest-scoring label index along the last (third) axis
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [17]:
import process_predictions

# Post-process the argmax label indices with the project helper
processed_results = process_predictions.process_predictions(
    predictions,
    TRN,
    label_list,
    tokenizer,
)

### Evaluation TR-News

In [18]:
import evaluate

# All toponyms: the helper prints the strict and forgiving metrics and
# returns one result object per evaluation mode
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 316 | tp: 977 | fn: 323
precision: 0.756 | recall: 0.752 | f-score: 0.754 | accuracy: 0.741
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 263 | tp: 1030 | fn: 273
precision: 0.797 | recall: 0.790 | f-score: 0.794 | accuracy: 0.781
------------------------------------------------------------------------



In [19]:
# Display the strict-mode result returned by evaluate.evaluate (TR-News)
# NOTE(review): this prints the whole structure; consider showing a slice
strict

(['White House',
  'White House',
  'Interstate 64',
  'South Regional Jail',
  'U',
  '.',
  'S',
  '.',
  'Rose Garden',
  'Rose Garden',
  'Ronald Reagan Building',
  'U',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'U',
  '.',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'Southern District of Ohio',
  'U',
  '.',
  'S',
  '.',
  'San Antonio Four',
  'Bantam Superior Court',
  'Wooster St.',
  'Litchfield Superior Court',
  'Cumberland Farms',
  'South Main St',
  '-',
  'North West',
  'Twitter',
  'Facebook',
  'Kremlin',
  'Kremlin',
  'Downtown Eastside',
  'London City',
  'London City',
  'Central Anatolia',
  'Rhineland',
  '-',
  'Palatinate',
  'Film',
  'Dolby Theatre',
  'B',
  '.',
  'C',
  '.',
  'Macdonald',
  '-',
  'Cartier International Airport',
  'Montreal',
  '-',
  'Pierre Elliott Trude

In [20]:
# Display the forgiving-mode result returned by evaluate.evaluate (TR-News)
# NOTE(review): this prints the whole structure; consider showing a slice
forgiving

(['White House',
  'White House',
  'Interstate 64',
  'South Regional Jail',
  'U',
  '.',
  'S',
  '.',
  'Rose Garden',
  'Rose Garden',
  'Ronald Reagan Building',
  'U',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'U',
  '.',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'San Antonio Four',
  'Wooster St.',
  'Cumberland Farms',
  'South Main St',
  '-',
  'North West',
  'Twitter',
  'Facebook',
  'Kremlin',
  'Kremlin',
  'Downtown Eastside',
  'Rhineland',
  '-',
  'Palatinate',
  'Film',
  'Dolby Theatre',
  'B',
  '.',
  'C',
  '.',
  'Macdonald',
  '-',
  'Cartier International Airport',
  'Montreal',
  '-',
  'Pierre Elliott Trudeau International Airport',
  'Mont',
  '-',
  'Royal Avenue',
  'Sage House',
  "Children's Hospital of",
  'St',
  'Phoenix',
  'Phoenix',
  'Qu

## LGL

### Loading the dataset

In [21]:
import loading_functions

# Location and display name of the LGL dataset
dataset = 'LGL'
file_path = '../../../data/LGL/LGL.xml'

# Load the corpus using the notebook-wide `filtered` / `split` settings
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [22]:
from transformers import AutoTokenizer

# Reloaded from the same checkpoint path so this section does not rely on
# an earlier tokenizer cell having been run
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [23]:
import preparing_dataset

# Convert the loaded LGL data into the dataset object that is later passed
# to Trainer.predict (details live in the project helper)
LGL = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [24]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer for batching
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer is used for inference only here; no training arguments are supplied
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [25]:
import numpy as np

# Trainer.predict returns a (predictions, label_ids, metrics) named tuple.
# Only the raw predictions are needed; reading the field by name avoids
# rebinding `_`, which IPython reserves for the previous cell's output.
raw_pred = test_trainer.predict(LGL).predictions

# Pick the highest-scoring label index along the last (third) axis
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [26]:
import process_predictions

# Post-process the argmax label indices with the project helper
processed_results = process_predictions.process_predictions(
    predictions,
    LGL,
    label_list,
    tokenizer,
)

### Evaluation LGL

In [27]:
import evaluate

# All toponyms: the helper prints the strict and forgiving metrics and
# returns one result object per evaluation mode
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 1799 | tp: 3381 | fn: 1588
precision: 0.653 | recall: 0.680 | f-score: 0.666 | accuracy: 0.665
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 1423 | tp: 3762 | fn: 1224
precision: 0.726 | recall: 0.755 | f-score: 0.740 | accuracy: 0.739
------------------------------------------------------------------------



In [28]:
# Display the strict-mode result returned by evaluate.evaluate (LGL)
# NOTE(review): this prints the whole structure; consider showing a slice
strict

(['Kelleyland',
  'Orchard St.',
  'Cottonport Fire Station',
  'Warden',
  'St',
  '. James Youth Detention Center',
  'Pointe Coupe',
  'Gannett',
  'DeSoto',
  'R',
  '-',
  'Otter Tail County Road',
  'Highway',
  'Otter Tail',
  'Grant',
  'Grant /',
  'Wilkin',
  'Douglas County Road 4',
  'Highway 12',
  'Highway',
  '-',
  'Hesco',
  'Fargodome',
  'Sandbag Central',
  'Road',
  'County Road',
  'Road',
  'Douglas County Hospital',
  'St',
  'Ky',
  '.',
  'Conn',
  '.',
  'S',
  '.',
  'D',
  '.',
  'Ariz',
  '.',
  'Alexandria City',
  'Minnie Howard',
  'Durant Center',
  'Washington Latin School',
  'Washington Latin Public Charter School',
  'D. C.',
  'Washington Latin School',
  'Washington Latin School Charter School',
  'Northwest D. C.',
  'D',
  '.',
  'C',
  '.',
  'Washington Latin School',
  'East End',
  'Sub',
  '-',
  'Saharan Africa',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Misri',
  'Gulf',
  'Madrid',
  'Gul

In [29]:
# Display the forgiving-mode result returned by evaluate.evaluate (LGL)
# NOTE(review): this prints the whole structure; consider showing a slice
forgiving

(['Kelleyland',
  'Orchard St.',
  'Warden',
  'St',
  '. James Youth Detention Center',
  'Pointe Coupe',
  'Gannett',
  'DeSoto',
  'R',
  '-',
  'Highway',
  'Otter Tail',
  'Grant /',
  'Highway 12',
  'Highway',
  '-',
  'Hesco',
  'Fargodome',
  'Sandbag Central',
  'Road',
  'County Road',
  'Road',
  'St',
  'Ky',
  'Conn',
  'S',
  '.',
  'D',
  '.',
  'Ariz',
  'Minnie Howard',
  'Durant Center',
  'Washington Latin School',
  'Washington Latin Public Charter School',
  'Washington Latin School',
  'D',
  '.',
  'C',
  '.',
  'Washington Latin School',
  'East End',
  'Sub',
  '-',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Misri',
  'Gulf',
  'Madrid',
  'Gulf',
  'US',
  'US',
  'US',
  'Gulf',
  'US',
  'Gulf',
  'US',
  'St',
  ". John's Lutheran School",
  'Sheldon Peck Homestead',
  'Pleasant Lane',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Ill',
  'Houston',
  'City Hall',
  'E',
  'U Frame It',
  'Martha

## GeoWebNews

### Loading the dataset

In [31]:
import loading_functions

# Location and display name of the GeoWebNews (GWN) dataset
dataset = 'GWN'
file_path = '../../../data/GeoWebNews/GeoWebNews.xml'

# NOTE: this rebinds the notebook-wide `filtered` flag (False in the model
# cell). Re-running an earlier dataset section after this cell will pick up
# True unless the flag is reset first.
filtered = True

data_filtered_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [32]:
from transformers import AutoTokenizer

# Reloaded from the same checkpoint path so this section does not rely on
# an earlier tokenizer cell having been run
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [33]:
import preparing_dataset

# Convert the loaded (filtered) GeoWebNews data into the dataset object that
# is later passed to Trainer.predict (details live in the project helper)
GWN_filtered = preparing_dataset.prepare_dataset(data_filtered_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [34]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer for batching
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer is used for inference only here; no training arguments are supplied
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [35]:
import numpy as np

# Trainer.predict returns a (predictions, label_ids, metrics) named tuple.
# Only the raw predictions are needed; reading the field by name avoids
# rebinding `_`, which IPython reserves for the previous cell's output.
raw_pred_filtered = test_trainer.predict(GWN_filtered).predictions

# Pick the highest-scoring label index along the last (third) axis
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [38]:
import process_predictions

# Post-process the argmax label indices with the project helper
processed_predictions_filtered = process_predictions.process_predictions(
    predictions_filtered,
    GWN_filtered,
    label_list,
    tokenizer,
)

### Evaluation GWN

In [39]:
import evaluate

# Filtered toponyms: the helper prints the strict and forgiving metrics and
# returns one result object per evaluation mode
strict, forgiving = evaluate.evaluate(
    data_filtered_toponyms,
    processed_predictions_filtered,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 502 | tp: 1640 | fn: 857
precision: 0.766 | recall: 0.657 | f-score: 0.707 | accuracy: 0.631
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 441 | tp: 1701 | fn: 799
precision: 0.794 | recall: 0.680 | f-score: 0.733 | accuracy: 0.655
------------------------------------------------------------------------



In [40]:
# Display the strict-mode result returned by evaluate.evaluate (GeoWebNews, filtered)
# NOTE(review): this prints the whole structure; consider showing a slice
strict

(['Bernard Xavier Philippe de Marigny de Mandeville',
  'Louisiana Purchase',
  'Royal',
  'Desire',
  'Champs',
  '-',
  'Élysées',
  'Smoky Mary',
  'Jamison',
  'Norbury',
  'Granville',
  'Somerville',
  'Jamison',
  'Norbury',
  'Norbury',
  'Stonewall',
  'Club',
  'Buckland',
  'Vienna',
  'Fairfax',
  '-',
  'GMU',
  'Orange',
  'Turtle Creek Cir',
  'Va',
  'Kremlin',
  'Pope',
  'St',
  ". Peter's Basilica",
  'White House',
  'Below Deck Mediterranean',
  'Below Deck',
  'Manila',
  'Manila',
  'Commonwealth of Independent States',
  'CIS',
  'CIS',
  'CIS',
  'North America',
  'Gate 10',
  'Gate 10',
  'Washington',
  'D',
  '.',
  'C',
  '.',
  'U',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Kremlin',
  '-',
  '-',
  'Tamil',
  'Nadu',
  'States',
  'State of Tamil Nadu',
  'No',
  '. 2 Detention Centre',
  'No',
  '. 2 Detention Centre',
  'Kyjev',
  'Eva',
  "'",
  's Village',
  'Eva',
  "'",
  's Village',
  'New Jersey,',
  'Bakgatla',
  '-',
  'ba',


In [41]:
# Display the forgiving-mode result returned by evaluate.evaluate (GeoWebNews, filtered)
# NOTE(review): this prints the whole structure; consider showing a slice
forgiving

(['Bernard Xavier Philippe de Marigny de Mandeville',
  'Desire',
  'Champs',
  '-',
  'Élysées',
  'Smoky Mary',
  'Jamison',
  'Norbury',
  'Granville',
  'Somerville',
  'Jamison',
  'Norbury',
  'Norbury',
  'Stonewall',
  'Buckland',
  'Vienna',
  'Fairfax',
  '-',
  'GMU',
  'Orange',
  'Turtle Creek Cir',
  'Va',
  'Kremlin',
  'Pope',
  'St',
  'White House',
  'Below Deck',
  'Manila',
  'Manila',
  'Commonwealth of Independent States',
  'CIS',
  'CIS',
  'CIS',
  'North America',
  'Gate 10',
  'Gate 10',
  'Washington',
  'D',
  '.',
  'C',
  '.',
  'U',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Kremlin',
  '-',
  '-',
  'Tamil',
  'States',
  'No',
  'No',
  '. 2 Detention Centre',
  'Eva',
  "'",
  's Village',
  'Eva',
  "'",
  's Village',
  'Bakgatla',
  '-',
  'ba',
  '-',
  'Kgafela',
  'Bakgatla',
  'Washington',
  'D',
  '.',
  'C',
  '.',
  'U',
  'Church',
  'U',
  'Umuanunu',
  'Benue',
  'Benue',
  'Market at',
  'FATA',
  'Pine',
  'Processing