In [1]:
import sys

# Make the project's helper modules importable from this notebook.
# Presumably ../utils holds loading_functions, preparing_dataset,
# process_predictions and evaluate, which later cells import — confirm.
# Inserting at position 1 (not 0) leaves the first sys.path entry ahead of it.
sys.path.insert(1, '../utils')

## Loading Fine-tuned mBERT model and predictions for non-labelled dataset

In [2]:
from transformers import AutoModelForTokenClassification

# --- Run configuration shared by every dataset section of this notebook ---
model_name = 'mBERT'
model_path = '../models/ner-multilingual-bert-fine-tuned-conll-2003'

# Tag inventory handed to process_predictions later on.
# Presumably index-aligned with the model's output classes — confirm.
label_list = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
    'B-MISC', 'I-MISC',
]

# Flags forwarded to loading_functions.prepare_data (both) and
# evaluate.evaluate (`filtered` only) in the cells below.
split = True
filtered = False

# Load the fine-tuned checkpoint with one output class per entry in label_list.
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(label_list),
)

## DutchPolicyDocs

In [3]:
import loading_functions

# DutchPolicyDocs corpus (JSON). `dataset` is the label passed to
# evaluate.evaluate in the evaluation cell of this section.
dataset = 'DutchPolicyDocs'
file_path = '../../../data/DutchPolicyDocs/DutchPolicyDocs.json'

# `filtered` and `split` come from the configuration cell.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [4]:
from transformers import AutoTokenizer
    
# Load the tokenizer from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
import preparing_dataset

# Turn the loaded DutchPolicyDocs data into the object later passed to
# Trainer.predict and process_predictions.
# Presumably a tokenized Hugging Face dataset (a batch progress bar is printed) — confirm.
DPD = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

  0%|          | 0/2 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [6]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; the Trainer uses it to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# No TrainingArguments are supplied, so the Trainer falls back to its defaults.
# In this notebook the trainer is only used to call .predict().
test_trainer = Trainer(
    model=model,
    data_collator=data_collator,
)

### Make Predictions

In [7]:
import numpy as np

# Run inference on the prepared DutchPolicyDocs set. Trainer.predict returns a
# PredictionOutput named tuple (predictions, label_ids, metrics). Only the raw
# scores are needed, so read that field by name instead of unpacking into `_`,
# which would rebind IPython's "last output" variable in the notebook namespace.
raw_pred = test_trainer.predict(DPD).predictions

# Index of the highest-scoring class along axis 2.
# Assumes raw_pred is shaped (examples, tokens, labels) — TODO confirm.
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [8]:
import process_predictions

# Post-process the argmax class indices using the prepared dataset, the tag
# list and the tokenizer. The result is what evaluate.evaluate receives as
# its second argument in the next cell.
# Presumably maps indices back to label strings and entity spans — confirm in ../utils.
processed_results = process_predictions.process_predictions(predictions, DPD, label_list, tokenizer)

### Evaluation DPD

In [9]:
# NOTE(review): `evaluate` is presumably the project helper made importable via
# ../utils, not the Hugging Face `evaluate` package — confirm.
import evaluate

# Score the predictions against the loaded data (all toponyms, unfiltered).
# The call prints a strict and a forgiving report and returns one result per mode.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 2860 | tp: 3171 | fn: 1815
precision: 0.526 | recall: 0.636 | f-score: 0.576 | accuracy: 0.572
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 2595 | tp: 3442 | fn: 1514
precision: 0.570 | recall: 0.695 | f-score: 0.626 | accuracy: 0.621
------------------------------------------------------------------------



In [10]:
# Display the strict-mode result returned by evaluate.evaluate for DutchPolicyDocs.
# The saved output is truncated; it starts as a tuple whose first element is a
# list of strings. What that list represents is defined in ../utils/evaluate.
strict

(['Kanaal',
  'Kanaal',
  'Stadsrivier',
  'De Mare',
  'Alkmaar',
  '-',
  'Noord',
  'Provincie Noord - Holland',
  'Alkmaar',
  'Regio Alkmaar',
  'Noord',
  '-',
  'Holland',
  'Amsterdam',
  'Noord',
  '-',
  'Holland - Noord',
  'Noord',
  'Alkmaar',
  'Alkmaar',
  'Alkmaar',
  'Noord',
  'Noord',
  'Noord',
  '-',
  'Holland',
  '- Noord',
  'Leeghwater',
  'PEN',
  'Overloop',
  'Alkmaar Noord',
  'Sportstad Alkmaar',
  'De Schermer',
  'De Schermer',
  'Woonboulevard',
  'Turfkade',
  'Dollegoor',
  'Noordbroek',
  'Buitenhaven',
  'Aadorp',
  'Gravenkwartier',
  'Stadskade',
  'Kop van Zuid',
  'Almelose',
  'Exoosche Aa',
  'Lateraal',
  'Almelo',
  '-',
  'de Haandrik',
  'Ko',
  'Koorn',
  'Grotestraat',
  '-',
  'Kerkstraat',
  'Molenstraat',
  'Bi',
  '(',
  'e',
  ')',
  'den Broecke',
  "Bloom'n Ven",
  'Nijrees',
  'Aadorp',
  'Almelo',
  '-',
  'De Haandrik',
  'De Woesten',
  'Bleskolk',
  'Almelo',
  'Almelo',
  'Almelose',
  'Kerkelanden',
  'Schelfhorst',
  'Wind

In [11]:
# Display the forgiving-mode result returned by evaluate.evaluate for DutchPolicyDocs.
# The saved output is truncated; it starts as a tuple whose first element is a
# list of strings. What that list represents is defined in ../utils/evaluate.
forgiving

(['Kanaal',
  'Stadsrivier',
  'De Mare',
  'Alkmaar',
  '-',
  'Noord',
  'Alkmaar',
  'Regio Alkmaar',
  'Noord',
  '-',
  'Holland',
  'Noord',
  '-',
  'Holland - Noord',
  'Noord',
  'Alkmaar',
  'Alkmaar',
  'Noord',
  'Noord',
  'Noord',
  '-',
  'Holland',
  '- Noord',
  'Leeghwater',
  'PEN',
  'Overloop',
  'Alkmaar Noord',
  'De Schermer',
  'De Schermer',
  'Woonboulevard',
  'Turfkade',
  'Dollegoor',
  'Noordbroek',
  'Buitenhaven',
  'Aadorp',
  'Gravenkwartier',
  'Almelose',
  'Lateraal',
  'Almelo',
  '-',
  'de Haandrik',
  'Ko',
  'Koorn',
  'Grotestraat',
  '-',
  'Kerkstraat',
  'Molenstraat',
  'Bi',
  '(',
  'e',
  ')',
  'den Broecke',
  "Bloom'n Ven",
  'Nijrees',
  'Almelo',
  '-',
  'De Haandrik',
  'De Woesten',
  'Bleskolk',
  'Almelo',
  'Almelo',
  'Almelose',
  'Kerkelanden',
  'Schelfhorst',
  'Windmolenbroek',
  'Buiten Wonen',
  'De Woesten',
  'Buiten Wonen',
  'Noord',
  '-',
  'Brabant',
  'Baarle',
  'Baarle',
  'Baarle',
  '-',
  'Nassau',
  'Ba

## TR-News

### Loading the dataset

In [12]:
import loading_functions

# TR-News corpus (XML). `dataset` is the label passed to evaluate.evaluate
# in the evaluation cell of this section.
dataset = 'TR-News'
file_path = '../../../data/TR-News/TR-News.xml'

# Rebinds `data_all_toponyms`: from here on it holds the TR-News data, not the
# data loaded in the previous section.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [13]:
from transformers import AutoTokenizer
    
# Reload the tokenizer from the same `model_path` used in the DutchPolicyDocs
# section; the path is unchanged, so this rebinds `tokenizer` from the same
# checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [14]:
import preparing_dataset

# Turn the loaded TR-News data into the object later passed to Trainer.predict
# and process_predictions.
# Presumably a tokenized Hugging Face dataset (a batch progress bar is printed) — confirm.
TRN = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [15]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; the Trainer uses it to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Fresh Trainer around the same `model`. No TrainingArguments are supplied,
# so defaults apply; it is only used for .predict() below.
test_trainer = Trainer(
    model=model,
    data_collator=data_collator,
)

### Make Predictions

In [16]:
import numpy as np

# Run inference on the prepared TR-News set. Trainer.predict returns a
# PredictionOutput named tuple (predictions, label_ids, metrics). Only the raw
# scores are needed, so read that field by name instead of unpacking into `_`,
# which would rebind IPython's "last output" variable in the notebook namespace.
raw_pred = test_trainer.predict(TRN).predictions

# Index of the highest-scoring class along axis 2.
# Assumes raw_pred is shaped (examples, tokens, labels) — TODO confirm.
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [17]:
import process_predictions

# Post-process the argmax class indices for TR-News using the prepared dataset,
# the tag list and the tokenizer. The result is what evaluate.evaluate receives
# as its second argument in the next cell.
# Presumably maps indices back to label strings and entity spans — confirm in ../utils.
processed_results = process_predictions.process_predictions(predictions, TRN, label_list, tokenizer)

### Evaluation TR-News

In [18]:
# NOTE(review): `evaluate` is presumably the project helper made importable via
# ../utils, not the Hugging Face `evaluate` package — confirm.
import evaluate

# Score the TR-News predictions against the loaded data (all toponyms).
# Rebinds `strict` and `forgiving`, replacing the previous section's results.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 298 | tp: 954 | fn: 341
precision: 0.762 | recall: 0.737 | f-score: 0.749 | accuracy: 0.724
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 253 | tp: 999 | fn: 300
precision: 0.798 | recall: 0.769 | f-score: 0.783 | accuracy: 0.758
------------------------------------------------------------------------



In [19]:
# Display the strict-mode result returned by evaluate.evaluate for TR-News.
# The saved output is truncated; it starts as a tuple whose first element is a
# list of strings. What that list represents is defined in ../utils/evaluate.
strict

(['-',
  'White House',
  'South Regional Jail',
  'U',
  '.',
  'S',
  '.',
  'Rose Garden',
  'Southern Poverty Law Center',
  'Ronald Reagan Building',
  'U',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Southern',
  'Wooster St.',
  'Cumberland Farms',
  'South Main St',
  '-',
  'North West',
  'Kremlin',
  'The',
  'Kremlin',
  'Downtown Eastside',
  'London City',
  'London City',
  'Central Anatolia',
  'Rhineland',
  '-',
  'Palatinate',
  'Dolby Theatre',
  'Air Berlin',
  'B',
  '.',
  'C',
  '.',
  'Macdonald',
  '-',
  'Cartier International Airport',
  'Montreal',
  '-',
  'Pierre Elliott Trudeau International Airport',
  'Mont',
  '-',
  'Royal Avenue',
  'Sage House',
  "Children's Hospital of Manitoba",
  "' s Hospital",
  'St',
  'Phoenix',
  'Phoenix',
  'Que',
  '.',
  'Crusader',
  'Karak',
  'Paris Town Hall',
  'Nazi Ger

In [20]:
# Display the forgiving-mode result returned by evaluate.evaluate for TR-News.
# The saved output is truncated; it starts as a tuple whose first element is a
# list of strings. What that list represents is defined in ../utils/evaluate.
forgiving

(['-',
  'White House',
  'South Regional Jail',
  'U',
  '.',
  'S',
  '.',
  'Rose Garden',
  'Southern Poverty Law Center',
  'Ronald Reagan Building',
  'U',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'Southern',
  'Wooster St.',
  'Cumberland Farms',
  'South Main St',
  '-',
  'North West',
  'Kremlin',
  'The',
  'Kremlin',
  'Downtown Eastside',
  'Rhineland',
  '-',
  'Palatinate',
  'Dolby Theatre',
  'B',
  '.',
  'C',
  '.',
  'Macdonald',
  '-',
  'Cartier International Airport',
  'Montreal',
  '-',
  'Pierre Elliott Trudeau International Airport',
  'Mont',
  '-',
  'Royal Avenue',
  'Sage House',
  "' s Hospital",
  'St',
  'Phoenix',
  'Phoenix',
  'Que',
  'Crusader',
  'Karak',
  'Petit Cambodge',
  'St',
  '. Peter',
  'St. Paul',
  'St',
  '. Mark',
  'Cop',
  'Orthodox Cathedral',
  'Coptic Church',
  'Islamic Sharia',
 

## LGL

### Loading the dataset

In [21]:
import loading_functions

# LGL corpus (XML). `dataset` is the label passed to evaluate.evaluate in the
# evaluation cell of this section.
dataset = 'LGL'
file_path = '../../../data/LGL/LGL.xml'

# Rebinds `data_all_toponyms`: from here on it holds the LGL data, not the
# data loaded in the previous section.
data_all_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [22]:
from transformers import AutoTokenizer
    
# Reload the tokenizer from the same `model_path` used in the earlier sections;
# the path is unchanged, so this rebinds `tokenizer` from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [23]:
import preparing_dataset

# Turn the loaded LGL data into the object later passed to Trainer.predict and
# process_predictions.
# Presumably a tokenized Hugging Face dataset (a batch progress bar is printed) — confirm.
LGL = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [24]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; the Trainer uses it to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Fresh Trainer around the same `model`. No TrainingArguments are supplied,
# so defaults apply; it is only used for .predict() below.
test_trainer = Trainer(
    model=model,
    data_collator=data_collator,
)

### Make Predictions

In [25]:
import numpy as np

# Run inference on the prepared LGL set. Trainer.predict returns a
# PredictionOutput named tuple (predictions, label_ids, metrics). Only the raw
# scores are needed, so read that field by name instead of unpacking into `_`,
# which would rebind IPython's "last output" variable in the notebook namespace.
raw_pred = test_trainer.predict(LGL).predictions

# Index of the highest-scoring class along axis 2.
# Assumes raw_pred is shaped (examples, tokens, labels) — TODO confirm.
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [26]:
import process_predictions

# Post-process the argmax class indices for LGL using the prepared dataset, the
# tag list and the tokenizer. The result is what evaluate.evaluate receives as
# its second argument in the next cell.
# Presumably maps indices back to label strings and entity spans — confirm in ../utils.
processed_results = process_predictions.process_predictions(predictions, LGL, label_list, tokenizer)

### Evaluation LGL

In [27]:
# NOTE(review): `evaluate` is presumably the project helper made importable via
# ../utils, not the Hugging Face `evaluate` package — confirm.
import evaluate

# Score the LGL predictions against the loaded data (all toponyms).
# Rebinds `strict` and `forgiving`, replacing the previous section's results.
strict, forgiving = evaluate.evaluate(
    data_all_toponyms,
    processed_results,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 1676 | tp: 3256 | fn: 1680
precision: 0.660 | recall: 0.660 | f-score: 0.660 | accuracy: 0.640
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 1343 | tp: 3595 | fn: 1358
precision: 0.728 | recall: 0.726 | f-score: 0.727 | accuracy: 0.707
------------------------------------------------------------------------



In [28]:
# Display the strict-mode result returned by evaluate.evaluate for LGL.
# The saved output is truncated; it starts as a tuple whose first element is a
# list of strings. What that list represents is defined in ../utils/evaluate.
strict

(['Orchard St.',
  'Cottonport Fire Station',
  'St',
  '. James Youth Detention Center',
  'Minnesota House',
  'R',
  '-',
  'Otter Tail',
  'Highway',
  'Otter Tail / Grant',
  'Grant /',
  'Wilkin',
  'Otter',
  'Tail',
  'Highway',
  '-',
  'Hesco',
  'Fargo City',
  'Fargodome',
  'Sandbag Central',
  'Douglas County Hospital',
  'St',
  'R',
  '-',
  'Ky',
  '.',
  'Conn',
  '.',
  'S',
  '.',
  'D',
  '.',
  'Ariz',
  '.',
  'Minnie Howard',
  'Durant Center',
  'Washington,',
  'D. C',
  '.',
  'Northwest D. C.',
  'D',
  '.',
  'C',
  '.',
  'East End',
  'Southern Sudan',
  'Sub',
  '-',
  'Saharan Africa',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Gulf',
  'Gulf',
  'US',
  'US',
  'US',
  'Gulf',
  'US',
  'Gulf',
  'US',
  'Sharm El - Sheikh',
  'St',
  ". John's Lutheran School",
  'Sheldon Peck Homestead',
  'Woodfield Shopping Center',
  'Streets of Woodfield',
  'Hudson',
  'Decatur',
  'U',
  '.',
  'S',
  '.',
  'Ill'

In [29]:
# Display the forgiving-mode result returned by evaluate.evaluate for LGL.
# The saved output is truncated; it starts as a tuple whose first element is a
# list of strings. What that list represents is defined in ../utils/evaluate.
forgiving

(['Orchard St.',
  'St',
  '. James Youth Detention Center',
  'R',
  '-',
  'Otter Tail',
  'Highway',
  'Grant /',
  'Otter',
  'Highway',
  '-',
  'Hesco',
  'Fargodome',
  'Sandbag Central',
  'St',
  'R',
  '-',
  'Ky',
  'Conn',
  'S',
  '.',
  'D',
  '.',
  'Ariz',
  'Minnie Howard',
  'Durant Center',
  'D',
  '.',
  'C',
  '.',
  'East End',
  'Sub',
  '-',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Egypt',
  'Sudan',
  'Gulf',
  'Gulf',
  'US',
  'US',
  'US',
  'Gulf',
  'US',
  'Gulf',
  'US',
  'St',
  ". John's Lutheran School",
  'Sheldon Peck Homestead',
  'Hudson',
  'Decatur',
  'U',
  '.',
  'S',
  '.',
  'Ill',
  'Pe',
  'Tarrant',
  'Houston',
  'Big',
  'Law Enforcement Center',
  'City Hall',
  'E',
  'City Hall',
  'North',
  'Cooper Street',
  'Viridian',
  'Viridian',
  'Rolling Hills Country Club',
  'Madison Creek',
  'Hampton Hills',
  'Silkwood Trail',
  'Sports Center',
  'Exxon',
  'Durango',
  'Durango',
  'USMD Medical Center'

## GeoWebNews

### Loading the dataset

In [30]:
import loading_functions

# GeoWebNews corpus (XML). `dataset` is the label passed to evaluate.evaluate
# in the evaluation cell of this section.
dataset = 'GWN'
file_path = '../../../data/GeoWebNews/GeoWebNews.xml'

# NOTE: this rebinds the notebook-wide `filtered` flag, which the configuration
# cell set to False. Every cell executed after this one, including re-runs of
# the earlier dataset sections without re-running the configuration cell,
# sees filtered=True.
filtered = True

data_filtered_toponyms = loading_functions.prepare_data(
    file_path,
    filtered=filtered,
    split=split,
)

### Processing the data for Huggingface Trainer

In [31]:
from transformers import AutoTokenizer
    
# Reload the tokenizer from the same `model_path` used in the earlier sections;
# the path is unchanged, so this rebinds `tokenizer` from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [32]:
import preparing_dataset

# Turn the filtered GeoWebNews data into the object later passed to
# Trainer.predict and process_predictions.
# Presumably a tokenized Hugging Face dataset (a batch progress bar is printed) — confirm.
GWN_filtered = preparing_dataset.prepare_dataset(data_filtered_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [33]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; the Trainer uses it to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Fresh Trainer around the same `model`. No TrainingArguments are supplied,
# so defaults apply; it is only used for .predict() below.
test_trainer = Trainer(
    model=model,
    data_collator=data_collator,
)

### Make Predictions

In [34]:
import numpy as np

# Run inference on the prepared (filtered) GeoWebNews set. Trainer.predict
# returns a PredictionOutput named tuple (predictions, label_ids, metrics).
# Only the raw scores are needed, so read that field by name instead of
# unpacking into `_`, which would rebind IPython's "last output" variable in
# the notebook namespace.
raw_pred_filtered = test_trainer.predict(GWN_filtered).predictions

# Index of the highest-scoring class along axis 2.
# Assumes raw_pred_filtered is shaped (examples, tokens, labels) — TODO confirm.
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [37]:
import process_predictions

# Post-process the argmax class indices for GeoWebNews using the prepared
# dataset, the tag list and the tokenizer. The result is what evaluate.evaluate
# receives as its second argument in the next cell.
# Presumably maps indices back to label strings and entity spans — confirm in ../utils.
processed_predictions_filtered = process_predictions.process_predictions(predictions_filtered, GWN_filtered, label_list, tokenizer)

### Evaluation GWN

In [38]:
# NOTE(review): `evaluate` is presumably the project helper made importable via
# ../utils, not the Hugging Face `evaluate` package — confirm.
import evaluate

# Score the GeoWebNews predictions against the filtered toponym data.
# `filtered` is True here because this section's loading cell rebinds it.
# Rebinds `strict` and `forgiving`, replacing the previous section's results.
strict, forgiving = evaluate.evaluate(
    data_filtered_toponyms,
    processed_predictions_filtered,
    model_name=model_name,
    dataset=dataset,
    filtered=filtered,
)

Evaluation mode: strict
fp: 449 | tp: 1584 | fn: 892
precision: 0.779 | recall: 0.640 | f-score: 0.703 | accuracy: 0.610
------------------------------------------------------------------------

Evaluation mode: forgiving
fp: 407 | tp: 1626 | fn: 851
precision: 0.800 | recall: 0.656 | f-score: 0.721 | accuracy: 0.626
------------------------------------------------------------------------



In [39]:
# Display the strict-mode result returned by evaluate.evaluate for GeoWebNews.
# The saved output is truncated; it starts as a tuple whose first element is a
# list of strings. What that list represents is defined in ../utils/evaluate.
strict

(['Faubourg',
  'Royal',
  'Desire',
  'Claiborne Power House',
  'Champs',
  '-',
  'Élysées',
  'Smoky Mary',
  'Jamison',
  'Norbury',
  'Granville',
  'Somerville',
  'Jamison',
  'Norbury',
  'Norbury',
  'Club',
  'Bed',
  'Buckland',
  'Ronald',
  'Patriot',
  'Vienna',
  'Fairfax',
  '-',
  'GMU',
  'Orange',
  'Turtle Creek Cir.',
  'Va',
  'Marjory Stoneman',
  'Kremlin',
  'St',
  ". Peter's Basilica",
  'White House',
  'Commonwealth of Independent States',
  'CIS',
  'Dubai',
  'North America',
  'Treasure Beach',
  'Washington, D. C.',
  'U',
  'Sheikh Akil',
  'Islamic State',
  'CTV Saskatoon',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'West',
  'Aeroflot',
  'Kremlin',
  'VNA',
  'Tamil',
  'Tamil',
  'States',
  'No',
  '2',
  '2',
  'Eva',
  'Seasons',
  'DWTC',
  'Bakgatla',
  '-',
  'ba',
  '-',
  'Kgafela',
  'Bakgatla',
  'Washington,',
  'D',
  '. C.',
  'In',
  'D',
  '.',
  'C',
  '.',
  'U',
  'GA',
  'CT',
  'CT',
  '-',
  'U',
  'Pontifex',


In [40]:
# Display the forgiving-mode result returned by evaluate.evaluate for GeoWebNews.
# The saved output is truncated; it starts as a tuple whose first element is a
# list of strings. What that list represents is defined in ../utils/evaluate.
forgiving

(['Faubourg',
  'Desire',
  'Champs',
  '-',
  'Élysées',
  'Smoky Mary',
  'Jamison',
  'Norbury',
  'Granville',
  'Somerville',
  'Jamison',
  'Norbury',
  'Norbury',
  'Bed',
  'Buckland',
  'Ronald',
  'Patriot',
  'Vienna',
  'Fairfax',
  '-',
  'GMU',
  'Orange',
  'Turtle Creek Cir.',
  'Va',
  'Marjory Stoneman',
  'Kremlin',
  'St',
  'White House',
  'Commonwealth of Independent States',
  'CIS',
  'Dubai',
  'North America',
  'Treasure Beach',
  'U',
  'Sheikh Akil',
  'Islamic State',
  'U',
  '.',
  'S',
  '.',
  'U',
  '.',
  'S',
  '.',
  'West',
  'Aeroflot',
  'Kremlin',
  'VNA',
  'Tamil',
  'Tamil',
  'States',
  'No',
  '2',
  '2',
  'Eva',
  'Seasons',
  'DWTC',
  'Bakgatla',
  '-',
  'ba',
  '-',
  'Kgafela',
  'Bakgatla',
  'Washington,',
  'D',
  '. C.',
  'In',
  'D',
  '.',
  'C',
  '.',
  'U',
  'GA',
  'CT',
  'CT',
  '-',
  'U',
  'Pontifex',
  'CALAM',
  'Read More',
  'Umuanunu',
  'Benue',
  'Benue',
  'Guma',
  'Peshawar',
  'New Quay',
  'Gilgit',
  