In [1]:
import sys

# Make the project's helper modules (kept in ../utils, relative to the
# working directory) importable. The entry is spliced in at index 1 so that
# sys.path[0] keeps priority.
sys.path[1:1] = ['../utils']

## Load the default spaCy model

In [2]:
import spacy

# Only named-entity recognition is needed, so every other listed pipeline
# component is disabled at load time.
# NOTE(review): "en_core_web_lg" is an English pipeline, while the
# DutchPolicyDocs section below runs it on Dutch text -- confirm that this is
# the intended baseline.
ner_pipeline = spacy.load(
    "en_core_web_lg",
    disable=[
        "tok2vec",
        "tagger",
        "parser",
        "attribute_ruler",
        "lemmatizer",
    ],
)

## DutchPolicyDocs

### Loading the dataset

In [3]:
import loading_functions

# Location of the DutchPolicyDocs JSON file, relative to the working directory.
file_path = '../../../data/DutchPolicyDocs/DutchPolicyDocs.json'

# Load the dataset through the project helper with filtered=False and
# split=False (the exact meaning of both flags is defined in
# loading_functions.prepare_data).
toponym_data = loading_functions.prepare_data(
    file_path,
    filtered=False,
    split=False,
)

### Make Predictions

In [4]:
import spaCy_predictions

# Run the NER pipeline loaded earlier over the DutchPolicyDocs data using the
# project helper; the result is kept for the evaluation cell below.
processed_results = spaCy_predictions.make_predictions(ner_pipeline, toponym_data)

1044it [00:02, 388.10it/s]


### Evaluation DPD

In [5]:
import evaluate

# All toponyms: score the predictions against the loaded dataset.
# The call reports fp/tp/fn counts plus precision, recall and f-score, and
# returns two lists that are unpacked as fps and fns (presumably the
# false-positive and false-negative toponyms -- confirm in the evaluate module).
fps, fns = evaluate.evaluate(toponym_data, processed_results)

fp: 599 | tp: 467 | fn: 2430
precision: 0.438 | recall: 0.161 | f-score: 0.236


In [6]:
# Inspect the first list returned by the DutchPolicyDocs evaluation
# (presumably the false positives -- confirm in the evaluate module).
fps

['Alkmaar-Noord',
 'Alkmaar',
 'Amsterdam',
 'Noord',
 'Alkmaar',
 'De stad maakt',
 'Alkmaar',
 'Alkmaar',
 'De gemeente maakt',
 'Alkmaar',
 'Noord',
 'Alkmaar',
 'Het ziekenhuis',
 'de Alkmaarderhout',
 'Kerkelanden',
 'de 19e',
 'Het dorp heeft',
 'Baarle',
 'Baarle',
 'De gemeente kent',
 'Tilburg',
 'Tilburg en Turnhout',
 'De provincie en de beide',
 'Eikelenbosch',
 'de verte',
 'Voor Bergen',
 'de buurt van de rivier',
 'Bergen',
 'aan de vele dagelijkse',
 'Nieuw Bergen',
 'de lat hoog',
 'Nieuw Bergen',
 'Nieuw Bergen',
 'Bladel de komende',
 'binnen de gemeente',
 'de Verordening ruimte en de beleidsregel',
 'De tijden dat de overheid alleen de inrichting van Nederland',
 'De stad Brielle',
 'Zuid',
 'Utrecht',
 'Unesco',
 'Utrecht',
 'de stad',
 'de Amerongsewetering en de Rijndijk',
 'de Achterdijk',
 'de Amerongerwetering',
 'Kenmerkend',
 'Koningsdag',
 'Julianadorp',
 'Holland',
 'De provincie',
 'de buurt van de voorzieningen van het',
 'de Duinzoom',
 'Holland',
 'De

In [7]:
# Inspect the second list returned by the DutchPolicyDocs evaluation
# (presumably the false negatives -- confirm in the evaluate module).
fns

['Alkmaar',
 'Alkmaar',
 'Alkmaar',
 'Noordhollandsch Kanaal',
 'het IJ',
 'Amstel',
 'Maas',
 'Alkmaar',
 'Alkmaar',
 'Metropoolregio Amsterdam',
 'Metropoolregio Amsterdam',
 'Grootschermer',
 'De Rijp',
 'Alkmaar-Noord',
 'winkelcentrum De Mare',
 'Provincie Noord-Holland',
 'Noordhollandsch Kanaal',
 'Alkmaar',
 'Alkmaar',
 'Regio Alkmaar',
 'Alkmaar',
 'Alkmaar',
 'Noord-Holland',
 'Metropoolregio Amsterdam',
 'Noord-Holland-Noord',
 'Regio Alkmaar',
 'Alkmaar',
 'Alkmaar',
 'Metropoolregio Amsterdam',
 'Zaancorridor',
 'gemeente Alkmaar',
 'Noordhollandsch Kanaal',
 'Alkmaar',
 'Alkmaar',
 'Alkmaar',
 'Overloopwijken',
 'Alkmaar Noord',
 'Alkmaar Noord',
 'Alkmaarderhout',
 'Stompetoren',
 'Overijssel',
 'Almelo',
 'Almelo',
 'Almelo',
 'Loolee',
 'Bornsebeek',
 'Almelose en Exoosche Aa',
 'Regge',
 'Lateraal kanaal',
 'Doorbraak',
 'kanaal Almelo - de Haandrik',
 'Bornerbroek',
 'Bornerbroek',
 'Turfkade',
 'Dollegoor',
 'Bornsestraat',
 'Turfkade',
 'Ossenkoppelerhoek',
 'Schel

## TR-News

### Loading the dataset

In [3]:
import loading_functions

# Location of the TR-News XML file, relative to the working directory.
file_path = '../../../data/TR-News/TR-News.xml'

# Load the dataset twice with split=True: once with filtered=False and once
# with filtered=True, so both variants can be evaluated below.
data_all_toponyms = loading_functions.prepare_data(
    file_path, filtered=False, split=True
)
data_filtered_toponyms = loading_functions.prepare_data(
    file_path, filtered=True, split=True
)

### Make Predictions

In [4]:
import spaCy_predictions

# Run the NER pipeline over the unfiltered TR-News data; this single set of
# predictions is reused by both evaluation cells in this section.
processed_results = spaCy_predictions.make_predictions(ner_pipeline, data_all_toponyms)

174it [00:01, 102.84it/s]


### Evaluation TR-News

In [5]:
import evaluate

# All toponyms: score the predictions against the unfiltered TR-News data.
# The call reports fp/tp/fn counts plus precision, recall and f-score, and
# returns two lists unpacked as fps and fns (presumably false positives and
# false negatives -- confirm in the evaluate module).
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 190 | tp: 807 | fn: 485
precision: 0.809 | recall: 0.625 | f-score: 0.705


In [6]:
# Inspect the first list returned by the all-toponym TR-News evaluation
# (presumably the false positives -- confirm in the evaluate module).
fps

['Hawaii U.N.',
 'North Carolina',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'the South Regional Jail',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'New York',
 'New York',
 'the United States',
 'the United States',
 'the United States',
 'Atlantic',
 'West Chester',
 'the United States',
 'Southern District',
 'New York',
 'South Carolina',
 'South Carolina',
 'New Milford',
 'Eastside',
 'London City',
 'London City',
 "San Francisco's",
 'Central Anatolia',
 'the South Pacific',
 'Navhal',
 'Air Berlin',
 'the Central Coast',
 'West Coast',
 'Montreal',
 'Palma',
 'Little Portugal',
 'Phoenix',
 'Phoenix',
 'Karak',
 'the United States',
 'New York',
 'Giza',
 'the Saint Peter and Saint Paul Coptic Orthodox Church',
 'Costa Rica',
 'Costa Rica',
 'Costa Rica',
 'Costa Rica',
 'Costa Rica',
 'Southern California',
 'Georgia',
 'Georgia',
 'Georgia',
 'Red Deer',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georg

In [7]:
# Inspect the second list returned by the all-toponym TR-News evaluation
# (presumably the false negatives -- confirm in the evaluate module).
fns

['Turkish',
 'Aleppo',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Kurdish',
 'Turkish',
 'Russian',
 'Syrian',
 'Russian',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Russian',
 'Turkish',
 'Russian',
 'Russian',
 'Hawaii',
 'Russian',
 'Syrian',
 'North Carolina',
 'West Virginia',
 'Granville County',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'Granville County',
 'New York',
 'New York',
 'New York City',
 'United States',
 'United States',
 'United States',
 'White House',
 'Texas',
 'College Station',
 'Texas',
 'Texas',
 'U.S.',
 'West Chester',
 'Xavier University',
 'United States',
 'British',
 'European',
 'U.S.',
 'New York',
 'DETROIT',
 'U.S.',
 'U.S.',
 'United States',
 'New York',
 'Michigan',
 'Wisconsin',
 'South Carolina',
 'South Carolina',
 'Cuban',
 'BANTAM',
 'Bantam',
 'New Milford',
 'Danbury',
 'New Milford',
 'Calgary',
 'Red 

In [8]:
# Filtered toponyms: score against the filtered TR-News data.
# NOTE(review): processed_results was produced from data_all_toponyms, not
# from data_filtered_toponyms. The GeoWebNews section predicts separately on
# its filtered data (processed_results_filtered) -- confirm that both loads
# contain the same documents in the same order, otherwise re-predict here too.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results)

fp: 190 | tp: 807 | fn: 445
precision: 0.809 | recall: 0.645 | f-score: 0.718


In [9]:
# Inspect the first list returned by the filtered TR-News evaluation
# (presumably the false positives -- confirm in the evaluate module).
fps

['Hawaii U.N.',
 'North Carolina',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'the South Regional Jail',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'New York',
 'New York',
 'the United States',
 'the United States',
 'the United States',
 'Atlantic',
 'West Chester',
 'the United States',
 'Southern District',
 'New York',
 'South Carolina',
 'South Carolina',
 'New Milford',
 'Eastside',
 'London City',
 'London City',
 "San Francisco's",
 'Central Anatolia',
 'the South Pacific',
 'Navhal',
 'Air Berlin',
 'the Central Coast',
 'West Coast',
 'Montreal',
 'Palma',
 'Little Portugal',
 'Phoenix',
 'Phoenix',
 'Karak',
 'the United States',
 'New York',
 'Giza',
 'the Saint Peter and Saint Paul Coptic Orthodox Church',
 'Costa Rica',
 'Costa Rica',
 'Costa Rica',
 'Costa Rica',
 'Costa Rica',
 'Southern California',
 'Georgia',
 'Georgia',
 'Georgia',
 'Red Deer',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georgia',
 'Georg

In [10]:
# Inspect the second list returned by the filtered TR-News evaluation
# (presumably the false negatives -- confirm in the evaluate module).
fns

['Turkish',
 'Aleppo',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Turkish',
 'Russian',
 'Syrian',
 'Russian',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Russian',
 'Turkish',
 'Russian',
 'Russian',
 'Hawaii',
 'Russian',
 'Syrian',
 'North Carolina',
 'West Virginia',
 'Granville County',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'Granville County',
 'New York',
 'New York',
 'New York City',
 'United States',
 'United States',
 'United States',
 'White House',
 'Texas',
 'College Station',
 'Texas',
 'Texas',
 'U.S.',
 'West Chester',
 'Xavier University',
 'United States',
 'British',
 'European',
 'U.S.',
 'New York',
 'DETROIT',
 'U.S.',
 'U.S.',
 'United States',
 'New York',
 'Michigan',
 'Wisconsin',
 'South Carolina',
 'South Carolina',
 'Cuban',
 'BANTAM',
 'Bantam',
 'New Milford',
 'Danbury',
 'New Milford',
 'Calgary',
 'Red Deer',
 'Rus

## LGL

### Loading the dataset

In [26]:
import loading_functions

# Location of the LGL XML file, relative to the working directory.
file_path = '../../../data/LGL/LGL.xml'

# Load the dataset twice with split=True: once with filtered=False and once
# with filtered=True, so both variants can be evaluated below.
data_all_toponyms = loading_functions.prepare_data(
    file_path, filtered=False, split=True
)
data_filtered_toponyms = loading_functions.prepare_data(
    file_path, filtered=True, split=True
)

### Make Predictions

In [27]:
import spaCy_predictions

# Run the NER pipeline over the unfiltered LGL data; this single set of
# predictions is reused by both evaluation cells in this section.
processed_results = spaCy_predictions.make_predictions(ner_pipeline, data_all_toponyms)

887it [00:06, 128.40it/s]


### Evaluation LGL

In [28]:
import evaluate

# All toponyms: score the predictions against the unfiltered LGL data.
# The call reports fp/tp/fn counts plus precision, recall and f-score, and
# returns two lists unpacked as fps and fns (presumably false positives and
# false negatives -- confirm in the evaluate module).
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 1172 | tp: 2089 | fn: 2670
precision: 0.641 | recall: 0.439 | f-score: 0.521


In [29]:
# Inspect the first list returned by the all-toponym LGL evaluation
# (presumably the false positives -- confirm in the evaluate module).
fps

['Orchard St.',
 'Pogemiller',
 'Twin Lakes - Highway',
 'Pelican Rapids',
 'the Red River',
 'Red River',
 'the Red River Valley',
 'North Dakota',
 'Oakport Township',
 'Groth',
 'Fargo City',
 'Red River',
 'Lake Vermont',
 'Freeborn Lake',
 'North Nokomis Street',
 'New Haven',
 'Sioux Falls',
 'New Haven',
 'the Alexandria City Public Schools',
 'Old Town',
 'Old Town',
 'New York City',
 'Old Town',
 'Northwest D.C.',
 'Cutts',
 'Southern Sudan',
 'Sub-Saharan Africa',
 'Sri Lanka',
 'Sri Lanka',
 'Sri Lanka',
 'Sri Lanka',
 'Sri Lanka',
 "Sri Lanka's",
 'Petrojet',
 'Petrojet',
 'Petrojet',
 'Zamalek',
 'Esperance',
 'Esperance',
 'US',
 'the Middle East',
 'Gulf',
 'the Middle East',
 'Gulf',
 'US',
 'US',
 'US',
 'the West Bank',
 'Gulf',
 'US',
 'Gulf',
 'the Middle East',
 'US',
 'Sharm El-Sheikh',
 'the High Dam',
 'North Monticello',
 'DuPage County',
 'Babcock Grove',
 'Charlotte',
 'Glen Ellyn',
 'Macon County',
 'Hudson',
 'Evergreen Lake',
 'Decatur',
 'Des Plaines',
 

In [30]:
# Inspect the second list returned by the all-toponym LGL evaluation
# (presumably the false negatives -- confirm in the evaluate module).
fns

['Memphis St.',
 'Augusta St.',
 'Mansfield',
 'Mansfield',
 'Shreveport',
 'Mansfield',
 'DeSoto Parish',
 'Cook',
 'Minneapolis',
 'Highway 200',
 'Mahnomen',
 'Mahnomen County Road',
 'Mahnomen County Road',
 'Twin Lakes',
 'Highway 10',
 'Otter Tail County',
 'Highway 108',
 'Pelican Rapids',
 'Star Lake',
 'Otter Tail/Grant',
 'Grant/Wilkin',
 'County Road 43',
 'Highway 114',
 'Douglas County',
 'Lake Mary',
 'Benson',
 'Minnesota',
 'Minnesota',
 'Alexandria',
 'Red River',
 'Fargo',
 'Moorhead',
 'Red River',
 'Red River Valley',
 'Fargo',
 'Fargo',
 '40th Avenue South',
 'North Dakota',
 'Oakport Township',
 'Moorhead',
 'Red River',
 'Moorhead',
 'Minnesota',
 'Fargo',
 'Fargo',
 'Red River',
 'Fargo',
 'Douglas County',
 'County Road 35',
 'Lake Vermont',
 'County Road 56',
 'County Road 15',
 'Brandon',
 'County Road 96',
 'Freeborn Lake',
 'North Nokomis Street',
 'Darling Avenue',
 'Van Dorn Street',
 'Alexandria',
 'Alexandria',
 'New Haven',
 'Sioux Falls',
 'S.D.',
 'S

In [35]:
# Filtered toponyms: score against the filtered LGL data.
# NOTE(review): processed_results was produced from data_all_toponyms, not
# from data_filtered_toponyms. The GeoWebNews section predicts separately on
# its filtered data (processed_results_filtered) -- confirm that both loads
# contain the same documents in the same order, otherwise re-predict here too.
# NOTE(review): the saved execution counts show this cell was last run after
# the two display cells that follow it; re-run the notebook top to bottom
# before relying on the lists shown there.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results)

fp: 1202 | tp: 2029 | fn: 2214
precision: 0.628 | recall: 0.478 | f-score: 0.543


In [33]:
# Inspect the first list returned by the filtered LGL evaluation
# (presumably the false positives -- confirm in the evaluate module).
fps

['Orchard St.',
 'Pogemiller',
 'Twin Lakes - Highway',
 'Pelican Rapids',
 'the Red River',
 'Red River',
 'the Red River Valley',
 'North Dakota',
 'Oakport Township',
 'Groth',
 'Fargo City',
 'Red River',
 'Lake Vermont',
 'Nokomis',
 'North Nokomis Street',
 'New Haven',
 'Sioux Falls',
 'New Haven',
 'the Alexandria City Public Schools',
 'Old Town',
 'Old Town',
 'New York City',
 'Old Town',
 'Northwest D.C.',
 'Cutts',
 'Southern Sudan',
 'Sub-Saharan Africa',
 'Godapitiya',
 'Sri Lanka',
 'Sri Lanka',
 'Sri Lanka',
 'Sri Lanka',
 'Sri Lanka',
 'Petrojet',
 'Zamalek',
 'Esperance',
 'Esperance',
 'US',
 'the Middle East',
 'Gulf',
 'the Middle East',
 'Gulf',
 'US',
 'US',
 'US',
 'the West Bank',
 'Gulf',
 'US',
 'Gulf',
 'the Middle East',
 'US',
 'Sharm El-Sheikh',
 'the High Dam',
 'North Monticello',
 'DuPage County',
 'Babcock Grove',
 'Charlotte',
 'Glen Ellyn',
 'Woodfield',
 'Macon County',
 'Hudson',
 'Evergreen Lake',
 'Decatur',
 'Des Plaines',
 'North Texas',
 'Ta

In [34]:
# Inspect the second list returned by the filtered LGL evaluation
# (presumably the false negatives -- confirm in the evaluate module).
fns

['Mansfield',
 'Mansfield',
 'Shreveport',
 'Mansfield',
 'Cook',
 'Minneapolis',
 'Mahnomen',
 'Twin Lakes',
 'Otter Tail County',
 'Pelican Rapids',
 'Star Lake',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Alexandria',
 'Red River',
 'Fargo',
 'Moorhead',
 'Red River',
 'Fargo',
 'Fargo',
 'North Dakota',
 'Oakport Township',
 'Moorhead',
 'Red River',
 'Moorhead',
 'Minnesota',
 'Fargo',
 'Fargo',
 'Red River',
 'Fargo',
 'Douglas County',
 'Lake Vermont',
 'Brandon',
 'Freeborn Lake',
 'Alexandria',
 'Alexandria',
 'New Haven',
 'Sioux Falls',
 'S.D.',
 'Surprise',
 'Alexandria',
 'New Haven',
 'Alexandria',
 'Alexandria',
 'New York City',
 'D.C.',
 'D.C.',
 'Sudanese',
 'Sudan',
 'Chinese',
 'Africa',
 'African',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanese',
 'Sudanese',
 'Alexandria',
 'Sudanese',
 'Sri Lankan',
 'Sr

## GeoWebNews

### Loading the dataset

In [3]:
import loading_functions

# Location of the GeoWebNews XML file, relative to the working directory.
file_path = '../../../data/GeoWebNews/GeoWebNews.xml'

# Load the dataset twice with split=True: once with filtered=False and once
# with filtered=True, so both variants can be predicted on and evaluated below.
data_all_toponyms = loading_functions.prepare_data(
    file_path, filtered=False, split=True
)
data_filtered_toponyms = loading_functions.prepare_data(
    file_path, filtered=True, split=True
)

### Make Predictions

In [4]:
import spaCy_predictions

# Run the NER pipeline over the unfiltered GeoWebNews data; used by the
# all-toponym evaluation below.
processed_results = spaCy_predictions.make_predictions(ner_pipeline, data_all_toponyms)

348it [00:03, 103.70it/s]


In [5]:
# Second prediction pass, this time over the filtered GeoWebNews data, kept in
# its own variable so the filtered evaluation below is scored against
# predictions made on the same input.
processed_results_filtered = spaCy_predictions.make_predictions(ner_pipeline, data_filtered_toponyms)

346it [00:03, 112.88it/s]


### Evaluation GWN

In [6]:
import evaluate

# All toponyms: score the predictions against the unfiltered GeoWebNews data.
# The call reports fp/tp/fn counts plus precision, recall and f-score, and
# returns two lists unpacked as fps and fns (presumably false positives and
# false negatives -- confirm in the evaluate module).
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 418 | tp: 1193 | fn: 4086
precision: 0.741 | recall: 0.226 | f-score: 0.346


In [7]:
# Inspect the first list returned by the all-toponym GeoWebNews evaluation
# (presumably the false positives -- confirm in the evaluate module).
fps

['Finiels',
 'the Mississippi River',
 'Methodist',
 'Frenchmen Street',
 'Lake Manassas',
 'Manassas',
 'San Marcos',
 'Ruby',
 "Mississippi State's",
 'Mississippi State',
 "Victoria Vivians'",
 'UConn',
 "Mississippi State's",
 'North Carolina',
 "St. Peter's Basilica",
 'Bollywood',
 'Manisha Koirala',
 'Fire Island',
 'Fort Salonga',
 'New Orleans',
 'New York',
 'South Korea',
 'New York City',
 'New Orleans',
 "the Islamic Republic's",
 'Godolphin',
 'West Coast',
 'Godolphin',
 'North America',
 'West Coast',
 'West Coast',
 'Treasure Beach',
 'Animal Kingdom',
 'Morris County',
 'Morris County',
 'New York',
 'New York',
 'NEW JERSEY',
 'Washington',
 'D.C.',
 'the Middle East',
 'West',
 'the United States',
 'the United Kingdom',
 'the United States',
 'Asia-Pacific',
 'Tamil Nadu',
 'States',
 'Kamenné Square',
 'the United States',
 'the United States',
 'Washington Township',
 'New Jersey',
 'South Africa',
 'South Venezuela',
 'The United States',
 'the United States',
 

In [8]:
# Inspect the second list returned by the all-toponym GeoWebNews evaluation
# (presumably the false negatives -- confirm in the evaluate module).
fns

['area',
 'plantation',
 'mansion',
 'substation',
 'Louisiana',
 'Louisiana Purchase',
 'parcel',
 'French',
 'plat',
 'French Quarter',
 'Mississippi River',
 "Rue d'Enghein",
 'street',
 'Almonaster',
 'Franklin',
 'avenue',
 'Marigny Plantation',
 'faubourg',
 'Press Street',
 'area',
 "Faubourg D'Aunoy",
 'neighbor',
 'building',
 'Chartres',
 'Franklin',
 'Methodist church',
 'restaurant',
 'street',
 'complex',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 '2231 Royal',
 'townhouse',
 'basement',
 'Greek',
 'structures',
 'Elysian Fields',
 'Royal Street',
 'Bourbon',
 'Marigny Canal',
 'grid',
 'avenue',
 'park',
 'Washington Square',
 'Champs-Élysées',
 'Elysian Fields',
 'Pontchartrain Railroad',
 'Pontchartrain Railroad',
 'Appalachians',
 'line',
 'Royal Street',
 'Washington Square',
 'block',
 'Holy Redeemer Church',
 'edifice',
 'Third Presbyterian Church',
 'Frenchmen Street',
 'Kurdish',
 'city',
 'Turkish',
 'Syrian',
 'Kurd

In [9]:
# Filtered toponyms: score the predictions made on the filtered GeoWebNews
# data against that same filtered data. fps and fns are overwritten with the
# two lists returned by this run.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results_filtered)

fp: 423 | tp: 1179 | fn: 1287
precision: 0.736 | recall: 0.478 | f-score: 0.580


In [10]:
# Inspect the first list returned by the filtered GeoWebNews evaluation
# (presumably the false positives -- confirm in the evaluate module).
fps

['Finiels',
 'the Mississippi River',
 'Methodist',
 'Frenchmen Street',
 'Norbury',
 'Granville',
 'Somerville',
 'Manassas',
 'San Marcos',
 'Ruby',
 "Mississippi State's",
 'Mississippi State',
 "Victoria Vivians'",
 'UConn',
 "Mississippi State's",
 "St. Peter's Basilica",
 'Bollywood',
 'Manisha Koirala',
 'Fire Island',
 'Fort Salonga',
 'New Orleans',
 'New York',
 'South Korea',
 'New York City',
 "the Islamic Republic's",
 'Godolphin',
 'West Coast',
 'Godolphin',
 'North America',
 'West Coast',
 'West Coast',
 'Treasure Beach',
 'Animal Kingdom',
 'Morris County',
 'Morris County',
 'New York',
 'New York',
 'NEW JERSEY',
 'Washington',
 'D.C.',
 'the Middle East',
 'the United States',
 'West',
 'the Soviet Union',
 'the United States',
 'Asia-Pacific',
 'Tamil Nadu',
 'States',
 'Kamenné Square',
 'the United States',
 'the United States',
 'Washington Township',
 'New Jersey',
 'South Africa',
 'The United States',
 'the United States',
 'the United States',
 'Dayasiri Ja

In [11]:
# Inspect the second list returned by the filtered GeoWebNews evaluation
# (presumably the false negatives -- confirm in the evaluate module).
fns

['Louisiana',
 'French',
 'French Quarter',
 'Mississippi River',
 'Faubourg Marigny',
 'German',
 'Irish',
 "Rue d'Enghein",
 'Almonaster',
 'Franklin',
 'Marigny Plantation',
 'Press Street',
 "Faubourg D'Aunoy",
 'Chartres',
 'Franklin',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 'Greek',
 'Elysian Fields',
 'Royal Street',
 'Bourbon',
 'Desire Street',
 'Bywater',
 'Elysian Fields',
 'New Orleans Railways and Light Company Claiborne Power House',
 'French',
 'Marigny Canal',
 'Washington Square',
 'Champs-Élysées',
 'Elysian Fields',
 'Pontchartrain Railroad',
 'Appalachians',
 'Royal Street',
 'Washington Square',
 'Holy Redeemer Church',
 'Third Presbyterian Church',
 'Frenchmen Street',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Qamishli',
 'Syrian',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Afrin',
 'Turkish',
 'Afrin',
 'Robert Trent Jones Golf Club',
 'Stonewall Golf Club',
 'Stonewall',
 'Virginia Gateway',
 'Haymarket Village 