In [1]:
import sys

# Make the project's helper modules in ../utils importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not add
# another copy of the same entry to sys.path.
if '../utils' not in sys.path:
    sys.path.insert(1, '../utils')

## Loading Fine-tuned LaBSE model

In [2]:
from transformers import AutoModelForTokenClassification

# Notebook-wide settings that the dataset sections below reuse.
model_path = '../models/LaBSE-fine-tuned-conll-2003'
label_list = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
    'B-MISC', 'I-MISC',
]
split = True  # forwarded as `split=` to loading_functions.prepare_data

# Load the checkpoint with one output class per entry in label_list.
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(label_list),
)

## DutchPolicyDocs

### Loading the dataset

In [3]:
import loading_functions

# Location of the DutchPolicyDocs corpus, relative to the working directory.
file_path = '../../../data/DutchPolicyDocs/DutchPolicyDocs.json'

# Only the filtered=False variant is loaded for this dataset.
toponym_data = loading_functions.prepare_data(
    file_path,
    filtered=False,
    split=split,
)

### Processing the data for Huggingface Trainer

In [4]:
from transformers import AutoTokenizer
    
# Load the tokenizer stored alongside the model at model_path.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
import preparing_dataset

# Build the dataset object that is later passed to test_trainer.predict.
# NOTE(review): the exact encoding is defined by the project's
# preparing_dataset helper (presumably in ../utils) — confirm there.
DPD = preparing_dataset.prepare_dataset(toponym_data, tokenizer)

  0%|          | 0/2 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [7]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Inference-only Trainer: no TrainingArguments are supplied, so the library
# defaults apply.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [8]:
import numpy as np

# Trainer.predict returns a PredictionOutput named tuple. Read its
# `predictions` field directly instead of unpacking into `_`, which would
# rebind the name IPython uses for the last cell output.
raw_pred = test_trainer.predict(DPD).predictions

# Pick the highest-scoring index along axis 2
# (presumably the label axis of (example, token, label) scores — confirm).
predictions = np.argmax(raw_pred, axis=2)

### Process predictions

In [15]:
import process_predictions

# Post-process the predicted label indices; the helper receives the dataset,
# the label names and the tokenizer. The result is what evaluate.evaluate
# consumes in the next section.
# NOTE(review): output format is defined by the project helper — confirm there.
processed_results = process_predictions.process_predictions(predictions, DPD, label_list, tokenizer)

### Evaluation DPD

In [16]:
import evaluate

# NOTE(review): `evaluate` is expected to resolve to the project's helper on
# the '../utils' path, not a pip-installed package of the same name — confirm.
# All toponyms
# The call prints the fp/tp/fn counts and precision/recall/f-score shown in
# the cell output and returns two lists.
fps, fns = evaluate.evaluate(toponym_data, processed_results)

fp: 2809 | tp: 3346 | fn: 1659
precision: 0.544 | recall: 0.669 | f-score: 0.600


In [17]:
# Show the first list returned by evaluate.evaluate for DutchPolicyDocs
# (presumably the false-positive predictions — confirm in the helper).
fps

['Noordholland',
 'Kanaal',
 'IJ',
 'Kanaal',
 'Stadsrivier',
 'Metropool',
 'Amsterdam',
 'A9',
 'Metro',
 'Amsterdam',
 'Alkmaar',
 '-',
 'Noord',
 'Smart Town Alkmaar',
 'Alkmaar',
 'Noord',
 '- Holland',
 'Metropool',
 'Amsterdam',
 'Noord',
 '-',
 'Holland',
 '- Noord',
 'Noord',
 'Alkmaar',
 'Metropoolregio',
 'Amsterdam',
 'Alkmaar',
 'Alkmaar',
 'Metropoolregio',
 'Noord',
 '-',
 'Holland - Noord',
 'PEN',
 'Alkmaar Noord',
 'Sportstad Alkmaar',
 'De Schermer',
 'De Schermer',
 'Woonboulevard',
 'Zgt',
 'XL Businesspark Twente',
 'Bedrijvenpark',
 'Twente',
 'Bedrijventerrein',
 'Twentepoort',
 'Bornsestraat',
 'Turfkade',
 'Dollegoor',
 'Noordbroek',
 'Buitenhaven',
 'A35',
 'Aadorp',
 'Het Gravenkwartier',
 'De',
 'Kop van Zuid',
 'De',
 'Almelose',
 'Exoosche Aa',
 'Lateraal',
 'Almelo',
 '-',
 'de Haandrik',
 'Almelo',
 'Koornmarkt',
 'Grotestraat',
 '-',
 'Kerkstraat',
 'Molenstraat',
 'Bi',
 '(',
 'e',
 ') den Broecke',
 'Bloom',
 "'",
 'n Ven',
 'Doorbraak',
 'Aadorp',
 

In [18]:
# Show the second list returned by evaluate.evaluate for DutchPolicyDocs
# (presumably the missed gold toponyms, i.e. false negatives — confirm).
fns

['Noordhollandsch Kanaal',
 'het IJ',
 'Metropoolregio Amsterdam',
 'Metropoolregio Amsterdam',
 'Alkmaar-Noord',
 'Alkmaar',
 'Overloopwijken',
 'Alkmaar',
 'Regio Alkmaar',
 'Noord-Holland',
 'Metropoolregio Amsterdam',
 'Noord-Holland-Noord',
 'Noord-Holland-Noord',
 'Metropoolregio Amsterdam',
 'gemeente Alkmaar',
 'Metropoolregio Amsterdam',
 'Noord-Holland-Noord',
 'PEN-dorp',
 'Overloopwijken',
 'Alkmaarderhout',
 'De Kop van Zuid',
 'De Compacte Kern',
 'Het Gravenkwartier',
 'Almelose en Exoosche Aa',
 'Lateraal kanaal',
 'kanaal Almelo - de Haandrik',
 'Koornmarkt',
 'de Doelen',
 'de Koornmarkt',
 'Grotestraat-Kerkstraat',
 'Molenstraat',
 'de Doorbraak',
 'Aadorp',
 'kanaal Almelo-De Haandrik',
 'Bendien',
 'Stoomspinnerij Twente',
 'Schelfhorst',
 'Vriezenveen',
 'Noord-Brabant',
 'Baarle-Nassau',
 'Baarle-Nassau',
 'Baarle-Nassau',
 'Baarle- Hertog',
 'Baarle-Hertog',
 'gemeente Baarle-Nassau',
 'Baarle-Hertog',
 'Baarle-Nassau',
 'Baarle-Hertog',
 'Breda',
 'Baarle-Nassa

## TR-News

### Loading the dataset

In [19]:
import loading_functions

# Location of the TR-News corpus, relative to the working directory.
file_path = '../../../data/TR-News/TR-News.xml'

# Load the corpus twice: once with filtered=False, once with filtered=True.
data_all_toponyms = loading_functions.prepare_data(
    file_path, filtered=False, split=split
)
data_filtered_toponyms = loading_functions.prepare_data(
    file_path, filtered=True, split=split
)

### Processing the data for Huggingface Trainer

In [20]:
from transformers import AutoTokenizer
    
# Load the tokenizer from model_path. model_path is not reassigned in the
# visible cells, so this reloads the same tokenizer as the earlier section.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [21]:
import preparing_dataset

# Build one dataset per variant; both are later passed to test_trainer.predict.
# NOTE(review): the exact encoding is defined by the project helper — confirm.
TRN = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

TRN_filtered = preparing_dataset.prepare_dataset(data_filtered_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [22]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Inference-only Trainer: no TrainingArguments are supplied, so the library
# defaults apply.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [23]:
import numpy as np

# Trainer.predict returns a PredictionOutput named tuple. Read its
# `predictions` field directly instead of unpacking into `_`, which would
# rebind the name IPython uses for the last cell output.
raw_pred = test_trainer.predict(TRN).predictions
# Highest-scoring index along axis 2 (presumably the label axis — confirm).
predictions = np.argmax(raw_pred, axis=2)

# Same steps for the filtered variant of the dataset.
raw_pred_filtered = test_trainer.predict(TRN_filtered).predictions
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [24]:
import process_predictions

# Post-process the predicted label indices for each variant; the helper
# receives the matching dataset, the label names and the tokenizer.
# NOTE(review): output format is defined by the project helper — confirm there.
processed_results = process_predictions.process_predictions(predictions, TRN, label_list, tokenizer)

processed_results_filtered = process_predictions.process_predictions(predictions_filtered, TRN_filtered, label_list, tokenizer)

### Evaluation TR-News

In [25]:
import evaluate

# NOTE(review): `evaluate` is expected to resolve to the project's helper on
# the '../utils' path, not a pip-installed package of the same name — confirm.
# All toponyms
# The call prints the fp/tp/fn counts and precision/recall/f-score shown in
# the cell output and returns two lists.
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 316 | tp: 977 | fn: 323
precision: 0.756 | recall: 0.752 | f-score: 0.754


In [26]:
# Show the first list returned by the all-toponyms TR-News evaluation
# (presumably the false-positive predictions — confirm in the helper).
fps

['White House',
 'White House',
 'Interstate 64',
 'South Regional Jail',
 'U',
 '.',
 'S',
 '.',
 'Rose Garden',
 'Rose Garden',
 'Ronald Reagan Building',
 'U',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'Southern District of Ohio',
 'U',
 '.',
 'S',
 '.',
 'San Antonio Four',
 'Bantam Superior Court',
 'Wooster St.',
 'Litchfield Superior Court',
 'Cumberland Farms',
 'South Main St',
 '-',
 'North West',
 'Twitter',
 'Facebook',
 'Kremlin',
 'Kremlin',
 'Downtown Eastside',
 'London City',
 'London City',
 'Central Anatolia',
 'Rhineland',
 '-',
 'Palatinate',
 'Film',
 'Dolby Theatre',
 'B',
 '.',
 'C',
 '.',
 'Macdonald',
 '-',
 'Cartier International Airport',
 'Montreal',
 '-',
 'Pierre Elliott Trudeau International Airport',
 'Mont',
 '-',
 'Royal Avenue',
 'Sage House',
 "Chi

In [27]:
# Show the second list returned by the all-toponyms TR-News evaluation
# (presumably the missed gold toponyms, i.e. false negatives — confirm).
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Kurdish',
 'Turkish',
 'Russian',
 'Syrian',
 'Russian',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Russian',
 'Turkish',
 'Russian',
 'Russian',
 'Russian',
 'Syrian',
 'U.S.',
 'Texas',
 'Texas',
 'Texas',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'Xavier University',
 'British',
 'U.S.',
 'European',
 'U.S.',
 'U.S.',
 'U.S.',
 'Ohio',
 'DETROIT',
 'U.S.',
 'U.S.',
 'New York',
 'Michigan',
 'Cuban',
 'BANTAM',
 'Bantam',
 'Russian',
 'Russian',
 'Russian',
 'Russian',
 'London',
 'London',
 'Anatolia',
 'German',
 'Iraqi',
 'Rhineland-Palatinate',
 'Iraqi',
 'German',
 'Spanish',
 'Canadian',
 'Cannes',
 'Spanish',
 'France',
 'French',
 'French',
 'Spanish',
 'Berlin',
 'German',
 'European',
 'American',
 'Chinese',
 'B.C.',
 'Macdonald-Cartier International Airport',
 'Montreal-Pierre Elliott Trudeau International Airport',
 'Canada',
 'Mont-Royal Avenu

In [28]:
# Filtered toponyms
# Rebinds fps/fns, replacing the lists from the all-toponyms evaluation above.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results_filtered)

fp: 349 | tp: 942 | fn: 314
precision: 0.730 | recall: 0.750 | f-score: 0.740


In [29]:
# Show the first list returned by the filtered TR-News evaluation
# (presumably the false-positive predictions — confirm in the helper).
fps

['White House',
 'White House',
 'Interstate 64',
 'South Regional Jail',
 'U',
 '.',
 'S',
 '.',
 'Rose Garden',
 'Rose Garden',
 'Ronald Reagan Building',
 'U',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'Southern District of Ohio',
 'U',
 '.',
 'S',
 '.',
 'San Antonio Four',
 'Bantam Superior Court',
 'Wooster St.',
 'Litchfield Superior Court',
 'Cumberland Farms',
 'South Main St',
 '-',
 'North West',
 'Twitter',
 'Facebook',
 'Kremlin',
 'Kremlin',
 'Downtown Eastside',
 'London City',
 'London City',
 'Central Anatolia',
 'Rhineland',
 '-',
 'Palatinate',
 'Film',
 'Dolby Theatre',
 'B',
 '.',
 'C',
 '.',
 'West Coast',
 'Macdonald',
 '-',
 'Cartier International Airport',
 'Montreal',
 '-',
 'Pierre Elliott Trudeau International Airport',
 'Mont',
 '-',
 'Royal Avenue',
 'Sag

In [30]:
# Show the second list returned by the filtered TR-News evaluation
# (presumably the missed gold toponyms, i.e. false negatives — confirm).
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Turkish',
 'Russian',
 'Syrian',
 'Russian',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Russian',
 'Turkish',
 'Russian',
 'Russian',
 'Russian',
 'Syrian',
 'U.S.',
 'Texas',
 'Texas',
 'Texas',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'Xavier University',
 'British',
 'U.S.',
 'European',
 'U.S.',
 'U.S.',
 'U.S.',
 'Ohio',
 'DETROIT',
 'U.S.',
 'U.S.',
 'New York',
 'Michigan',
 'Cuban',
 'BANTAM',
 'Bantam',
 'Russian',
 'Russian',
 'Russian',
 'Russian',
 'London',
 'London',
 'Anatolia',
 'German',
 'Iraqi',
 'Rhineland-Palatinate',
 'Iraqi',
 'German',
 'Spanish',
 'Canadian',
 'Cannes',
 'Spanish',
 'France',
 'French',
 'French',
 'Spanish',
 'Berlin',
 'German',
 'European',
 'American',
 'Chinese',
 'B.C.',
 'Macdonald-Cartier International Airport',
 'Montreal-Pierre Elliott Trudeau International Airport',
 'Canada',
 'Canada',
 'Man.',
 'Canada',

## LGL

### Loading the dataset

In [31]:
import loading_functions

# Location of the LGL corpus, relative to the working directory.
file_path = '../../../data/LGL/LGL.xml'

# Load the corpus twice: once with filtered=False, once with filtered=True.
data_all_toponyms = loading_functions.prepare_data(
    file_path, filtered=False, split=split
)
data_filtered_toponyms = loading_functions.prepare_data(
    file_path, filtered=True, split=split
)

### Processing the data for Huggingface Trainer

In [32]:
from transformers import AutoTokenizer
    
# Load the tokenizer from model_path. model_path is not reassigned in the
# visible cells, so this reloads the same tokenizer as the earlier sections.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [33]:
import preparing_dataset

# Build one dataset per variant; both are later passed to test_trainer.predict.
# NOTE(review): the exact encoding is defined by the project helper — confirm.
LGL = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

LGL_filtered = preparing_dataset.prepare_dataset(data_filtered_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [34]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Inference-only Trainer: no TrainingArguments are supplied, so the library
# defaults apply.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [35]:
import numpy as np

# Trainer.predict returns a PredictionOutput named tuple. Read its
# `predictions` field directly instead of unpacking into `_`, which would
# rebind the name IPython uses for the last cell output.
raw_pred = test_trainer.predict(LGL).predictions
# Highest-scoring index along axis 2 (presumably the label axis — confirm).
predictions = np.argmax(raw_pred, axis=2)

# Same steps for the filtered variant of the dataset.
raw_pred_filtered = test_trainer.predict(LGL_filtered).predictions
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [36]:
import process_predictions

# Post-process the predicted label indices for each variant; the helper
# receives the matching dataset, the label names and the tokenizer.
# NOTE(review): output format is defined by the project helper — confirm there.
processed_results = process_predictions.process_predictions(predictions, LGL, label_list, tokenizer)

processed_results_filtered = process_predictions.process_predictions(predictions_filtered, LGL_filtered, label_list, tokenizer)

### Evaluation LGL

In [37]:
import evaluate

# NOTE(review): `evaluate` is expected to resolve to the project's helper on
# the '../utils' path, not a pip-installed package of the same name — confirm.
# All toponyms
# The call prints the fp/tp/fn counts and precision/recall/f-score shown in
# the cell output and returns two lists.
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 1799 | tp: 3381 | fn: 1588
precision: 0.653 | recall: 0.680 | f-score: 0.666


In [38]:
# Show the first list returned by the all-toponyms LGL evaluation
# (presumably the false-positive predictions — confirm in the helper).
fps

['Kelleyland',
 'Orchard St.',
 'Cottonport Fire Station',
 'Warden',
 'St',
 '. James Youth Detention Center',
 'Pointe Coupe',
 'Gannett',
 'DeSoto',
 'R',
 '-',
 'Otter Tail County Road',
 'Highway',
 'Otter Tail',
 'Grant',
 'Grant /',
 'Wilkin',
 'Douglas County Road 4',
 'Highway 12',
 'Highway',
 '-',
 'Hesco',
 'Fargodome',
 'Sandbag Central',
 'Road',
 'County Road',
 'Road',
 'Douglas County Hospital',
 'St',
 'Ky',
 '.',
 'Conn',
 '.',
 'S',
 '.',
 'D',
 '.',
 'Ariz',
 '.',
 'Alexandria City',
 'Minnie Howard',
 'Durant Center',
 'Washington Latin School',
 'Washington Latin Public Charter School',
 'D. C.',
 'Washington Latin School',
 'Washington Latin School Charter School',
 'Northwest D. C.',
 'D',
 '.',
 'C',
 '.',
 'Washington Latin School',
 'East End',
 'Sub',
 '-',
 'Saharan Africa',
 'Egypt',
 'Sudan',
 'Egypt',
 'Sudan',
 'Egypt',
 'Sudan',
 'Egypt',
 'Sudan',
 'Misri',
 'Gulf',
 'Madrid',
 'Gulf',
 'US',
 'US',
 'US',
 'Gulf',
 'US',
 'Gulf',
 'US',
 'Sharm El -

In [39]:
# Show the second list returned by the all-toponyms LGL evaluation
# (presumably the missed gold toponyms, i.e. false negatives — confirm).
fns

['Rapides Parish',
 'Avoyelles',
 'Cottonport',
 'MANSFIELD',
 'Mansfield',
 'Shreveport',
 'DeSoto Parish',
 'Cook',
 'Minnesota',
 'Highway 200',
 'Highway 10',
 'Otter Tail County',
 'Highway 108',
 'Otter Tail/Grant',
 'Grant/Wilkin',
 'Highway 114',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Alexandria',
 'Douglas County',
 'County Road 35',
 'County Road 56',
 'County Road 15',
 'County Road 96',
 'Douglas County',
 'St. Cloud',
 'Alexandria',
 'Ky.',
 'Conn.',
 'S.D.',
 'Ariz.',
 'Alexandria',
 'Alexandria',
 'D.C.',
 'Washington',
 'D.C.',
 'D.C.',
 'D.C.',
 'Alexandria',
 'Virginia',
 'Sudanese',
 'Chinese',
 'Africa',
 'African',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanese',
 'Sudanese',
 'Alexandria',
 'Sudanese',
 'Sri Lankan',
 'Sri Lankan',
 'Sinhalese',
 'Sinhalese',
 'Sinhalese',
 'Sri Lankan',
 'Sri Lankan',
 'Sri Lankan',
 'Sri Lan

In [40]:
# Filtered toponyms
# Rebinds fps/fns, replacing the lists from the all-toponyms evaluation above.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results_filtered)

fp: 2126 | tp: 2884 | fn: 1464
precision: 0.576 | recall: 0.663 | f-score: 0.616


In [41]:
# Show the first list returned by the filtered LGL evaluation
# (presumably the false-positive predictions — confirm in the helper).
fps

['Kelleyland',
 'Orchard St.',
 'Cottonport Fire Station',
 'Memphis St.',
 'Augusta St.',
 'St',
 '. James Youth Detention Center',
 'Pointe Coupe',
 'Gannett',
 'DeSoto',
 'R',
 '-',
 'Mahnomen County Road',
 'Mahnomen County Road',
 'Otter Tail County Road',
 'Highway',
 'Otter Tail',
 'Grant',
 'Grant /',
 'Wilkin',
 'County Road 43',
 'Douglas County Road 4',
 'Lake Mary',
 'Highway 12',
 'Benson',
 'Highway',
 '-',
 'Red River Valley',
 '40th Avenue South',
 'Hesco',
 'Fargodome',
 'Sandbag Central',
 'Road',
 'County Road',
 'Road',
 'Nokomis',
 'North Nokomis Street',
 'Darling Avenue',
 'Douglas County Hospital',
 'St',
 'County Road 109',
 'Van Dorn Street',
 'Ky',
 '.',
 'Conn',
 '.',
 'S',
 '.',
 'D',
 '.',
 'Ariz',
 '.',
 'Alexandria City',
 'Minnie Howard',
 'Durant Center',
 'Old Town',
 'Washington Latin School',
 'Old Town',
 'Washington Latin Public Charter School',
 'D. C.',
 'Old Town',
 'Washington Latin School',
 'Washington Latin School Charter School',
 'Northwe

In [42]:
# Show the second list returned by the filtered LGL evaluation
# (presumably the missed gold toponyms, i.e. false negatives — confirm).
fns

['Rapides Parish',
 'Avoyelles',
 'Cottonport',
 'MANSFIELD',
 'Mansfield',
 'Shreveport',
 'Cook',
 'Minnesota',
 'Otter Tail County',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Alexandria',
 'Douglas County',
 'Douglas County',
 'St. Cloud',
 'Alexandria',
 'Ky.',
 'Conn.',
 'S.D.',
 'Ariz.',
 'Alexandria',
 'Alexandria',
 'D.C.',
 'D.C.',
 'D.C.',
 'D.C.',
 'Alexandria',
 'Virginia',
 'Sudanese',
 'Chinese',
 'Africa',
 'African',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanese',
 'Sudanese',
 'Alexandria',
 'Sudanese',
 'Sri Lankan',
 'Sri Lankan',
 'Sinhalese',
 'Sinhalese',
 'Sinhalese',
 'Sri Lankan',
 'Sri Lankan',
 'Sri Lankan',
 'Sri Lankan',
 'Sri Lankan',
 'Sri Lankan',
 'Sri Lankan',
 'Sinhalese',
 'Sinhalese',
 'Sri Lankan',
 'Sri Lankan',
 'Zamalek',
 'Zamalek',
 'Egypt',
 'Egyptian',
 'US',
 'Egyptian',
 'US',
 'Egyptian',
 'Israeli',
 '

## GeoWebNews

### Loading the dataset

In [43]:
import loading_functions

# Location of the GeoWebNews corpus, relative to the working directory.
file_path = '../../../data/GeoWebNews/GeoWebNews.xml'

# Load the corpus twice: once with filtered=False, once with filtered=True.
data_all_toponyms = loading_functions.prepare_data(
    file_path, filtered=False, split=split
)
data_filtered_toponyms = loading_functions.prepare_data(
    file_path, filtered=True, split=split
)

### Processing the data for Huggingface Trainer

In [44]:
from transformers import AutoTokenizer
    
# Load the tokenizer from model_path. model_path is not reassigned in the
# visible cells, so this reloads the same tokenizer as the earlier sections.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [45]:
import preparing_dataset

# Build one dataset per variant; both are later passed to test_trainer.predict.
# NOTE(review): the exact encoding is defined by the project helper — confirm.
GWN = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

GWN_filtered = preparing_dataset.prepare_dataset(data_filtered_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [46]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Inference-only Trainer: no TrainingArguments are supplied, so the library
# defaults apply.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [47]:
import numpy as np

# Trainer.predict returns a PredictionOutput named tuple. Read its
# `predictions` field directly instead of unpacking into `_`, which would
# rebind the name IPython uses for the last cell output.
raw_pred = test_trainer.predict(GWN).predictions
# Highest-scoring index along axis 2 (presumably the label axis — confirm).
predictions = np.argmax(raw_pred, axis=2)

# Same steps for the filtered variant of the dataset.
raw_pred_filtered = test_trainer.predict(GWN_filtered).predictions
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [48]:
import process_predictions

# Post-process the predicted label indices for each variant; the helper
# receives the matching dataset, the label names and the tokenizer.
# NOTE(review): output format is defined by the project helper — confirm there.
processed_results = process_predictions.process_predictions(predictions, GWN, label_list, tokenizer)

processed_results_filtered = process_predictions.process_predictions(predictions_filtered, GWN_filtered, label_list, tokenizer)

### Evaluation GWN

In [49]:
import evaluate

# NOTE(review): `evaluate` is expected to resolve to the project's helper on
# the '../utils' path, not a pip-installed package of the same name — confirm.
# All toponyms
# The call prints the fp/tp/fn counts and precision/recall/f-score shown in
# the cell output and returns two lists.
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 498 | tp: 1653 | fn: 3711
precision: 0.768 | recall: 0.308 | f-score: 0.440


In [50]:
# Show the first list returned by the all-toponyms GeoWebNews evaluation
# (presumably the false-positive predictions — confirm in the helper).
fps

['Louisiana Purchase',
 'Chartres /',
 'D',
 "'",
 'Aunoy',
 'Royal',
 'Champs',
 '-',
 'Élysées',
 'Smoky Mary',
 'Stonewall',
 'Club',
 'Bath',
 'Buckland',
 'Vienna',
 'Fairfax',
 '-',
 'GMU',
 'Orange',
 'Turtle Creek Cir',
 'Va',
 '.',
 'Kremlin',
 'Pope',
 'St',
 ". Peter's Basilica",
 'White House',
 'White House',
 'Below Deck',
 'Manila',
 'Manila',
 'Bethany',
 'Commonwealth of Independent States',
 'CIS',
 'CIS',
 'CIS',
 'North America',
 'Gate 10',
 'Gate 10',
 'Washington',
 'D',
 '.',
 'C',
 '.',
 'U',
 'West',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Kremlin',
 '-',
 '-',
 'State of Tamil Nadu',
 'No',
 '. 2 Detention Centre',
 'Tianjin No',
 '. 2 Detention Centre',
 'Kyjev',
 'Eva',
 "'",
 's Village',
 'Eva',
 "'",
 's Village',
 'New Jersey,',
 'Facebook',
 'Vanni',
 'MEA',
 'Bakgatla',
 '-',
 'ba',
 '-',
 'Kgafela',
 'Bakgatla',
 'Washington, D. C.',
 'D',
 '.',
 'C',
 '.',
 'U',
 'North',
 'Church',
 'U',
 'Benue',
 'Benue',
 'Market at',
 'FATA',
 'Pakist

In [51]:
# Show the second list returned by the all-toponyms GeoWebNews evaluation
# (presumably the missed gold toponyms, i.e. false negatives — confirm).
fns

['area',
 'plantation',
 'mansion',
 'substation',
 'Louisiana',
 'Louisiana Purchase',
 'parcel',
 'French',
 'plat',
 'squares',
 'neighborhood',
 'city',
 'street',
 'avenue',
 'faubourg',
 'area',
 'neighbor',
 'building',
 'Chartres',
 'Methodist church',
 'restaurant',
 'street',
 'complex',
 '2231 Royal',
 '2231 Royal',
 'townhouse',
 'basement',
 'Greek',
 'structures',
 'system',
 'neighborhoods',
 'edifice',
 'grid',
 'avenue',
 'park',
 'Champs-Élysées',
 'Pontchartrain Railroad',
 'Pontchartrain Railroad',
 'line',
 'block',
 'edifice',
 'Carnegie Library',
 'church',
 'library',
 'lot',
 'Kurdish',
 'city',
 'Turkish',
 'Syrian',
 'Kurdish',
 'towns',
 'Syrian',
 'Turkish',
 'campaign',
 'Syrian',
 'Kurdish',
 'frontier',
 'forces',
 'Turkish',
 'Britain',
 'Syrian',
 'Syrian Observatory for Human Rights Monitoring',
 'group',
 'Kurdish',
 'Turkish',
 'Syrian',
 'clashes',
 'community',
 'style',
 'design',
 'residences',
 'garage',
 'clubroom',
 'terrace',
 'yard',
 'porc

In [52]:
# Filtered toponyms
# Rebinds fps/fns, replacing the lists from the all-toponyms evaluation above.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results_filtered)

fp: 502 | tp: 1640 | fn: 857
precision: 0.766 | recall: 0.657 | f-score: 0.707


In [53]:
# Show the first list returned by the filtered GeoWebNews evaluation
# (presumably the false-positive predictions — confirm in the helper).
fps

['Bernard Xavier Philippe de Marigny de Mandeville',
 'Louisiana Purchase',
 'Royal',
 'Desire',
 'Champs',
 '-',
 'Élysées',
 'Smoky Mary',
 'Jamison',
 'Norbury',
 'Granville',
 'Somerville',
 'Jamison',
 'Norbury',
 'Norbury',
 'Stonewall',
 'Club',
 'Buckland',
 'Vienna',
 'Fairfax',
 '-',
 'GMU',
 'Orange',
 'Turtle Creek Cir',
 'Va',
 'Kremlin',
 'Pope',
 'St',
 ". Peter's Basilica",
 'White House',
 'Below Deck Mediterranean',
 'Below Deck',
 'Manila',
 'Manila',
 'Commonwealth of Independent States',
 'CIS',
 'CIS',
 'CIS',
 'North America',
 'Gate 10',
 'Gate 10',
 'Washington',
 'D',
 '.',
 'C',
 '.',
 'U',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Kremlin',
 '-',
 '-',
 'Tamil',
 'Nadu',
 'States',
 'State of Tamil Nadu',
 'No',
 '. 2 Detention Centre',
 'No',
 '. 2 Detention Centre',
 'Kyjev',
 'Eva',
 "'",
 's Village',
 'Eva',
 "'",
 's Village',
 'New Jersey,',
 'Bakgatla',
 '-',
 'ba',
 '-',
 'Kgafela',
 'Bakgatla',
 'Washington',
 'D. C.',
 'D',
 '.',
 'C',
 '.

In [54]:
# Show the second list returned by the filtered GeoWebNews evaluation
# (presumably the missed gold toponyms, i.e. false negatives — confirm).
fns

['Louisiana',
 'French',
 'German',
 'Irish',
 'Methodist church',
 '2231 Royal',
 'Greek',
 'New Orleans Railways and Light Company Claiborne Power House',
 'French',
 'Champs-Élysées',
 'Pontchartrain Railroad',
 'Carnegie Library',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Syrian',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Turkish',
 'Britain',
 'Syrian',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Stonewall Golf Club',
 'Buckland Elementary',
 'Wegmans',
 'Vienna/Fairfax-GMU Metro',
 'Washington',
 'Washington',
 'Florida Atlantic University',
 'California Polytechnic State University',
 'California State University',
 'Washington',
 'African',
 'African',
 'African',
 'African',
 'African',
 'African',
 'Russian',
 'Russian',
 'Russian Higher School of Economics',
 'Mississippi',
 'Louisville',
 "St. Peter's Basilica",
 'Nigerian',
 'Nigerian',
 'New York',
 'Mediterranean',
 'Manila Cathedral',
 'Manila Cathedral',
 'France',
 'EU',
 'EU',
 'EU',
 'European',
 'European',
 'European',