In [1]:
import sys

# Make the helper modules in ../utils importable by the cells below.
# Index 1 leaves sys.path[0] untouched while still placing the folder ahead
# of the remaining search-path entries. The path is relative, so it resolves
# against the kernel's current working directory.
# The membership guard keeps the cell idempotent: re-running it no longer
# stacks duplicate '../utils' entries on sys.path.
if '../utils' not in sys.path:
    sys.path.insert(1, '../utils')

## Loading Fine-tuned mBERT model and predictions for non-labelled dataset

In [2]:
from transformers import AutoModelForTokenClassification

# Directory of the fine-tuned checkpoint; the tokenizer cells load from the same path.
model_path = '../models/ner-multilingual-bert-fine-tuned-conll-2003'

# Tag inventory: its length sizes the classification head here, and the list
# itself is handed to process_predictions later on.
# NOTE(review): presumably ordered to match the label ids used during fine-tuning — confirm.
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(label_list),
)

## TR-News

### Loading the dataset

In [3]:
import loading_functions

# Location of the TR-News corpus (XML), relative to the working directory
file_path = '../../../data/TR-News/TR-News.xml'

# Load the corpus twice from the same file: once with filtered=False and once with filtered=True.
# NOTE(review): the exact effect of `filtered` and `split` is defined in loading_functions — confirm there.
data_all_toponyms = loading_functions.prepare_data(file_path, filtered=False, split=False)
data_filtered_toponyms = loading_functions.prepare_data(file_path, filtered=True, split=False)

### Processing the data for Huggingface Trainer

In [4]:
from transformers import AutoTokenizer
    
# Load the tokenizer from the same checkpoint directory as the model so both
# come from one saved checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
import preparing_dataset

# Prepare both TR-News variants with the tokenizer; the results are passed to
# test_trainer.predict and process_predictions in later cells.
# NOTE(review): the structure returned by prepare_dataset is defined in preparing_dataset — confirm there.
TRN = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

TRN_filtered = preparing_dataset.prepare_dataset(data_filtered_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [6]:
from transformers import DataCollatorForTokenClassification, Trainer

# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# No TrainingArguments are supplied: this Trainer is only used for predict() calls.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [7]:
import numpy as np

# Run inference on both TR-News variants. Trainer.predict returns a
# PredictionOutput named tuple (predictions, label_ids, metrics); read the
# `predictions` field by name rather than unpacking into `_`, which would
# rebind IPython's last-output variable for the rest of the session.
raw_pred = test_trainer.predict(TRN).predictions
# Keep the index of the highest score along the last axis of the 3-D output.
# assumes raw_pred is (examples, tokens, labels) — TODO confirm
predictions = np.argmax(raw_pred, axis=2)

raw_pred_filtered = test_trainer.predict(TRN_filtered).predictions
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [8]:
import process_predictions

# Post-process the predicted label ids for each TR-News variant, using the
# prepared dataset, the label list and the tokenizer; the results are what
# evaluate.evaluate receives below.
# NOTE(review): the output format is defined in process_predictions — confirm there.
processed_results = process_predictions.process_predictions(predictions, TRN, label_list, tokenizer)

processed_results_filtered = process_predictions.process_predictions(predictions_filtered, TRN_filtered, label_list, tokenizer)

### Evaluation TR-News

In [9]:
import evaluate

# All toponyms: score the processed predictions against the unfiltered TR-News data.
# The recorded cell output shows fp/tp/fn counts plus precision, recall and f-score;
# the two returned lists are displayed in the following cells.
# NOTE(review): `evaluate` is expected to resolve to the helper module in ../utils
# (added to sys.path in the first cell), not an installed package of the same name — confirm.
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 268 | tp: 789 | fn: 300
precision: 0.746 | recall: 0.725 | f-score: 0.735


In [10]:
# Display the first list returned by evaluate.evaluate for the all-toponyms TR-News run
# (presumably the false-positive predictions, cf. the printed `fp` count — confirm in evaluate).
fps

['White House',
 'U',
 '.',
 'S',
 '.',
 'Rose Garden',
 'Southern Poverty Law Center',
 'Ronald Reagan Building',
 'U',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Southern',
 'Wooster St.',
 'Cumberland Farms',
 'South Main St',
 '-',
 'North West',
 'Kremlin',
 'Kremlin',
 'Downtown Eastside',
 'London City',
 'London City',
 'Central Anatolia',
 'Rhineland',
 '-',
 'Palatinate',
 'Dolby Theatre',
 'Air Berlin',
 'B',
 '.',
 'C',
 '.',
 'Macdonald',
 '-',
 'Cartier International Airport',
 'Montreal',
 '-',
 'Pierre Elliott Trudeau International Airport',
 'Mont',
 '-',
 'Royal Avenue',
 'Sage House',
 "Children's Hospital of Manitoba",
 "' s Hospital",
 'St',
 'Phoenix',
 'Phoenix',
 'Que',
 '.',
 'Crusader',
 'Karak',
 'Paris Town Hall',
 'Nazi Germany',
 'Petit Cambodge',
 'Peter',
 'Paul',
 'St',
 '. Mark',
 'Coptic Orthodox',
 'Coptic Church',
 'Orthodox Churc

In [11]:
# Display the second list returned by evaluate.evaluate for the all-toponyms TR-News run
# (presumably the missed gold toponyms, cf. the printed `fn` count — confirm in evaluate).
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Kurdish',
 'Turkish',
 'Russian',
 'Granville County',
 'WASHINGTON',
 'U.S.',
 'Texas',
 'Texas',
 'Texas',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'Xavier University',
 'British',
 'U.S.',
 'European',
 'U.S.',
 'U.S.',
 'U.S.',
 'DETROIT',
 'U.S.',
 'U.S.',
 'New York',
 'Michigan',
 'Cuban',
 'BANTAM',
 'Bantam',
 'New Milford',
 'TORRINGTON',
 'Russian',
 'Russian',
 'Russian',
 'Russia',
 'Russian',
 'London',
 'London',
 'Anatolia',
 'German',
 'Iraqi',
 'Rhineland-Palatinate',
 'Iraqi',
 'German',
 'Spanish',
 'Venice',
 'Canadian',
 'Cannes',
 'Spanish',
 'France',
 'French',
 'French',
 'Spanish',
 'BERLIN',
 'Berlin',
 'German',
 'European',
 'American',
 'Chinese',
 'B.C.',
 'Macdonald-Cartier International Airport',
 'Montreal-Pierre Elliott Trudeau International Airport',
 'Canada',
 'Mont-Royal Avenue',
 'Canada',
 'Manitoba',
 'Canada',
 'Toronto',
 "St. Michael's Hospital",
 'Que.',
 'Jordanian'

In [12]:
# Filtered toponyms: repeat the evaluation on the filtered TR-News data.
# This rebinds `fps` and `fns`, so the lists from the all-toponyms run are
# no longer available after this cell.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results_filtered)

fp: 302 | tp: 753 | fn: 293
precision: 0.714 | recall: 0.720 | f-score: 0.717


In [13]:
# Display the first list returned by evaluate.evaluate for the filtered TR-News run
# (presumably the false-positive predictions — confirm in evaluate).
fps

['White House',
 'U',
 '.',
 'S',
 '.',
 'Rose Garden',
 'Southern Poverty Law Center',
 'Ronald Reagan Building',
 'U',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Southern',
 'Wooster St.',
 'Cumberland Farms',
 'South Main St',
 '-',
 'North West',
 'Kremlin',
 'Kremlin',
 'Downtown Eastside',
 'London City',
 'London City',
 'Central Anatolia',
 'Rhineland',
 '-',
 'Palatinate',
 'Dolby Theatre',
 'Air Berlin',
 'B',
 '.',
 'C',
 '.',
 'West Coast',
 'Macdonald',
 '-',
 'Cartier International Airport',
 'Montreal',
 '-',
 'Pierre Elliott Trudeau International Airport',
 'Mont',
 '-',
 'Royal Avenue',
 'Sage House',
 "Children's Hospital of Manitoba",
 "' s Hospital",
 'College Street',
 'Dufferin Street',
 'St',
 'Phoenix',
 'Phoenix',
 'Que',
 '.',
 'Crusader',
 'Karak',
 'Karak Castle',
 'Paris Town Hall',
 'Nazi Germany',
 'Peter',
 'Paul',
 'St',
 '. Mark',
 '

In [14]:
# Display the second list returned by evaluate.evaluate for the filtered TR-News run
# (presumably the missed gold toponyms — confirm in evaluate).
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Turkish',
 'Russian',
 'Granville County',
 'WASHINGTON',
 'U.S.',
 'Texas',
 'Texas',
 'Texas',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'Xavier University',
 'British',
 'U.S.',
 'European',
 'U.S.',
 'U.S.',
 'U.S.',
 'DETROIT',
 'U.S.',
 'U.S.',
 'New York',
 'Michigan',
 'Cuban',
 'BANTAM',
 'Bantam',
 'New Milford',
 'TORRINGTON',
 'Russian',
 'Russian',
 'Russian',
 'Russia',
 'Russian',
 'London',
 'London',
 'Anatolia',
 'German',
 'Iraqi',
 'Rhineland-Palatinate',
 'Iraqi',
 'German',
 'Spanish',
 'Venice',
 'Canadian',
 'Cannes',
 'Spanish',
 'France',
 'French',
 'French',
 'Spanish',
 'BERLIN',
 'Berlin',
 'German',
 'European',
 'American',
 'Chinese',
 'B.C.',
 'Macdonald-Cartier International Airport',
 'Montreal-Pierre Elliott Trudeau International Airport',
 'Canada',
 'Canada',
 'Manitoba',
 'Canada',
 'Toronto',
 "St. Michael's Hospital",
 'Que.',
 'Jordanian',
 'Jordanian',
 'Jordanian',
 'Ka

## LGL

### Loading the dataset

In [15]:
import loading_functions

# Location of the LGL corpus (XML), relative to the working directory.
# From here on `file_path` and both `data_*_toponyms` names refer to LGL,
# replacing the TR-News values assigned earlier.
file_path = '../../../data/LGL/LGL.xml'

# Same two loads as for TR-News: filtered=False first, then filtered=True.
data_all_toponyms = loading_functions.prepare_data(file_path, filtered=False, split=False)
data_filtered_toponyms = loading_functions.prepare_data(file_path, filtered=True, split=False)

### Processing the data for Huggingface Trainer

In [16]:
from transformers import AutoTokenizer
    
# Reloads the tokenizer from the unchanged model_path. When cells run in order
# an equivalent tokenizer is already in scope from the TR-News section, so this
# reload looks redundant; it is kept so the LGL section can be run on its own
# once model_path is defined.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [17]:
import preparing_dataset

# Prepare both LGL variants with the tokenizer; the results are passed to
# test_trainer.predict and process_predictions in later cells.
LGL = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

LGL_filtered = preparing_dataset.prepare_dataset(data_filtered_toponyms, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [18]:
from transformers import DataCollatorForTokenClassification, Trainer

# Rebuild the collator and Trainer for the LGL section from the current
# tokenizer and the model loaded at the top of the notebook.
data_collator = DataCollatorForTokenClassification(tokenizer)

# No TrainingArguments are supplied: this Trainer is only used for predict() calls.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [19]:
import numpy as np

# Run inference on both LGL variants. Trainer.predict returns a
# PredictionOutput named tuple (predictions, label_ids, metrics); read the
# `predictions` field by name rather than unpacking into `_`, which would
# rebind IPython's last-output variable for the rest of the session.
raw_pred = test_trainer.predict(LGL).predictions
# Keep the index of the highest score along the last axis of the 3-D output.
# assumes raw_pred is (examples, tokens, labels) — TODO confirm
predictions = np.argmax(raw_pred, axis=2)

raw_pred_filtered = test_trainer.predict(LGL_filtered).predictions
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [20]:
import process_predictions

# Post-process the predicted label ids for each LGL variant; these names
# replace the TR-News results computed earlier in the notebook.
processed_results = process_predictions.process_predictions(predictions, LGL, label_list, tokenizer)

processed_results_filtered = process_predictions.process_predictions(predictions_filtered, LGL_filtered, label_list, tokenizer)

### Evaluation LGL

In [21]:
import evaluate

# All toponyms: score the processed predictions against the unfiltered LGL data.
# The recorded cell output shows fp/tp/fn counts plus precision, recall and f-score;
# the two returned lists are displayed in the following cells.
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 1391 | tp: 2735 | fn: 1528
precision: 0.663 | recall: 0.642 | f-score: 0.652


In [22]:
# Display the first list returned by evaluate.evaluate for the all-toponyms LGL run
# (presumably the false-positive predictions — confirm in evaluate).
fps

['Orchard St.',
 'Cottonport Fire Station',
 'St',
 '. James Youth Detention Center',
 'Minnesota',
 'Otter Tail',
 'Highway',
 'Otter Tail / Grant',
 'Grant /',
 'Wilkin',
 'Otter',
 'Tail',
 'Highway',
 '-',
 'Hesco',
 'Oakport',
 'Douglas County Hospital',
 'St',
 'R',
 '-',
 'Ky',
 '.',
 'Conn',
 '.',
 'S',
 '.',
 'D',
 '.',
 'Ariz',
 '.',
 'Cora Kelly',
 'City Hall',
 'Washington, D. C.',
 'East End',
 'Southern Sudan',
 'Sub',
 'Sri',
 'Gulf',
 'Madrid',
 'Gulf',
 'US',
 'US',
 'US',
 'US',
 'Sharm El - Sheikh',
 'St',
 ". John's Lutheran School",
 'Sheldon Peck Homestead',
 'Woodfield Shopping Center',
 'Streets of Woodfield',
 'Hudson',
 'Decatur',
 'U',
 '.',
 'S',
 '.',
 'Ill',
 '.',
 'Pe',
 'Tarrant',
 'Houston',
 'Big',
 'Law Enforcement Center',
 'City Hall',
 'E',
 '. Broad St.',
 'Mansfield Law Enforcement Center',
 'North Texas',
 'City Hall',
 'North Arlington',
 'Cooper Street',
 'Silkwood Trail',
 'South Arlington',
 'Sports Center',
 'Exxon',
 'Durango',
 'Durango',

In [23]:
# Display the second list returned by evaluate.evaluate for the all-toponyms LGL run
# (presumably the missed gold toponyms — confirm in evaluate).
fns

['Rapides Parish',
 'Cottonport',
 'Alexandria',
 'MANSFIELD',
 'Mansfield',
 'Shreveport',
 'Cook',
 'Minnesota',
 'Highway 200',
 'Mahnomen County Road',
 'Mahnomen County Road',
 'Highway 10',
 'Otter Tail County',
 'Highway 108',
 'Otter Tail/Grant',
 'Grant/Wilkin',
 'County Road 43',
 'Highway 114',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Otter Tail',
 'Alexandria',
 'Oakport Township',
 'Douglas County',
 'County Road 35',
 'County Road 56',
 'County Road 15',
 'County Road 96',
 'Nokomis',
 'Douglas County',
 'St. Cloud',
 'Alexandria',
 'Ky.',
 'Conn.',
 'S.D.',
 'Ariz.',
 'Alexandria',
 'Washington',
 'D.C.',
 'Virginia',
 'Sudanese',
 'Sudan',
 'Chinese',
 'Africa',
 'African',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Godapitiya',
 'Sri Lankan',
 'Sri Lankan',
 'Sinhalese',
 'Sinhalese',
 'Sinhalese',
 'Sri Lankan',
 'Egypt',
 'Egyptian',
 'US',
 'Egyptian',
 'US',
 'Egyptian',
 'Israeli',
 'Madrid Conference',
 'Palestinian',
 'I

In [24]:
# Filtered toponyms: repeat the evaluation on the filtered LGL data.
# This rebinds `fps` and `fns`, so the lists from the all-toponyms run are
# no longer available after this cell.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results_filtered)

fp: 1703 | tp: 2326 | fn: 1405
precision: 0.577 | recall: 0.623 | f-score: 0.599


In [25]:
# Display the first list returned by evaluate.evaluate for the filtered LGL run
# (presumably the false-positive predictions — confirm in evaluate).
fps

['Orchard St.',
 'Cottonport Fire Station',
 'Memphis St.',
 'Augusta St.',
 'St',
 '. James Youth Detention Center',
 'Minnesota',
 'Otter Tail',
 'Highway',
 'Otter Tail / Grant',
 'Grant /',
 'Wilkin',
 'Lake Mary',
 'Benson',
 'Otter',
 'Tail',
 'Highway',
 '-',
 'Red River Valley',
 '40th Avenue South',
 'Hesco',
 'Oakport',
 'North Nokomis Street',
 'Darling Avenue',
 'Douglas County Hospital',
 'St',
 'County Road 109',
 'R',
 '-',
 'Van Dorn Street',
 'Ky',
 '.',
 'Conn',
 '.',
 'S',
 '.',
 'D',
 '.',
 'Ariz',
 '.',
 'Cora Kelly',
 'Old Town',
 'Old Town',
 'Washington, D. C.',
 'Old Town',
 'East End',
 'Southern Sudan',
 'Sub',
 'Sri',
 'Cairo Stadium',
 'Gulf',
 'Madrid',
 'Gulf',
 'US',
 'US',
 'US',
 'US',
 'Sharm El - Sheikh',
 'North Monticello',
 'St',
 ". John's Lutheran School",
 'Babcock Grove',
 'Sheldon Peck Homestead',
 'Woodfield',
 'Martingale',
 'Woodfield Shopping Center',
 'Streets of Woodfield',
 'Hudson',
 'Evergreen Lake',
 'Decatur',
 'U',
 '.',
 'S',
 '.

In [26]:
# Display the second list returned by evaluate.evaluate for the filtered LGL run
# (presumably the missed gold toponyms — confirm in evaluate).
fns

['Rapides Parish',
 'Cottonport',
 'Alexandria',
 'MANSFIELD',
 'Mansfield',
 'Shreveport',
 'Cook',
 'Minnesota',
 'Otter Tail County',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Otter Tail',
 'Alexandria',
 'Oakport Township',
 'Douglas County',
 'Douglas County',
 'St. Cloud',
 'Alexandria',
 'Ky.',
 'Conn.',
 'S.D.',
 'Ariz.',
 'Alexandria',
 'Washington',
 'D.C.',
 'Virginia',
 'Sudanese',
 'Sudan',
 'Chinese',
 'Africa',
 'African',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sri Lankan',
 'Sri Lankan',
 'Sinhalese',
 'Sinhalese',
 'Sinhalese',
 'Sri Lankan',
 'Egypt',
 'Egyptian',
 'US',
 'Egyptian',
 'US',
 'Egyptian',
 'Israeli',
 'Palestinian',
 'Israeli',
 'Palestinian',
 'Palestinian',
 'Palestinian',
 'Palestinian',
 'Syrians',
 'Syrians',
 'Palestinians',
 'Palestinians',
 'Iranian',
 'Israeli',
 'Sharm El-Sheikh',
 'Americans',
 'Israeli',
 'Lombard',
 'Roselle',
 'Roselle',
 'Illinois',
 'Roselle',
 'Northbrook',
 'Ill.',
 'HUDSON'

## GeoWebNews

### Loading the dataset

In [27]:
import loading_functions

# Location of the GeoWebNews corpus (XML), relative to the working directory.
# From here on `file_path` and both `data_*_toponyms` names refer to GeoWebNews,
# replacing the LGL values assigned earlier.
file_path = '../../../data/GeoWebNews/GeoWebNews.xml'

# Same two loads as for the other corpora: filtered=False first, then filtered=True.
data_all_toponyms = loading_functions.prepare_data(file_path, filtered=False, split=False)
data_filtered_toponyms = loading_functions.prepare_data(file_path, filtered=True, split=False)

### Processing the data for Huggingface Trainer

In [28]:
from transformers import AutoTokenizer
    
# Reloads the tokenizer from the unchanged model_path. When cells run in order
# an equivalent tokenizer is already in scope from the earlier sections, so this
# reload looks redundant; it is kept so the GeoWebNews section can be run on its
# own once model_path is defined.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [29]:
import preparing_dataset

# Prepare both GeoWebNews variants with the tokenizer; the results are passed to
# test_trainer.predict and process_predictions in later cells.
GWN = preparing_dataset.prepare_dataset(data_all_toponyms, tokenizer)

GWN_filtered = preparing_dataset.prepare_dataset(data_filtered_toponyms, tokenizer)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Prepare evaluation trainer for predictions

In [30]:
from transformers import DataCollatorForTokenClassification, Trainer

# Rebuild the collator and Trainer for the GeoWebNews section from the current
# tokenizer and the model loaded at the top of the notebook.
data_collator = DataCollatorForTokenClassification(tokenizer)

# No TrainingArguments are supplied: this Trainer is only used for predict() calls.
test_trainer = Trainer(model=model, data_collator=data_collator)

### Make Predictions

In [31]:
import numpy as np

# Run inference on both GeoWebNews variants. Trainer.predict returns a
# PredictionOutput named tuple (predictions, label_ids, metrics); read the
# `predictions` field by name rather than unpacking into `_`, which would
# rebind IPython's last-output variable for the rest of the session.
raw_pred = test_trainer.predict(GWN).predictions
# Keep the index of the highest score along the last axis of the 3-D output.
# assumes raw_pred is (examples, tokens, labels) — TODO confirm
predictions = np.argmax(raw_pred, axis=2)

raw_pred_filtered = test_trainer.predict(GWN_filtered).predictions
predictions_filtered = np.argmax(raw_pred_filtered, axis=2)

### Process predictions

In [32]:
import process_predictions

# Post-process the predicted label ids for each GeoWebNews variant; these names
# replace the LGL results computed earlier in the notebook.
processed_results = process_predictions.process_predictions(predictions, GWN, label_list, tokenizer)

processed_results_filtered = process_predictions.process_predictions(predictions_filtered, GWN_filtered, label_list, tokenizer)

### Evaluation GWN

In [33]:
import evaluate

# All toponyms: score the processed predictions against the unfiltered GeoWebNews data.
# NOTE(review): the recorded output shows much lower recall here than in the filtered
# run further down, and the displayed `fns` list is dominated by lowercase common nouns
# (e.g. 'area', 'city'). This suggests the unfiltered annotations include generic
# location words that a CoNLL-2003 NER tag set does not cover — confirm against the dataset.
fps, fns = evaluate.evaluate(data_all_toponyms, processed_results)

fp: 345 | tp: 1216 | fn: 3064
precision: 0.779 | recall: 0.284 | f-score: 0.416


In [34]:
# Display the first list returned by evaluate.evaluate for the all-toponyms GeoWebNews run
# (presumably the false-positive predictions — confirm in evaluate).
fps

['D',
 "'",
 'Aunoy',
 'Royal',
 'Kremlin',
 'COLUM',
 'St',
 ". Peter's Basilica",
 'White House',
 'White House',
 'Bethany',
 'Commonwealth of Independent States',
 'CIS',
 'Emir',
 'North America',
 'Sheikh Akil',
 'Islamic State',
 'West',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Soviet',
 'No',
 'N',
 'Seasons',
 'Polisario',
 'Vanni',
 'DWTC',
 'GA',
 'CT',
 'CT',
 '-',
 'U',
 '.',
 'S',
 '.',
 'Pontifex',
 'Read More',
 'Benue',
 'Benue',
 'Guma',
 'Peshawar',
 'Park Wana',
 'New Quay',
 'Gilgit',
 '-',
 'Baltistan',
 'Gilgit',
 '-',
 'Baltistan',
 'Gilgit',
 '-',
 'Baltistan',
 'Gilgit',
 '-',
 'Baltistan',
 'Gilgit',
 '-',
 'Baltistan',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Old Bailey',
 'PA',
 'N',
 '.',
 'C',
 '.',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Caravan',
 'Great House of',
 'Cilicia',
 'Embassy',
 'Republic of Armenia',
 'Motherland',
 'Holy',
 'Armenia',
 '-',
 'Macho Nikuya',
 'Tat',
 'Matson Snr.',
 'Korea',
 'OKLAHOMA',
 'U',
 '.',
 'S',
 '.',
 'G

In [35]:
# Display the second list returned by evaluate.evaluate for the all-toponyms GeoWebNews run
# (presumably the missed gold toponyms — confirm in evaluate).
fns

['area',
 'plantation',
 'mansion',
 'substation',
 'Louisiana',
 'Louisiana Purchase',
 'parcel',
 'French',
 'plat',
 'squares',
 'neighborhood',
 'city',
 'community',
 'African Americans',
 'German',
 'Irish',
 'populations',
 'blocks',
 'residents',
 'blocks',
 'intersection',
 'mills',
 'plant',
 'stables',
 'factory',
 'barn',
 'streets',
 'street',
 'avenue',
 'faubourg',
 'area',
 'neighbor',
 'building',
 'Methodist church',
 'restaurant',
 'street',
 'complex',
 '2231 Royal',
 '2231 Royal',
 'townhouse',
 'basement',
 'Greek',
 'structures',
 'BEIRUT',
 'Kurdish',
 'city',
 'Turkish',
 'Syrian',
 'Kurdish',
 'towns',
 'Syrian',
 'Turkish',
 'campaign',
 'Syrian',
 'Kurdish',
 'frontier',
 'forces',
 'Turkish',
 'Britain',
 'Syrian',
 'Syrian Observatory for Human Rights Monitoring',
 'group',
 'Kurdish',
 'Turkish',
 'Syrian',
 'clashes',
 'community',
 'style',
 'design',
 'residences',
 'garage',
 'clubroom',
 'terrace',
 'yard',
 'porch',
 'house',
 'kitchen',
 'rooms',
 

In [36]:
# Filtered toponyms: repeat the evaluation on the filtered GeoWebNews data.
# This rebinds `fps` and `fns`, so the lists from the all-toponyms run are
# no longer available after this cell.
fps, fns = evaluate.evaluate(data_filtered_toponyms, processed_results_filtered)

fp: 356 | tp: 1200 | fn: 760
precision: 0.771 | recall: 0.612 | f-score: 0.683


In [37]:
# Display the first list returned by evaluate.evaluate for the filtered GeoWebNews run
# (presumably the false-positive predictions — confirm in evaluate).
fps

['D',
 "'",
 'Aunoy',
 'Royal',
 'Jamison',
 'Norbury',
 'Granville',
 'Somerville',
 'Jamison',
 'Norbury',
 'Norbury',
 'Kremlin',
 'COLUM',
 'St',
 ". Peter's Basilica",
 'White House',
 'White House',
 'Commonwealth of Independent States',
 'CIS',
 'Emir',
 'North America',
 'Sheikh Akil',
 'Islamic State',
 'CTV Saskatoon',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'West',
 'Soviet',
 'No',
 'N',
 'Seasons',
 'Vanni',
 'DWTC',
 'GA',
 'CT',
 'CT',
 '-',
 'U',
 '.',
 'S',
 '.',
 'Pontifex',
 'Read More',
 'Umuanunu',
 'Benue',
 'Benue',
 'Guma',
 'Peshawar',
 'Park Wana',
 'New Quay',
 'Gilgit',
 '-',
 'Baltistan',
 'Gilgit',
 '-',
 'Baltistan',
 'Gilgit',
 '-',
 'Baltistan',
 'Gilgit',
 '-',
 'Baltistan',
 'Gilgit',
 '-',
 'Baltistan',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Montana',
 'Old Bailey',
 'PA',
 'N',
 '.',
 'C',
 '.',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Caravan',
 'Great House of',
 'Cilicia',
 'Embassy',
 'Republic of Armenia',
 'Motherland',
 'Holy',
 '-

In [38]:
# Display the second list returned by evaluate.evaluate for the filtered GeoWebNews run
# (presumably the missed gold toponyms — confirm in evaluate).
fns

['Louisiana',
 'French',
 'German',
 'Irish',
 'Methodist church',
 '2231 Royal',
 'Greek',
 'BEIRUT',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Syrian',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Turkish',
 'Britain',
 'Syrian',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Washington',
 'Washington',
 'African',
 'African',
 'African',
 'African',
 'Russian',
 'Russian',
 'Russian Higher School of Economics',
 'COLUMBUS',
 'Mississippi',
 "St. Peter's Basilica",
 'Nigerian',
 'Nigerian',
 'Muscat',
 'Omani',
 'Oman',
 'New York',
 'Mediterranean',
 'France',
 'EU',
 'EU',
 'EU',
 'European',
 'European',
 'EU',
 'European',
 'EU',
 'European',
 'France',
 'August',
 'Korean',
 'Broadway',
 'SAN ANTONIO',
 'Louisville',
 'Iran',
 'Dubai World Cup',
 'Dubai World Cup',
 'Emirati',
 'American',
 'Dubai World Cup',
 'Dubai Turf',
 'Belgian',
 'Dubai World Cup',
 'California',
 'Syrian',
 'Syrian',
 'Syrian',
 'Iraqi',
 'Syrian',
 'US',
 'Turkish',
 'American',
 'Sheikh Akil graveyard',
 'Winnipe