In [35]:
%%html
<style>
table {float:left}
</style>

In [2]:
import os
import spacy
from tqdm import tqdm
import xml.etree.ElementTree as et

In [3]:
import copy

def calc_precision(tp, fp):
    return tp/(tp + fp)

def calc_recall(tp, fn):
    return tp/(tp + fn)

def calc_fscore(precision, recall):
    return 2 * (precision * recall) / (precision + recall)

def evaluate(gold_truth_labels, predictions):
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    for gold, pred in zip(gold_truth_labels, predictions):
        
        tp_tmp, fp_tmp, fn_tmp, fns_temp, fps_temp  = evaluate_one_article(gold, pred)
        
        tp += tp_tmp
        fp += fp_tmp
        fn += fn_tmp
        
        fns.extend(fns_temp)
        fps.extend(fps_temp) 
        
    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    f_score = calc_fscore(precision, recall)    
    
    print(f'fp: {fp} | tp: {tp} | fn: {fn}')
    print(f'precision: {precision:.3f} | recall: {recall:.3f} | f-score: {f_score:.3f}')
    
    return fps, fns  
    

def evaluate_one_article(gold_truth, prediction):
    
    gold = gold_truth['entities'].copy()
    pred = prediction['entities'].copy()
    
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    
    i = 0
    
    while len(gold) > 0 and len(pred) > 0:
        i += 1

        # Check if the first two elements are the same
        if gold[0] == pred[0]:
            tp += 1
            gold.pop(0)
            pred.pop(0)
        
        else:
            # Grab the first appearing element
            element, source = (gold[0], 'gold') if gold[0]['start_pos'] < pred[0]['start_pos'] else (pred[0], 'pred')
            
            # Remove the element first appearing element
            if source == 'gold':
                fn += 1
                fns.append(element['text'])
                gold.remove(element)
            elif source == 'pred':
                fp += 1
                fps.append(element['text'])
                pred.remove(element)
    
    if len(gold) > 0:
        fn += 1
    elif len(pred) > 0:
        fp += 1
        
    return tp, fp, fn, fns, fps       

def run_flair(text):

    # make a sentence
    sentence = Sentence(text)

    # run NER over sentence
    tagger.predict(sentence)
    
    for entity in sentence.to_dict(tag_type='ner')['entities']:
        print(entity)

In [4]:
def load_file(file_path):
    """
    Loads file and returns all the articles
    """
    # Load the data
    tree = et.parse(file_path)
    root = tree.getroot()

    return root

def process_article(article, filtered, file_path):
    """
    Takes article and process into desired structure
    """
    if 'GeoWebNews' in file_path:
        if filtered:
            return {'text': article.find('text').text, 
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym') 
                                                     if top.find('latitude') != None and top.find('longitude') != None], key=lambda k: k['start_pos'])}
        
        else:
            return {'text': article.find('text').text, 
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')], key=lambda k: k['start_pos'])}
    
    
    elif not filtered:
        return {'text': article.find('text').text,
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                                 ], key=lambda k: k['start_pos'])}
        
    else:
        return {'text': article.find('text').text,
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                             if top.find('gaztag/lat') != None and top.find('gaztag/lon') != None
                                 ], key=lambda k: k['start_pos'])}

def process_articles(root, filtered, file_path):
    """
    Takes articles and processes them into desired structure
    """
    data = []
    
    for article in root:
        
        data.append(process_article(article, filtered, file_path))
    
    return data

def prepare_data(file_path, filtered):
    
    root = load_file(file_path)
    
    data = process_articles(root, filtered, file_path)
    
    return data

In [5]:
def make_predictions_spacy(data):
    
    predictions = []
    
    texts = [article['text'] for article in data]
    
    for doc in tqdm(nlp.pipe(texts)):
        # Do something with the doc here
        
        pred = {'entities': [{'text': ent.text, 
                              'start_pos': len(doc[0:ent.end].text) - len(doc[ent.start]), 
                              'end_pos': len(doc[0:ent.end].text)} for ent in doc.ents if 
                                                             ent.label_ == 'LOC' or ent.label_ == 'FAC' or ent.label_ == 'GPE']}
        
        if pred:
            pred['text'] = doc
            
            predictions.append(pred)
        
    return predictions


### Load the default transfomer spacy model

In [6]:
# Only enable the ner tagger
nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

## TR-News

In [7]:
# Get file path LGL dataset
file_path = '../../data/TR-News/TR-News.xml'

In [8]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for TR-News

In [9]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms)
predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

118it [02:22,  1.21s/it]
118it [02:24,  1.22s/it]


#### Results TR-News & Comparison

In [10]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 295 | tp: 754 | fn: 498
precision: 0.719 | recall: 0.602 | f-score: 0.655


In [11]:
fps

['U.S.-led',
 'North Carolina',
 'West Virginia',
 'Granville County',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'Interstate 64',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'New York',
 'New York',
 'the United States',
 'the United States',
 'the United States',
 'the Rose Garden',
 'College Station',
 'Ronald Reagan Building',
 'West Chester',
 'the United States',
 'Southern District of Ohio',
 'South Carolina',
 'South Carolina',
 'Wooster St.',
 'New Milford',
 'Downtown Eastside',
 "San Francisco's",
 'Rhineland-Palatinate',
 'the South Pacific',
 'Singapore, Singapore',
 'the Central Coast',
 'West Coast',
 'Pearson International Airport',
 'Macdonald-Cartier International Airport',
 'Montreal-Pierre Elliott Trudeau International Airport',
 'Mont-Royal Avenue',
 'Etobicoke',
 'Man',
 'Churchill',
 'Churchill',
 'Churchill',
 'Winnipeg',
 'Little Portugal',
 'College Street',
 'Dufferin Street',
 'Phoenix',
 'Phoenix',
 'Gatineau',
 'Que',
 'Ottawa',
 'K

In [12]:
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Turkish',
 'Russian',
 'Syrian',
 'Russian',
 'Turkey',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Russian',
 'Turkish',
 'Russian',
 'Russian',
 'Pyongyang',
 'Russian',
 'Syrian',
 'North Carolina',
 'West Virginia',
 'Granville County',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'Granville County',
 'New York',
 'New York',
 'New York City',
 'United States',
 'United States',
 'United States',
 'White House',
 'Texas',
 'College Station',
 'Texas',
 'Texas',
 'U.S.',
 'West Chester',
 'Xavier University',
 'United States',
 'British',
 'European',
 'Ohio',
 'New York',
 'U.S.',
 'United States',
 'Wisconsin',
 'New York',
 'Michigan',
 'Pennsylvania',
 'South Carolina',
 'South Carolina',
 'Cuban',
 'Bantam',
 'New Milford',
 'New Milford',
 'Red Deer',
 'Russian',
 'Russian',

In [13]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 296 | tp: 754 | fn: 542
precision: 0.718 | recall: 0.582 | f-score: 0.643


In [14]:
fps

['U.S.-led',
 'North Carolina',
 'West Virginia',
 'Granville County',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'Interstate 64',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'New York',
 'New York',
 'the United States',
 'the United States',
 'the United States',
 'the Rose Garden',
 'College Station',
 'Ronald Reagan Building',
 'West Chester',
 'the United States',
 'Southern District of Ohio',
 'South Carolina',
 'South Carolina',
 'Wooster St.',
 'New Milford',
 'Downtown Eastside',
 "San Francisco's",
 'Rhineland-Palatinate',
 'the South Pacific',
 'Singapore, Singapore',
 'the Central Coast',
 'West Coast',
 'Pearson International Airport',
 'Macdonald-Cartier International Airport',
 'Montreal-Pierre Elliott Trudeau International Airport',
 'Mont-Royal Avenue',
 'Etobicoke',
 'Man',
 'Churchill',
 'Churchill',
 'Churchill',
 'Winnipeg',
 'Little Portugal',
 'College Street',
 'Dufferin Street',
 'Phoenix',
 'Phoenix',
 'Gatineau',
 'Que',
 'Ottawa',
 'K

In [15]:
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Kurdish',
 'Turkish',
 'Russian',
 'Syrian',
 'Russian',
 'Turkey',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Russian',
 'Turkish',
 'Russian',
 'Russian',
 'Pyongyang',
 'Russian',
 'Syrian',
 'North Carolina',
 'West Virginia',
 'Granville County',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'Granville County',
 'New York',
 'New York',
 'New York City',
 'United States',
 'United States',
 'United States',
 'White House',
 'Texas',
 'College Station',
 'Texas',
 'Texas',
 'U.S.',
 'West Chester',
 'Xavier University',
 'United States',
 'British',
 'European',
 'Ohio',
 'New York',
 'U.S.',
 'United States',
 'Wisconsin',
 'New York',
 'Michigan',
 'Pennsylvania',
 'South Carolina',
 'South Carolina',
 'Cuban',
 'Bantam',
 'New Milford',
 'New Milford',
 'Red Deer',
 'Russian',

##### scoring overview TR-News

|   Geoparser Name                            | Precision | Recall  | F-Score |
|:-----------------                           |:---------:|:------: |:-------:|
| StanfordNERparser                           |   0.890   |  0.731  |  0.803  |
| TopoClusterparser                           |   0.883   |  0.714  |  0.790  |
|__Flair (all toponyms)__                     |__0.803__  |__0.699__|__0.748__|
|   CamCoderparser                            |   0.897   |  0.638  |  0.746  |
|__Flair MULTI ner (all toponyms)__           |__0.779__  |__0.694__|__0.739__|
|__Flair MULTI ner FAST (all toponyms)__      |__0.802__  |__0.683__|__0.738__|
|__Flair (filtered)__                         |__0.773__  |__0.695__|__0.732__|
|   DBpediaparser                             |   0.861   |  0.631  |  0.728  |
|__Flair MULTI ner (filtered)__               |__0.761__  |__0.691__|__0.724__|
|__Flair MULTI ner FAST (filtered)__          |__0.772__  |__0.679__|__0.722__|
|__SpaCy default transformer (filtered)__     |__0.719__  |__0.602__|__0.655__|
|    CLAVINparser                             |   0.908   |  0.505  |  0.649  |
|__SpaCy default transformer (all)__          |__0.718__  |__0.582__|__0.643__|
|__SpaCy default (filtered)__                 |__0.712__  |__0.590__|__0.645__|
|__SpaCy default (all)__                      |__0.711__  |__0.570__|__0.633__|
|  Edinburghparser                            |   0.709   |  0.538  |  0.612  |
|__SpaCy MULTI ner (filtered)__               |__0.612__  |__0.561__|__0.586__|
|__SpaCy MULTI ner  (all)__                   |__0.611__  |__0.542__|__0.575__|
|   SpaCyNERparser                            |   0.659   |  0.402  |  0.500  |      

## LGL

In [16]:
# Get file path LGL dataset
file_path = '../../data/LGL/LGL.xml'

In [17]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for LGL

In [18]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms)
predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

588it [08:57,  1.09it/s]
588it [08:58,  1.09it/s]


#### Results LGL & Comparison

In [19]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 1769 | tp: 2332 | fn: 2044
precision: 0.569 | recall: 0.533 | f-score: 0.550


In [20]:
fps

['Kelleyland',
 'Orchard St.',
 'the Cottonport Fire Station',
 'Memphis St.',
 'Augusta St.',
 'Pointe Coupee',
 'Mahnomen County Road 3',
 'Mahnomen County Road 122',
 'Twin Lakes -',
 'Otter Tail County Road 80 - Highway',
 'Pelican Rapids',
 'Highway 59',
 'Star Lake - Highway',
 'Otter Tail/Grant',
 'Highway 55 -',
 'Grant',
 'County Road 43 - Highway 114',
 'Douglas County Road 4',
 'Lake Mary - Highway',
 'Highway 12',
 'Benson',
 'Highway 75',
 'Otter Tail',
 'Parkers Prairie',
 'Room 113',
 'Highway 29',
 'Parkers Prairie',
 'the Red River',
 'Fargo-Moorhead',
 'Red River',
 'the Red River Valley',
 '40th Avenue South',
 'Drain 27',
 'North Dakota',
 'Oakport Township',
 'the Red River',
 'Red River',
 'Fargodome',
 'Sandbag Central',
 'County Road 35',
 'Lake Vermont',
 'County Road 56',
 'County Road 15',
 'County Road 96',
 'Nokomis',
 'North Nokomis Street',
 'Darling Avenue',
 'County Road 109',
 'Van Dorn Street',
 'New Haven',
 'Sioux Falls',
 'New Haven',
 'the Durant 

In [21]:
fns

['Rapides Parish',
 'Cottonport',
 'Pointe Coupee',
 'Mansfield',
 'Shreveport',
 'Cook',
 'Minneapolis',
 'Minnesota',
 'Minnesota',
 'Chisholm',
 'Elbow Lake',
 'Twin Lakes',
 'Otter Tail County',
 'Pelican Rapids',
 'Star Lake',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Otter Tail',
 'Parkers Prairie',
 'Parkers Prairie',
 'Parkers Prairie',
 'Alexandria',
 'Red River',
 'Fargo',
 'Moorhead',
 'Red River',
 'North Dakota',
 'Oakport Township',
 'Red River',
 'Minnesota',
 'Fargo',
 'Red River',
 'Fargo',
 'Douglas County',
 'Lake Vermont',
 'Freeborn Lake',
 'Elbow Lake',
 'Alexandria',
 'New Haven',
 'Sioux Falls',
 'New Haven',
 'Sioux Falls',
 'Alexandria',
 'Alexandria',
 'New York City',
 'D.C.',
 'D.C.',
 'Sudanese',
 'Sudan',
 'Chinese',
 'Africa',
 'African',
 'Darfur',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanes

In [22]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 1785 | tp: 2421 | fn: 2575
precision: 0.576 | recall: 0.485 | f-score: 0.526


In [23]:
fps

['Kelleyland',
 'Orchard St.',
 'the Cottonport Fire Station',
 'Memphis St.',
 'Augusta St.',
 'Pointe Coupee',
 'Mahnomen County Road 3',
 'Mahnomen County Road 122',
 'Twin Lakes -',
 'Otter Tail County Road 80 - Highway',
 'Pelican Rapids',
 'Highway 59',
 'Star Lake - Highway',
 'Otter Tail/Grant',
 'Highway 55 -',
 'Grant',
 'County Road 43 - Highway 114',
 'Douglas County Road 4',
 'Lake Mary - Highway',
 'Highway 12',
 'Highway 75',
 'Otter Tail',
 'Parkers Prairie',
 'Room 113',
 'Highway 29',
 'Parkers Prairie',
 'the Red River',
 'Fargo-Moorhead',
 'Red River',
 'the Red River Valley',
 '40th Avenue South',
 'Drain 27',
 'North Dakota',
 'Oakport Township',
 'the Red River',
 'Red River',
 'Fargodome',
 'Sandbag Central',
 'County Road 35',
 'Lake Vermont',
 'County Road 56',
 'County Road 15',
 'County Road 96',
 'Freeborn Lake',
 'North Nokomis Street',
 'Darling Avenue',
 'County Road 109',
 'Van Dorn Street',
 'New Haven',
 'Sioux Falls',
 'New Haven',
 'the Durant Cente

In [24]:
fns

['Rapides Parish',
 'Cottonport',
 'Memphis St.',
 'Augusta St.',
 'Memphis Street',
 'Pointe Coupee',
 'Shamrock Street',
 'Mansfield',
 'Shreveport',
 'DeSoto Parish',
 'Cook',
 'Minneapolis',
 'Minnesota',
 'Minnesota',
 'Chisholm',
 'Elbow Lake',
 'Highway 200',
 'Mahnomen County Road',
 'Mahnomen County Road',
 'Twin Lakes',
 'Highway 10',
 'Otter Tail County',
 'Highway 108',
 'Pelican Rapids',
 'Star Lake',
 'Otter Tail/Grant',
 'Grant/Wilkin',
 'County Road 43',
 'Highway 114',
 'Douglas County',
 'Lake Mary',
 'Minnesota',
 'Minnesota',
 'Otter Tail',
 'Parkers Prairie',
 'Parkers Prairie',
 'Parkers Prairie',
 'Alexandria',
 'Red River',
 'Fargo',
 'Moorhead',
 'Red River',
 'Red River Valley',
 '40th Avenue South',
 'North Dakota',
 'Oakport Township',
 'Red River',
 'Minnesota',
 'Fargo',
 'Red River',
 'Fargo',
 'Douglas County',
 'County Road 35',
 'Lake Vermont',
 'County Road 56',
 'County Road 15',
 'County Road 96',
 'Freeborn Lake',
 'County Road 42',
 'North Nokomis

##### scoring overview LGL

| Geoparser Name                          | Precision | Recall  | F-Score |
|:-------------------                     |-----------|-------- |---------|
|__Flair (all)__                          |__0.763__  |__0.676__|__0.717__|
| DBpediaparser                           | 0.813     | 0.635   |  0.713  |
|__Flair MULTI(all)__                     |__0.761__  |__0.663__|__0.708__|
|__Flair MULTI FAST(all)__                |__0.770__  |__0.644__|__0.701__|
| StanfordNERparser                       | 0.744     | 0.622   |  0.677  |
| TopoClusterparser                       | 0.763     | 0.577   |  0.657  |
|__Flair (filtered)__                     |__0.660__  |__0.653__|__0.657__|
|__Flair MULTI(filtered)__                |__0.666__  |__0.646__|__0.656__|
| CamCoderparser                          | 0.811     | 0.548   |  0.654  |
|__Flair MULTI FAST (filtered)__          |__0.674__  |__0.629__|__0.651__|
| CLAVINparser                            | 0.808     | 0.444   |  0.573  |
|__SpaCy default trans (filtered)__       |__0.569__  |__0.533__|__0.550__|
|__SpaCy default trans (all)__            |__0.576__  |__0.485__|__0.526__|
|__SpaCy default (filtered)__             |__0.553__  |__0.469__|__0.508__|
| Edinburghparser                         | 0.723     | 0.383   |  0.501  |
|__SpaCy default (all)__                  |__0.558__  |__0.424__|__0.482__|
| SpaCyNERparser                          | 0.493     | 0.371   |  0.423  |
|__SpaCy MULTI(filtered)__                |__0.415__  |__0.416__|__0.416__|
|__SpaCy MULTI FAST (all)__               |__0.419__  |__0.376__|__0.397__|





## GeoWebNews

In [25]:
# Get file path LGL dataset
file_path = '../../data/GeoWebNews/GeoWebNews.xml'

In [26]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for GeoWebNews

In [27]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms)
predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

200it [03:57,  1.19s/it]
200it [03:56,  1.18s/it]


#### Results GeoWebNews & Comparison

In [29]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 492 | tp: 1264 | fn: 1258
precision: 0.720 | recall: 0.501 | f-score: 0.591


In [30]:
fps

['French Quarter',
 'the Mississippi River',
 'the Faubourg Marigny',
 'Rue d’Enghein',
 'the Marigny Plantation',
 'Press Street',
 'the Faubourg D’Aunoy',
 'Chartres/Franklin',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 'Elysian Fields',
 'Royal Street',
 'Desire',
 'Desire Street',
 'Elysian Fields',
 'Claiborne Power House',
 'the Marigny Canal',
 'Washington Square',
 'Champs-Élysées',
 'Elysian Fields',
 'Smoky Mary',
 'Royal Street',
 'Washington Square',
 'Frenchmen Street',
 'Granville',
 'Somerville',
 'Lake Manassas',
 'Turtle Point',
 'Old Town Manassas',
 'Vienna',
 'Turtle Creek Cir',
 'North Carolina',
 "St. Peter's Basilica",
 'Bollywood',
 'Fire Island',
 'Fort Salonga',
 'the Manila Cathedral',
 'New Orleans',
 'New York',
 'South Korea',
 'New York City',
 'New Orleans',
 'the Commonwealth of Independent States (',
 "the Islamic Republic's",
 'CIS',
 'Meydan Racecourse',
 'West Coast',
 'Gate 10',
 'North America',
 'Gat

In [31]:
fns

['Louisiana',
 'French',
 'French Quarter',
 'Mississippi River',
 'Faubourg Marigny',
 'German',
 'Irish',
 'Rue d’Enghein',
 'Marigny Plantation',
 'Press Street',
 'Faubourg D’Aunoy',
 'Chartres',
 'Franklin',
 'Methodist church',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 'Greek',
 'Elysian Fields',
 'Royal Street',
 'Desire Street',
 'Elysian Fields',
 'New Orleans Railways and Light Company Claiborne Power House',
 'French',
 'Marigny Canal',
 'Washington Square',
 'Champs-Élysées',
 'Elysian Fields',
 'Pontchartrain Railroad',
 'Royal Street',
 'Washington Square',
 'Holy Redeemer Church',
 'Third Presbyterian Church',
 'Frenchmen Street',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Syrian',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Lake Manassas',
 'Robert Trent Jones Golf Club',
 'Stonewall Golf Club',
 'Virginia Gateway',
 'Haymarket Village Center',
 'Buckland Elementary'

In [32]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 486 | tp: 1271 | fn: 4454
precision: 0.723 | recall: 0.222 | f-score: 0.340


In [33]:
fps

['French Quarter',
 'the Mississippi River',
 'the Faubourg Marigny',
 'Rue d’Enghein',
 'the Marigny Plantation',
 'Press Street',
 'the Faubourg D’Aunoy',
 'Chartres/Franklin',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 'Elysian Fields',
 'Royal Street',
 'Desire',
 'Desire Street',
 'Elysian Fields',
 'Claiborne Power House',
 'the Marigny Canal',
 'Washington Square',
 'Champs-Élysées',
 'Elysian Fields',
 'Smoky Mary',
 'Royal Street',
 'Washington Square',
 'Frenchmen Street',
 'Lake Manassas',
 'Turtle Point',
 'Old Town Manassas',
 'Vienna',
 'Turtle Creek Cir',
 'North Carolina',
 "St. Peter's Basilica",
 'Bollywood',
 'Fire Island',
 'Fort Salonga',
 'the Manila Cathedral',
 'the Manila Cathedral',
 'New Orleans',
 'New York',
 'South Korea',
 'New York City',
 'New Orleans',
 'the Commonwealth of Independent States (',
 "the Islamic Republic's",
 'CIS',
 'Meydan Racecourse',
 'West Coast',
 'Gate 10',
 'North America',
 'Gate 10

In [34]:
fns

['area',
 'plantation',
 'mansion',
 'substation',
 'Louisiana',
 'Louisiana Purchase',
 'parcel',
 'French',
 'plat',
 'French Quarter',
 'Mississippi River',
 'squares',
 'neighborhood',
 'city',
 'Faubourg Marigny',
 'community',
 'African Americans',
 'German',
 'Irish',
 'populations',
 'blocks',
 'residents',
 'blocks',
 'intersection',
 'mills',
 'plant',
 'stables',
 'factory',
 'barn',
 'streets',
 'Rue d’Enghein',
 'street',
 'avenue',
 'Marigny Plantation',
 'faubourg',
 'Press Street',
 'area',
 'Faubourg D’Aunoy',
 'neighbor',
 'building',
 'Chartres',
 'Franklin',
 'Methodist church',
 'restaurant',
 'street',
 'complex',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 '2231 Royal',
 'townhouse',
 'basement',
 'Greek',
 'structures',
 'Elysian Fields',
 'Royal Street',
 'Desire Street',
 'system',
 'neighborhoods',
 'edifice',
 'Elysian Fields',
 'New Orleans Railways and Light Company Claiborne Power House',
 'New Orleans Railway

##### scoring overview GeoWebNews
|   Geoparser Name                 | Precision | Recall  | F-Score |
|:-----------------                |:---------:|:------: |:-------:|
|__Flair(filtered)__               |__0.901__  |__0.662__|__0.763__|
|__Flair MULTI ner(filtered)__     |__0.901__  |__0.662__|__0.763__|
|__Flair MULTI FAST ner(filtered)__|__0.906__  |__0.638__|__0.749__|
| StanfordNERparser                |   0.885   |  0.635  |  0.739  |
|   CamCoderparser                 |   0.895   |  0.562  |  0.691  |
| TopoClusterparser                |   0.838   |  0.559  |  0.670  |
|  Edinburghparser                 |   0.819   |  0.538  |  0.650  |
|   DBpediaparser                  |   0.847   |  0.510  |  0.637  |
|__SpaCy default trans (filtered)__|__0.720__  |__0.501__|__0.591__|
|__SpaCy default (filtered)__      |__0.702__  |__0.472__|__0.564__|
|    CLAVINparser                  |   0.909   |  0.394  |  0.549  |
|__SpaCy MULTI ner(filtered)__     |__0.564__  |__0.454__|__0.503__|
|   SpaCyNERparser                 |   0.561   |  0.389  |  0.460  |
|__Flair(all)__                    |__0.911__  |__0.293__|__0.443__|
|__Flair MULTI ner(all)__          |__0.912__  |__0.289__|__0.439__|
|__Flair MULTI FAST ner(all)__     |__0.912__  |__0.280__|__0.428__|
|__SpaCy default trans (all)__     |__0.723__  |__0.222__|__0.340__|
|__SpaCy default (all)__           |__0.504__  |__0.206__|__0.318__|
|__SpaCy MULTI ner(all)__          |__0.582__  |__0.200__|__0.298__|