In [34]:
%%html
<style>
table {float:left}
</style>

In [2]:
import os
import spacy
from tqdm import tqdm
import xml.etree.ElementTree as et

In [3]:
import copy

def calc_precision(tp, fp):
    return tp/(tp + fp)

def calc_recall(tp, fn):
    return tp/(tp + fn)

def calc_fscore(precision, recall):
    return 2 * (precision * recall) / (precision + recall)

def evaluate(gold_truth_labels, predictions):
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    for gold, pred in zip(gold_truth_labels, predictions):
        
        tp_tmp, fp_tmp, fn_tmp, fns_temp, fps_temp  = evaluate_one_article(gold, pred)
        
        tp += tp_tmp
        fp += fp_tmp
        fn += fn_tmp
        
        fns.extend(fns_temp)
        fps.extend(fps_temp) 
        
    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    f_score = calc_fscore(precision, recall)    
    
    print(f'fp: {fp} | tp: {tp} | fn: {fn}')
    print(f'precision: {precision:.3f} | recall: {recall:.3f} | f-score: {f_score:.3f}')
    
    return fps, fns  
    

def evaluate_one_article(gold_truth, prediction):
    
    gold = gold_truth['entities'].copy()
    pred = prediction['entities'].copy()
    
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    
    i = 0
    
    while len(gold) > 0 and len(pred) > 0:
        i += 1

        # Check if the first two elements are the same
        if gold[0] == pred[0]:
            tp += 1
            gold.pop(0)
            pred.pop(0)
        
        else:
            # Grab the first appearing element
            element, source = (gold[0], 'gold') if gold[0]['start_pos'] < pred[0]['start_pos'] else (pred[0], 'pred')
            
            # Remove the element first appearing element
            if source == 'gold':
                fn += 1
                fns.append(element['text'])
                gold.remove(element)
            elif source == 'pred':
                fp += 1
                fps.append(element['text'])
                pred.remove(element)
    
    if len(gold) > 0:
        fn += 1
    elif len(pred) > 0:
        fp += 1
        
    return tp, fp, fn, fns, fps       

def run_flair(text):

    # make a sentence
    sentence = Sentence(text)

    # run NER over sentence
    tagger.predict(sentence)
    
    for entity in sentence.to_dict(tag_type='ner')['entities']:
        print(entity)

In [4]:
def load_file(file_path):
    """
    Loads file and returns all the articles
    """
    # Load the data
    tree = et.parse(file_path)
    root = tree.getroot()

    return root

def process_article(article, filtered, file_path):
    """
    Takes article and process into desired structure
    """
    if 'GeoWebNews' in file_path:
        if filtered:
            return {'text': article.find('text').text, 
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym') 
                                                     if top.find('latitude') != None and top.find('longitude') != None], key=lambda k: k['start_pos'])}
        
        else:
            return {'text': article.find('text').text, 
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')], key=lambda k: k['start_pos'])}
    
    
    elif not filtered:
        return {'text': article.find('text').text,
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                                 ], key=lambda k: k['start_pos'])}
        
    else:
        return {'text': article.find('text').text,
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                             if top.find('gaztag/lat') != None and top.find('gaztag/lon') != None
                                 ], key=lambda k: k['start_pos'])}

def process_articles(root, filtered, file_path):
    """
    Takes articles and processes them into desired structure
    """
    data = []
    
    for article in root:
        
        data.append(process_article(article, filtered, file_path))
    
    return data

def prepare_data(file_path, filtered):
    
    root = load_file(file_path)
    
    data = process_articles(root, filtered, file_path)
    
    return data

In [5]:
def make_predictions_spacy(data):
    
    predictions = []
    
    texts = [article['text'] for article in data]
    
    for doc in tqdm(nlp.pipe(texts)):
        # Do something with the doc here
        
        pred = {'entities': [{'text': ent.text, 
                              'start_pos': len(doc[0:ent.end].text) - len(doc[ent.start]), 
                              'end_pos': len(doc[0:ent.end].text)} for ent in doc.ents if ent.label_ == 'LOC']}
        
        if pred:
            pred['text'] = doc
            
            predictions.append(pred)
        
    return predictions

### Load the multi lingual spacy model

In [6]:
# Only enable the ner tagger
nlp = spacy.load("xx_ent_wiki_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])

## TR-News

In [7]:
# Get file path LGL dataset
file_path = '../../data/TR-News/TR-News.xml'

In [8]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for TR-News

In [9]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms)
predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

118it [00:01, 68.13it/s]
118it [00:01, 75.69it/s]


#### Results TR-News & Comparison

In [10]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 449 | tp: 709 | fn: 554
precision: 0.612 | recall: 0.561 | f-score: 0.586


In [11]:
fps

['White House',
 'Assad',
 'Turkish Embassy',
 'North Carolina',
 'West Virginia',
 'Granville County Fire',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'Interstate 64',
 'West Virginia State Police Lt',
 'Chevrolets',
 'South Regional Jail',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'Amazon',
 'New York Gov',
 'United States',
 'United States',
 'United States',
 'White House',
 'Rose Garden',
 'College Station',
 'Texas A&M',
 'Islamic State',
 'West Chester',
 'U.S. District',
 'United States',
 'Islamic State',
 'Southern District of Ohio',
 'No',
 'New York',
 '@HillaryClinton\n            Earlier Mr Trump',
 'South Carolina',
 'South Carolina',
 'Cubans',
 'Cubans',
 'Thursday',
 'Bantam Superior Court',
 'Wooster St',
 'Gomez',
 'New Milford',
 'Bingham',
 'Cumberland Farms',
 'Main St',
 'Calgary MLA',
 'North West MLA',
 'MLAs',
 'MLAs',
 'Downtown Eastside',
 'London City',
 'Swissport',
 'San Francisco',
 'Illegal',
 'Central',
 'Rhineland',
 'Palati

In [12]:
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Turkish',
 'Russian',
 'Syrian',
 'Russian',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Russian',
 'Turkish',
 'Russian',
 'Russian',
 'Russian',
 'Syrian',
 'North Carolina',
 'West Virginia',
 'Granville County',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'Granville County',
 'New York',
 'New York',
 'New York City',
 'United States',
 'United States',
 'United States',
 'White House',
 'WASHINGTON',
 'AUSTIN',
 'Texas',
 'Texas',
 'College Station',
 'Texas',
 'Texas',
 'U.S.',
 'West Chester',
 'U.S.',
 'Xavier University',
 'United States',
 'British',
 'European',
 'U.S.',
 'Ohio',
 'New York',
 'DETROIT',
 'U.S.',
 'U.S.',
 'United States',
 'New York',
 'Michigan',
 'South Carolina',
 'South Carolina',
 'Cuban',
 'BANTAM',
 'Danbury',
 'Bantam',
 'New Milford',
 'Danbury'

In [13]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 452 | tp: 709 | fn: 598
precision: 0.611 | recall: 0.542 | f-score: 0.575


In [14]:
fps

['White House',
 'Assad',
 'Turkish Embassy',
 'North Carolina',
 'West Virginia',
 'Granville County Fire',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'Interstate 64',
 'West Virginia State Police Lt',
 'Chevrolets',
 'South Regional Jail',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'Amazon',
 'New York Gov',
 'United States',
 'United States',
 'United States',
 'White House',
 'Rose Garden',
 'College Station',
 'Texas A&M',
 'Islamic State',
 'West Chester',
 'U.S. District',
 'United States',
 'Islamic State',
 'Southern District of Ohio',
 'No',
 'New York',
 '@HillaryClinton\n            Earlier Mr Trump',
 'South Carolina',
 'South Carolina',
 'Cubans',
 'Cubans',
 'Thursday',
 'Bantam Superior Court',
 'Wooster St',
 'Gomez',
 'New Milford',
 'Bingham',
 'Cumberland Farms',
 'Main St',
 'Calgary MLA',
 'North West MLA',
 'MLAs',
 'MLAs',
 'Downtown Eastside',
 'London City',
 'Swissport',
 'San Francisco',
 'Illegal',
 'Central',
 'Rhineland',
 'Palati

In [15]:
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Kurdish',
 'Turkish',
 'Russian',
 'Syrian',
 'Russian',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Turkish',
 'Turkish',
 'Russian',
 'Russian',
 'Turkish',
 'Russian',
 'Russian',
 'Russian',
 'Syrian',
 'North Carolina',
 'West Virginia',
 'Granville County',
 'West Virginia',
 'West Virginia',
 'North Carolina',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'West Virginia',
 'Granville County',
 'New York',
 'New York',
 'New York City',
 'United States',
 'United States',
 'United States',
 'White House',
 'WASHINGTON',
 'AUSTIN',
 'Texas',
 'Texas',
 'College Station',
 'Texas',
 'Texas',
 'U.S.',
 'West Chester',
 'U.S.',
 'Xavier University',
 'United States',
 'British',
 'European',
 'U.S.',
 'Ohio',
 'New York',
 'DETROIT',
 'U.S.',
 'U.S.',
 'United States',
 'New York',
 'Michigan',
 'South Carolina',
 'South Carolina',
 'Cuban',
 'BANTAM',
 'Danbury',
 'Bantam',
 'New Milford'

##### scoring overview

|   Geoparser Name                 | Precision | Recall | F-Score |
|:-----------------               |:---------:|:------:|:-------:|
| StanfordNERparser                     |   0.890   |  0.731 |  0.803  |
| TopoClusterparser                     |   0.883   |  0.714 |  0.790  |
|__Flair (all toponyms)__               |__0.803__  |__0.699__|__0.748__|
|   CamCoderparser                      |   0.897   |  0.638 |  0.746  |
|__Flair MULTI ner (all toponyms)__     |__0.779__  |__0.694__|__0.739__|
|__Flair MULTI ner FAST (all toponyms)__|__0.802__  |__0.683__|__0.738__|
|__Flair (filtered)__                   |__0.773__  |__0.695__|__0.732__|
|   DBpediaparser                       |   0.861   |  0.631 |  0.728  |
|__Flair MULTI ner (filtered)__         |__0.761__  |__0.691__|__0.724__|
|__Flair MULTI ner FAST (filtered)__    |__0.772__  |__0.679__|__0.722__|
|    CLAVINparser                       |   0.908   |  0.505 |  0.649  |
|  Edinburghparser                      |   0.709   |  0.538 |  0.612  |
|__SpaCy MULTI ner (filtered)__         |__0.612__  |__0.561__|__0.586__|
|__SpaCy MULTI ner  (all)__             |__0.611__  |__0.542__|__0.575__|
|   SpaCyNERparser                      |   0.659   |  0.402 |  0.500  |

## LGL

In [16]:
# Get file path LGL dataset
file_path = '../../data/LGL/LGL.xml'

In [17]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for LGL

In [18]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms)
predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

588it [00:07, 77.03it/s]
588it [00:06, 86.92it/s]


#### Results LGL & Comparison

In [19]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 2564 | tp: 1820 | fn: 2555
precision: 0.415 | recall: 0.416 | f-score: 0.416


In [20]:
fps

['Kelleyland',
 'Cottonport Fire Station',
 'Memphis St',
 'Augusta St',
 'Minnesota House',
 'House',
 'House',
 'Highway 200',
 'Mahnomen County Road 3',
 'Mahnomen County Road',
 'Twin Lakes',
 'Highway',
 'Otter Tail County Road 80',
 'Highway 108',
 'Pelican Rapids',
 'Highway 59',
 'Star Lake',
 'Highway 59',
 'Otter Tail',
 'Highway 55',
 'Highway 55',
 'Grant',
 'Wilkin',
 'County Road 43',
 'Highway 114',
 'Douglas County Road 4',
 'Lake Mary',
 'Highway 9',
 'Highway 12',
 'Benson',
 'Highway 75',
 'Parkers Prairie',
 'Highway 29',
 'Parkers Prairie',
 'Red River',
 'Red River',
 'Red River Valley',
 'Avenue South',
 'Hesco',
 'North Dakota',
 'Barb Grothâ',
 'Oakport Township',
 'Red River',
 'Fargo City',
 'Red River',
 'Sandbag Central',
 'U.S. House of Representatives',
 'Legislature',
 'Itâ',
 'Itâ',
 'County Road 35',
 'Lake Vermont',
 'County Road 56',
 'County Road 15',
 'County Road 96',
 'North Nokomis Street',
 'Darling Avenue',
 'Re',
 'Sachs',
 'Douglas County Ho

In [21]:
fns

['Rapides Parish',
 'Avoyelles',
 'Cottonport',
 'Avoyelles Parish',
 'Alexandria',
 'Pointe Coupee',
 'Pointe Coupee',
 'Pineville',
 'Mansfield',
 'MANSFIELD',
 'Mansfield',
 'Mansfield',
 'Shreveport',
 'Mansfield',
 'Mansfield',
 'Mansfield',
 'Mansfield',
 'Cook',
 'Minnesota',
 'Minnesota',
 'Marshall',
 'Twin Lakes',
 'Otter Tail County',
 'Pelican Rapids',
 'Star Lake',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Parkers Prairie',
 'Parkers Prairie',
 'Parkers Prairie',
 'Minnesota',
 'Alexandria',
 'Red River',
 'Fargo',
 'Moorhead',
 'Red River',
 'Fargo',
 'Fargo',
 'North Dakota',
 'Oakport Township',
 'Red River',
 'Minnesota',
 'Fargo',
 'Fargo',
 'Red River',
 'Fargo',
 'Fargo',
 'U.S.',
 'Douglas County',
 'Lake Vermont',
 'Brandon',
 'Freeborn Lake',
 'Nelson',
 'Douglas County',
 'St. Cloud',
 'Brandon',
 'Douglas County',
 'Alexandria',
 'Alexandria',
 'America',
 'Ky.',
 'New Haven',
 'Conn.',
 'Sioux Falls',
 'Ariz.',
 'New Haven',
 'Sioux Falls',
 'Alexandria

In [22]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 2601 | tp: 1877 | fn: 3109
precision: 0.419 | recall: 0.376 | f-score: 0.397


In [23]:
fps

['Kelleyland',
 'Cottonport Fire Station',
 'Memphis St',
 'Augusta St',
 'Minnesota House',
 'House',
 'House',
 'Highway 200',
 'Mahnomen County Road 3',
 'Mahnomen County Road',
 'Twin Lakes',
 'Highway',
 'Otter Tail County Road 80',
 'Highway 108',
 'Pelican Rapids',
 'Highway 59',
 'Star Lake',
 'Highway 59',
 'Otter Tail',
 'Highway 55',
 'Highway 55',
 'Grant',
 'Wilkin',
 'County Road 43',
 'Highway 114',
 'Douglas County Road 4',
 'Lake Mary',
 'Highway 9',
 'Highway 12',
 'Highway 75',
 'Parkers Prairie',
 'Highway 29',
 'Parkers Prairie',
 'Red River',
 'Red River',
 'Red River Valley',
 'Avenue South',
 'Hesco',
 'North Dakota',
 'Barb Grothâ',
 'Oakport Township',
 'Red River',
 'Fargo City',
 'Red River',
 'Sandbag Central',
 'U.S. House of Representatives',
 'Legislature',
 'Itâ',
 'Itâ',
 'County Road 35',
 'Lake Vermont',
 'County Road 56',
 'County Road 15',
 'County Road 96',
 'Freeborn Lake',
 'Right',
 'Robley',
 'North Nokomis Street',
 'Darling Avenue',
 'Re',
 

In [24]:
fns

['Rapides Parish',
 'Avoyelles',
 'Cottonport',
 'Avoyelles Parish',
 'Memphis St.',
 'Augusta St.',
 'Alexandria',
 'Memphis Street',
 'Pointe Coupee',
 'Pointe Coupee',
 'Pineville',
 'Lakeview',
 'Harrison',
 'Shamrock Street',
 'Mansfield',
 'MANSFIELD',
 'Mansfield',
 'Mansfield',
 'Shreveport',
 'Mansfield',
 'Mansfield',
 'Mansfield',
 'DeSoto Parish',
 'Mansfield',
 'Cook',
 'Minnesota',
 'Minnesota',
 'Marshall',
 'Highway 200',
 'Mahnomen County Road',
 'Mahnomen County Road',
 'Twin Lakes',
 'Highway 10',
 'Otter Tail County',
 'Highway 108',
 'Pelican Rapids',
 'Star Lake',
 'Otter Tail/Grant',
 'Grant/Wilkin',
 'County Road 43',
 'Highway 114',
 'Douglas County',
 'Lake Mary',
 'Minnesota',
 'Minnesota',
 'Parkers Prairie',
 'Parkers Prairie',
 'Parkers Prairie',
 'Minnesota',
 'Alexandria',
 'Red River',
 'Fargo',
 'Moorhead',
 'Red River',
 'Red River Valley',
 'Fargo',
 'Fargo',
 '40th Avenue South',
 'North Dakota',
 'Oakport Township',
 'Red River',
 'Minnesota',
 'Fa

##### scoring overview
 
| Geoparser Name                          | Precision | Recall  | F-Score |
|:-------------------                     |-----------|-------- |---------|
|__Flair (all)__                          |__0.763__  |__0.676__|__0.717__|
| DBpediaparser                           | 0.813     | 0.635   | 0.713   |
|__Flair MULTI(all)__                     |__0.761__  |__0.663__|__0.708__|
|__Flair MULTI FAST(all)__                |__0.770__  |__0.644__|__0.701__|
| StanfordNERparser                       | 0.744     | 0.622   | 0.677   |
| TopoClusterparser                       | 0.763     | 0.577   | 0.657   |
|__Flair (filtered)__                     |__0.660__  |__0.653__|__0.657__|
|__Flair MULTI(filtered)__                |__0.666__  |__0.646__|__0.656__|
| CamCoderparser                          | 0.811     | 0.548   | 0.654   |
|__Flair MULTI FAST (filtered)__          |__0.674__  |__0.629__|__0.651__|
| CLAVINparser                            | 0.808     | 0.444   | 0.573   |
| Edinburghparser                         | 0.723     | 0.383   | 0.501   |
| SpaCyNERparser                          | 0.493     | 0.371   | 0.423   |
|__SpaCy MULTI(filtered)__                |__0.415__  |__0.416__|__0.416__|
|__SpaCy MULTI FAST (all)__               |__0.419__  |__0.376__|__0.397__|

## GeoWebNews

In [25]:
# Get file path LGL dataset
file_path = '../../data/GeoWebNews/GeoWebNews.xml'

In [26]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for GeoWebNews

In [27]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms)
predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

200it [00:03, 61.76it/s]
200it [00:03, 66.19it/s]


#### Results GeoWebNews & Comparison

In [28]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 896 | tp: 1161 | fn: 1396
precision: 0.564 | recall: 0.454 | f-score: 0.503


In [29]:
fps

['Marigny',
 'Mississippi River',
 'Faubourg Marigny',
 'Rue d’Enghein',
 'Press Street',
 'Faubourg D’Aunoy',
 'D’Aunoy',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 'Royal',
 'Royal Street',
 'Desire',
 'Desire Street',
 'Marigny',
 'Dubreuil',
 'Dubreuil',
 'Marigny Canal',
 'Washington Square',
 'Champs',
 'Élysées',
 'Royal Street',
 'Washington Square',
 'Frenchmen Street',
 'Carnegie Library',
 'Norbury',
 'Granville',
 'Somerville',
 'Norbury',
 'Norbury',
 'Lake Manassas',
 'Virginia Gateway',
 'Giant Food',
 'Target',
 'Best Buy',
 'Haymarket Village Center',
 'Walmart',
 'Kohl’s',
 'Buckland',
 'Turtle Point',
 'Old Town Manassas',
 'Vienna',
 'Fairfax',
 'GMU Metro',
 'Orange',
 'Dulles International Airport',
 'Va',
 'Marjory',
 'Hulu',
 'Jos',
 'Trump',
 'Dr',
 'in Africa',
 'Olaopa',
 'in Africa',
 'in Africa',
 'Olaopa',
 'in Africa',
 'Weinstein',
 'Kremlin',
 'Dozhd',
 "Notre Dame's Arike Ogunbowale",
 'AP',
 'Mississippi State',
 'Bulld

In [30]:
fns

['Louisiana',
 'French',
 'French Quarter',
 'Mississippi River',
 'Faubourg Marigny',
 'German',
 'Irish',
 'Rue d’Enghein',
 'Lafayette',
 'Almonaster',
 'Franklin',
 'Marigny Plantation',
 'Press Street',
 'Franklin',
 'Faubourg D’Aunoy',
 'Franklin',
 'Methodist church',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 'Greek',
 'Elysian Fields',
 'Royal Street',
 'Bourbon',
 'Dauphine',
 'Desire Street',
 'Elysian Fields',
 'New Orleans Railways and Light Company Claiborne Power House',
 'French',
 'Marigny Canal',
 'Washington Square',
 'Champs-Élysées',
 'Elysian Fields',
 'Pontchartrain Railroad',
 'Appalachians',
 'Royal Street',
 'Washington Square',
 'Holy Redeemer Church',
 'Third Presbyterian Church',
 'Frenchmen Street',
 'Carnegie Library',
 'Christopher Inn',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Syrian',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Afrin',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Lake Manas

In [31]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 870 | tp: 1213 | fn: 4853
precision: 0.582 | recall: 0.200 | f-score: 0.298


In [32]:
fps

['Marigny',
 'Mississippi River',
 'Faubourg Marigny',
 'Rue d’Enghein',
 'Press Street',
 'Faubourg D’Aunoy',
 'D’Aunoy',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 'Royal',
 'Royal Street',
 'Desire',
 'Desire Street',
 'Marigny',
 'Dubreuil',
 'Dubreuil',
 'Marigny Canal',
 'Washington Square',
 'Champs',
 'Élysées',
 'Royal Street',
 'Washington Square',
 'Frenchmen Street',
 'Carnegie Library',
 'Christopher Inn',
 'Archdiocese’s',
 'Lake Manassas',
 'Virginia Gateway',
 'Giant Food',
 'Target',
 'Best Buy',
 'Haymarket Village Center',
 'Walmart',
 'Kohl’s',
 'Buckland',
 'Turtle Point',
 'Old Town Manassas',
 'Vienna',
 'Fairfax',
 'GMU Metro',
 'Orange',
 'Dulles International Airport',
 'Va',
 'Marjory',
 'Hulu',
 'Jos',
 'Trump',
 'Dr',
 'in Africa',
 'Olaopa',
 'in Africa',
 'in Africa',
 'Olaopa',
 'in Africa',
 'Weinstein',
 'Kremlin',
 'Dozhd',
 "Notre Dame's Arike Ogunbowale",
 'AP',
 'Mississippi State',
 'Bulldogs',
 'Notre Dame’s Arike 

In [33]:
fns

['area',
 'plantation',
 'mansion',
 'substation',
 'Louisiana',
 'Louisiana Purchase',
 'parcel',
 'French',
 'plat',
 'French Quarter',
 'Mississippi River',
 'squares',
 'neighborhood',
 'city',
 'Faubourg Marigny',
 'community',
 'African Americans',
 'German',
 'Irish',
 'populations',
 'blocks',
 'residents',
 'blocks',
 'intersection',
 'mills',
 'plant',
 'stables',
 'factory',
 'barn',
 'streets',
 'Rue d’Enghein',
 'street',
 'Lafayette',
 'Almonaster',
 'Franklin',
 'avenue',
 'Marigny Plantation',
 'faubourg',
 'Press Street',
 'area',
 'Franklin',
 'Faubourg D’Aunoy',
 'neighbor',
 'building',
 'Franklin',
 'Methodist church',
 'restaurant',
 'street',
 'complex',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 '2231 Royal',
 'townhouse',
 'basement',
 'Greek',
 'structures',
 'Elysian Fields',
 'Royal Street',
 'Bourbon',
 'Dauphine',
 'Desire Street',
 'system',
 'neighborhoods',
 'edifice',
 'Elysian Fields',
 'New Orleans Railw

##### scoring overview
|   Geoparser Name                 | Precision | Recall  | F-Score |
|:-----------------                |:---------:|:------: |:-------:|
|__Flair(filtered)__               |__0.901__  |__0.662__|__0.763__|
|__Flair MULTI ner(filtered)__     |__0.901__  |__0.662__|__0.763__|
|__Flair MULTI FAST ner(filtered)__|__0.906__  |__0.638__|__0.749__|
| StanfordNERparser                |   0.885   |  0.635  |  0.739  |
|   CamCoderparser                 |   0.895   |  0.562  |  0.691  |
| TopoClusterparser                |   0.838   |  0.559  |  0.670  |
|  Edinburghparser                 |   0.819   |  0.538  |  0.650  |
|   DBpediaparser                  |   0.847   |  0.510  |  0.637  |
|    CLAVINparser                  |   0.909   |  0.394  |  0.549  |
|__SpaCy MULTI ner(filtered)__     |__0.564__  |__0.454__|__0.503__|
|   SpaCyNERparser                 |   0.561   |  0.389  |  0.460  |
|__Flair(all)__                    |__0.911__  |__0.293__|__0.443__|
|__Flair MULTI ner(all)__          |__0.912__  |__0.289__|__0.439__|
|__Flair MULTI FAST ner(all)__     |__0.912__  |__0.280__|__0.428__|
|__SpaCy MULTI ner(all)__          |__0.582__  |__0.200__|__0.298__|