In [1]:
%%html
<style>
table {float:left}
</style>

In [2]:
import os
from tqdm import tqdm
import xml.etree.ElementTree as et

#### Functions

In [3]:
import copy

def calc_precision(tp, fp):
    return tp/(tp + fp)

def calc_recall(tp, fn):
    return tp/(tp + fn)

def calc_fscore(precision, recall):
    return 2 * (precision * recall) / (precision + recall)

def evaluate(gold_truth_labels, predictions):
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    for gold, pred in zip(gold_truth_labels, predictions):
        
        tp_tmp, fp_tmp, fn_tmp, fns_temp, fps_temp  = evaluate_one_article(gold, pred)
        
        tp += tp_tmp
        fp += fp_tmp
        fn += fn_tmp
        
        fns.extend(fns_temp)
        fps.extend(fps_temp) 
        
    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    f_score = calc_fscore(precision, recall)    
    
    print(f'fp: {fp} | tp: {tp} | fn: {fn}')
    print(f'precision: {precision:.3f} | recall: {recall:.3f} | f-score: {f_score:.3f}')
    
    return fps, fns  
    

def evaluate_one_article(gold_truth, prediction):
    
    gold = gold_truth['entities'].copy()
    pred = prediction['entities'].copy()
    
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    
    i = 0
    
    while len(gold) > 0 and len(pred) > 0:
        i += 1

        # Check if the first two elements are the same
        if gold[0] == pred[0]:
            tp += 1
            gold.pop(0)
            pred.pop(0)
        
        else:
            # Grab the first appearing element
            element, source = (gold[0], 'gold') if gold[0]['start_pos'] < pred[0]['start_pos'] else (pred[0], 'pred')
            
            # Remove the element first appearing element
            if source == 'gold':
                fn += 1
                fns.append(element['text'])
                gold.remove(element)
            elif source == 'pred':
                fp += 1
                fps.append(element['text'])
                pred.remove(element)
    
    if len(gold) > 0:
        fn += 1
    elif len(pred) > 0:
        fp += 1
        
    return tp, fp, fn, fns, fps       

def run_flair(text):

    # make a sentence
    sentence = Sentence(text)

    # run NER over sentence
    tagger.predict(sentence)
    
    for entity in sentence.to_dict(tag_type='ner')['entities']:
        print(entity)

In [4]:
def load_file(file_path):
    """
    Loads file and returns all the articles
    """
    # Load the data
    tree = et.parse(file_path)
    root = tree.getroot()

    return root

def process_article(article, filtered, file_path):
    """
    Takes article and process into desired structure
    """
    if 'GeoWebNews' in file_path:
        if filtered:
            return {'text': article.find('text').text, 
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym') 
                                                     if top.find('latitude') != None and top.find('longitude') != None], key=lambda k: k['start_pos'])}
        
        else:
            return {'text': article.find('text').text, 
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')], key=lambda k: k['start_pos'])}
    
    
    elif not filtered:
        return {'text': article.find('text').text,
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                                 ], key=lambda k: k['start_pos'])}
        
    else:
        return {'text': article.find('text').text,
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                             if top.find('gaztag/lat') != None and top.find('gaztag/lon') != None
                                 ], key=lambda k: k['start_pos'])}

def process_articles(root, filtered, file_path):
    """
    Takes articles and processes them into desired structure
    """
    data = []
    
    for article in root:
        
        data.append(process_article(article, filtered, file_path))
    
    return data

def prepare_data(file_path, filtered):
    
    root = load_file(file_path)
    
    data = process_articles(root, filtered, file_path)
    
    return data

In [5]:
def make_predictions(data):
    
    predictions = []

    for article in tqdm(data):

        text = article['text']

        # make a sentence
        sentence = Sentence(text)

        # run NER over sentence
        tagger.predict(sentence)

        pred = sentence.to_dict(tag_type='ner')
        
        pred['entities'] = [entity for entity in pred['entities'] if entity['labels'][0].value == 'LOC']
        [entity.pop('labels') for entity in pred['entities']]
        pred.pop('labels')
        

        predictions.append(pred)
        
    return predictions

### Load the ner-multi model

In [6]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
tagger = SequenceTagger.load("flair/ner-multi-fast")

2021-05-18 22:04:47,596 loading file C:\Users\Bernard\.flair\models\ner-multi-fast\d0ca1daace2b097b04a886b4be80d82634229555eb2da7079b1b102579fd3835.7b305379b36567738bc455e399f4a4b341d8db8edabffc6807a0ff9cc4efb933


## TR-News

In [7]:
# Get file path LGL dataset
file_path = '../../data/TR-News/TR-News.xml'

In [8]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for TR-News

In [9]:
predictions_all_toponyms = make_predictions(data_all_toponyms)
predictions_filtered_toponyms = make_predictions(data_filtered_toponyms)

100%|████████████████████████████████████████████████████████████████████████████████| 118/118 [00:42<00:00,  2.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 118/118 [00:40<00:00,  2.94it/s]


#### Results TR-News & Comparison

In [10]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 251 | tp: 848 | fn: 400
precision: 0.7716105550500455 | recall: 0.6794871794871795 | f-score: 0.7226246271836386


In [11]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 218 | tp: 883 | fn: 409
precision: 0.8019981834695731 | recall: 0.68343653250774 | f-score: 0.7379857918930213


##### scoring overview

|   Geoparser Name                 | Precision | Recall | F-Score |
|:-----------------               |:---------:|:------:|:-------:|
| StanfordNERparser                     |   0.890   |  0.731 |  0.803  |
| TopoClusterparser                     |   0.883   |  0.714 |  0.790  |
|__Flair (all toponyms)__               |__0.803__  |__0.699__|__0.748__|
|   CamCoderparser                      |   0.897   |  0.638 |  0.746  |
|__Flair MULTI ner (all toponyms)__     |__0.779__  |__0.694__|__0.739__|
|__Flair MULTI ner FAST (all toponyms)__|__0.802__  |__0.683__|__0.738__|
|__Flair (filtered)__                   |__0.773__  |__0.695__|__0.732__|
|   DBpediaparser                       |   0.861   |  0.631 |  0.728  |
|__Flair MULTI ner (filtered)__         |__0.761__  |__0.691__|__0.724__|
|__Flair MULTI ner FAST (filtered)__    |__0.772__  |__0.679__|__0.722__|
|    CLAVINparser                       |   0.908   |  0.505 |  0.649  |
|  Edinburghparser                      |   0.709   |  0.538 |  0.612  |
|   SpaCyNERparser                      |   0.659   |  0.402 |  0.500  |

## LGL

In [12]:
# Get file path LGL dataset
file_path = '../../data/LGL/LGL.xml'

In [13]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for LGL

In [14]:
predictions_all_toponyms = make_predictions(data_all_toponyms)
predictions_filtered_toponyms = make_predictions(data_filtered_toponyms)

100%|████████████████████████████████████████████████████████████████████████████████| 588/588 [03:18<00:00,  2.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 588/588 [03:20<00:00,  2.94it/s]


#### Results LGL & comparison

In [15]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 1321 | tp: 2735 | fn: 1613
precision: 0.6743096646942801 | recall: 0.6290248390064398 | f-score: 0.6508805330794861


In [16]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 956 | tp: 3194 | fn: 1768
precision: 0.7696385542168674 | recall: 0.6436920596533656 | f-score: 0.7010535557506585


##### scoring overview
 
| Geoparser Name                          | Precision | Recall  | F-Score |
|:-------------------                     |-----------|-------- |---------|
|__Flair (all)__                          |__0.763__  |__0.676__|__0.717__|
| DBpediaparser                           | 0.813     | 0.635   | 0.713   |
|__Flair MULTI(all)__                     |__0.761__  |__0.663__|__0.708__|
|__Flair MULTI FAST(all)__                |__0.770__  |__0.644__|__0.701__|
| StanfordNERparser                       | 0.744     | 0.622   | 0.677   |
| TopoClusterparser                       | 0.763     | 0.577   | 0.657   |
|__Flair (filtered)__                     |__0.660__  |__0.653__|__0.657__|
|__Flair MULTI(filtered)__                |__0.666__  |__0.646__|__0.656__|
| CamCoderparser                          | 0.811     | 0.548   | 0.654   |
|__Flair MULTI FAST (filtered)__          |__0.674__  |__0.629__|__0.651__|
| CLAVINparser                            | 0.808     | 0.444   | 0.573   |
| Edinburghparser                         | 0.723     | 0.383   | 0.501   |
| SpaCyNERparser                          | 0.493     | 0.371   | 0.423   |

In [17]:
fns

['Rapides Parish',
 'Cottonport',
 'Alexandria',
 'Pointe Coupee',
 'MANSFIELD',
 'Mansfield',
 'Mansfield',
 'Shreveport',
 'Mansfield',
 'Mansfield',
 'DeSoto Parish',
 'Cook',
 'Minneapolis',
 'Minnesota',
 'Marshall',
 'Chisholm',
 'Highway 200',
 'Otter Tail County',
 'Otter Tail/Grant',
 'County Road 43',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Clay',
 'Douglas',
 'Grant',
 'Swift',
 'Parkers Prairie',
 'Alexandria',
 'Red River',
 'Fargo',
 'Moorhead',
 'Fargo',
 'Fargo',
 '40th Avenue South',
 'North Dakota',
 'Minnesota',
 'Fargo',
 'Fargo',
 'Fargo',
 'Douglas County',
 'County Road 56',
 'County Road 15',
 'Nelson',
 'Douglas County',
 'County Road 109',
 'America',
 'S.D.',
 'Surprise',
 'Alexandria',
 'Alexandria',
 'Washington',
 'D.C.',
 'D.C.',
 'Virginia',
 'Sudanese',
 'Sudan',
 'Chinese',
 'Africa',
 'African',
 'Darfur',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Egyptian',
 'Sudanese',
 'Sudanese',
 'Sudanese'

In [18]:
fps

['Kelleyland',
 'Orchard St.',
 'Cottonport Fire Station',
 'St. James Youth Detention Center',
 'Minnesota House',
 'Capitol',
 'Otter Tail County Road',
 'Highway 59',
 'Highway 59',
 'Otter Tail',
 'Grant county',
 'Highway 55',
 'Highway 55',
 'County Road',
 'Douglas County Road',
 'Highway 9',
 'Highway 12',
 'Parkers Prairie High School',
 'Room',
 'Highway 29',
 'Fargo-Moorhead',
 'Avenue South',
 'Hesco',
 'Drain',
 'Dakota',
 'Barb Grothâ',
 'Fargo City',
 'Fargodome',
 'Sandbag Central',
 'County Road',
 'County Road',
 'County Road',
 'Douglas County Hospital',
 'St. Cloud Hospital',
 'County Road',
 'R-Elbow Lake',
 'Juantissa Hill',
 'City Named One of America',
 'District A',
 'Durant Center',
 'Santa Claus',
 'City Hall',
 'Hill',
 'Northwest D.C.',
 'District CRonnie',
 'District A',
 'East End',
 'Southern Sudan',
 'Sub-Saharan Africa',
 'Al-Hedoud',
 'Gulf',
 'Gulf',
 'US',
 'US',
 'US',
 'Gulf',
 'US',
 'Gulf',
 'US',
 'St. John',
 'Sheldon Peck Homestead',
 'Woodfi

## GeoWebNews

In [19]:
# Get file path LGL dataset
file_path = '../../data/GeoWebNews/GeoWebNews.xml'

In [20]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions GeoWebNews

In [21]:
predictions_all_toponyms = make_predictions(data_all_toponyms)
predictions_filtered_toponyms = make_predictions(data_filtered_toponyms)

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:26<00:00,  2.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:28<00:00,  2.26it/s]


#### Results GeoWebNews

In [26]:
# only toponyms with long / lat info
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

fp: 168 | tp: 1623 | fn: 921
precision: 0.906 | recall: 0.638 | f-score: 0.749


In [27]:
# all toponyms --> fn much higher because many annotated toponyms aren't locations (not sure why this is)
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

fp: 157 | tp: 1636 | fn: 4213
precision: 0.912 | recall: 0.280 | f-score: 0.428


##### scoring overview
|   Geoparser Name                 | Precision | Recall  | F-Score |
|:-----------------                |:---------:|:------: |:-------:|
|__Flair(filtered)__               |__0.901__  |__0.662__|__0.763__|
|__Flair MULTI ner(filtered)__     |__0.901__  |__0.662__|__0.763__|
|__Flair MULTI FAST ner(filtered)__|__0.906__  |__0.638__|__0.749__|
| StanfordNERparser                |   0.885   |  0.635  |  0.739  |
|   CamCoderparser                 |   0.895   |  0.562  |  0.691  |
| TopoClusterparser                |   0.838   |  0.559  |  0.670  |
|  Edinburghparser                 |   0.819   |  0.538  |  0.650  |
|   DBpediaparser                  |   0.847   |  0.510  |  0.637  |
|    CLAVINparser                  |   0.909   |  0.394  |  0.549  |
|   SpaCyNERparser                 |   0.561   |  0.389  |  0.460  |
|__Flair(all)__                    |__0.911__  |__0.293__|__0.443__|
|__Flair MULTI ner(filtered)__     |__0.912__  |__0.289__|__0.439__|
|__Flair MULTI FAST ner(filtered)__|__0.912__  |__0.280__|__0.428__|