In [1]:
%%html
<style>
table {float:left}
</style>

In [2]:
import os
import spacy
from tqdm import tqdm
import xml.etree.ElementTree as et

In [3]:
import copy

def calc_precision(tp, fp):
    return tp/(tp + fp)

def calc_recall(tp, fn):
    return tp/(tp + fn)

def calc_fscore(precision, recall):
    return 2 * (precision * recall) / (precision + recall)

def evaluate(gold_truth_labels, predictions):
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    for gold, pred in zip(gold_truth_labels, predictions):
        
        tp_tmp, fp_tmp, fn_tmp, fns_temp, fps_temp  = evaluate_one_article(gold, pred)
        
        tp += tp_tmp
        fp += fp_tmp
        fn += fn_tmp
        
        fns.extend(fns_temp)
        fps.extend(fps_temp) 
        
    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    f_score = calc_fscore(precision, recall)    
    
    print(f'fp: {fp} | tp: {tp} | fn: {fn}')
    print(f'precision: {precision:.3f} | recall: {recall:.3f} | f-score: {f_score:.3f}')
    
    return fps, fns  
    

def evaluate_one_article(gold_truth, prediction):
    
    gold = gold_truth['entities'].copy()
    pred = prediction['entities'].copy()
    
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    
    i = 0
    
    while len(gold) > 0 and len(pred) > 0:
        i += 1

        # Check if the first two elements are the same
        if gold[0] == pred[0]:
            tp += 1
            gold.pop(0)
            pred.pop(0)
        
        else:
            # Grab the first appearing element
            element, source = (gold[0], 'gold') if gold[0]['start_pos'] < pred[0]['start_pos'] else (pred[0], 'pred')
            
            # Remove the element first appearing element
            if source == 'gold':
                fn += 1
                fns.append(element['text'])
                gold.remove(element)
            elif source == 'pred':
                fp += 1
                fps.append(element['text'])
                pred.remove(element)
    
    if len(gold) > 0:
        fn += 1
    elif len(pred) > 0:
        fp += 1
        
    return tp, fp, fn, fns, fps       

def run_flair(text):

    # make a sentence
    sentence = Sentence(text)

    # run NER over sentence
    tagger.predict(sentence)
    
    for entity in sentence.to_dict(tag_type='ner')['entities']:
        print(entity)

In [4]:
def load_file(file_path):
    """
    Loads file and returns all the articles
    """
    # Load the data
    tree = et.parse(file_path)
    root = tree.getroot()

    return root

def process_article(article, filtered, file_path):
    """
    Takes article and process into desired structure
    """
    if 'GeoWebNews' in file_path:
        if filtered:
            return {'text': article.find('text').text, 
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym') 
                                                     if top.find('latitude') != None and top.find('longitude') != None], key=lambda k: k['start_pos'])}
        
        else:
            return {'text': article.find('text').text, 
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')], key=lambda k: k['start_pos'])}
    
    
    elif not filtered:
        return {'text': article.find('text').text,
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                                 ], key=lambda k: k['start_pos'])}
        
    else:
        return {'text': article.find('text').text,
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                             if top.find('gaztag/lat') != None and top.find('gaztag/lon') != None
                                 ], key=lambda k: k['start_pos'])}

def process_articles(root, filtered, file_path):
    """
    Takes articles and processes them into desired structure
    """
    data = []
    
    for article in root:
        
        data.append(process_article(article, filtered, file_path))
    
    return data

def prepare_data(file_path, filtered):
    
    root = load_file(file_path)
    
    data = process_articles(root, filtered, file_path)
    
    return data

In [6]:
from nltk.tokenize import sent_tokenize

In [51]:
def make_predictions_spacy(data):
    
    predictions = []
    
    texts = [article['text'] for article in data]
    
    tmp = [sent_tokenize(article) for article in texts]
    
    for article in tmp:
        
        pos = 0
        
        predicted_entities = []
        
        for sentence in nlp.pipe(article):
            
            pred = [{'text': ent.text, 
                     'start_pos': pos + len(sentence[0:ent.end].text) - len(sentence[ent.start]), 
                     'end_pos': pos + len(sentence[0:ent.end].text)} for ent in sentence.ents if 
                                                             ent.label_ == 'LOC' or ent.label_ == 'FAC' or ent.label_ == 'GPE']
            
            predicted_entities.extend(pred)
            
            pos += len(sentence.text)
            
            
        article_tags = {'text': sentence, 'entities': predicted_entities}

        predictions.append(article_tags)
    
    return predictions

### Load the default transfomer spacy model

In [8]:
# Only enable the ner tagger
nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

## TR-News

In [9]:
# Get file path LGL dataset
file_path = '../../data/TR-News/TR-News.xml'

In [13]:
data_all_toponyms = prepare_data(file_path, filtered=False)
# data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for TR-News

In [52]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms[:2])
# predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

In [53]:
predictions_all_toponyms[0]['entities']

[{'text': 'Russia', 'start_pos': 33, 'end_pos': 39},
 {'text': 'Turkey', 'start_pos': 56, 'end_pos': 62},
 {'text': 'Russia', 'start_pos': 190, 'end_pos': 196},
 {'text': 'Syria', 'start_pos': 216, 'end_pos': 221},
 {'text': 'Aleppo', 'start_pos': 247, 'end_pos': 253},
 {'text': 'Syria', 'start_pos': 267, 'end_pos': 272},
 {'text': 'Turkey', 'start_pos': 290, 'end_pos': 296},
 {'text': 'Russia', 'start_pos': 301, 'end_pos': 307},
 {'text': 'Ankara', 'start_pos': 327, 'end_pos': 333},
 {'text': 'Ankara', 'start_pos': 843, 'end_pos': 849},
 {'text': 'Russia', 'start_pos': 1044, 'end_pos': 1050},
 {'text': 'Russia', 'start_pos': 1115, 'end_pos': 1121},
 {'text': 'Aleppo', 'start_pos': 1167, 'end_pos': 1173},
 {'text': 'Syria', 'start_pos': 1175, 'end_pos': 1180},
 {'text': 'Aleppo', 'start_pos': 1221, 'end_pos': 1227},
 {'text': 'Turkey', 'start_pos': 1502, 'end_pos': 1508},
 {'text': 'U.S.-led', 'start_pos': 1545, 'end_pos': 1553},
 {'text': 'Syria', 'start_pos': 1658, 'end_pos': 1663},


#### Results TR-News & Comparison

In [None]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

In [None]:
fps

In [None]:
fns

In [None]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

In [None]:
fps

In [None]:
fns

##### scoring overview

|   Geoparser Name                 | Precision | Recall | F-Score |
|:-----------------               |:---------:|:------:|:-------:|
| StanfordNERparser                     |   0.890   |  0.731 |  0.803  |
| TopoClusterparser                     |   0.883   |  0.714 |  0.790  |
|__Flair (all toponyms)__               |__0.803__  |__0.699__|__0.748__|
|   CamCoderparser                      |   0.897   |  0.638 |  0.746  |
|__Flair MULTI ner (all toponyms)__     |__0.779__  |__0.694__|__0.739__|
|__Flair MULTI ner FAST (all toponyms)__|__0.802__  |__0.683__|__0.738__|
|__Flair (filtered)__                   |__0.773__  |__0.695__|__0.732__|
|   DBpediaparser                       |   0.861   |  0.631 |  0.728  |
|__Flair MULTI ner (filtered)__         |__0.761__  |__0.691__|__0.724__|
|__Flair MULTI ner FAST (filtered)__    |__0.772__  |__0.679__|__0.722__|
|    CLAVINparser                       |   0.908   |  0.505 |  0.649  |
|  Edinburghparser                      |   0.709   |  0.538 |  0.612  |
|__SpaCy MULTI ner (filtered)__         |__0.612__  |__0.561__|__0.586__|
|__SpaCy MULTI ner  (all)__             |__0.611__  |__0.542__|__0.575__|
|   SpaCyNERparser                      |   0.659   |  0.402 |  0.500  |

## LGL

In [None]:
# Get file path LGL dataset
file_path = '../../data/LGL/LGL.xml'

In [None]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for LGL

In [None]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms)
predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

#### Results LGL & Comparison

In [None]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

In [None]:
fps

In [None]:
fns

In [None]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

In [None]:
fps

In [None]:
fns

##### scoring overview
 
| Geoparser Name                          | Precision | Recall  | F-Score |
|:-------------------                     |-----------|-------- |---------|
|__Flair (all)__                          |__0.763__  |__0.676__|__0.717__|
| DBpediaparser                           | 0.813     | 0.635   | 0.713   |
|__Flair MULTI(all)__                     |__0.761__  |__0.663__|__0.708__|
|__Flair MULTI FAST(all)__                |__0.770__  |__0.644__|__0.701__|
| StanfordNERparser                       | 0.744     | 0.622   | 0.677   |
| TopoClusterparser                       | 0.763     | 0.577   | 0.657   |
|__Flair (filtered)__                     |__0.660__  |__0.653__|__0.657__|
|__Flair MULTI(filtered)__                |__0.666__  |__0.646__|__0.656__|
| CamCoderparser                          | 0.811     | 0.548   | 0.654   |
|__Flair MULTI FAST (filtered)__          |__0.674__  |__0.629__|__0.651__|
| CLAVINparser                            | 0.808     | 0.444   | 0.573   |
| Edinburghparser                         | 0.723     | 0.383   | 0.501   |
| SpaCyNERparser                          | 0.493     | 0.371   | 0.423   |
|__SpaCy MULTI(filtered)__                |__0.415__  |__0.416__|__0.416__|
|__SpaCy MULTI FAST (all)__               |__0.419__  |__0.376__|__0.397__|

## GeoWebNews

In [None]:
# Get file path LGL dataset
file_path = '../../data/GeoWebNews/GeoWebNews.xml'

In [None]:
data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

#### Predictions for GeoWebNews

In [None]:
predictions_all_toponyms = make_predictions_spacy(data_all_toponyms)
predictions_filtered_toponyms = make_predictions_spacy(data_filtered_toponyms)

#### Results GeoWebNews & Comparison

In [None]:
# only toponyms w/ lat/long
fps, fns = evaluate(data_filtered_toponyms, predictions_filtered_toponyms)

In [None]:
fps

In [None]:
fns

In [None]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, predictions_all_toponyms)

In [None]:
fps

In [None]:
fns

##### scoring overview
|   Geoparser Name                 | Precision | Recall  | F-Score |
|:-----------------                |:---------:|:------: |:-------:|
|__Flair(filtered)__               |__0.901__  |__0.662__|__0.763__|
|__Flair MULTI ner(filtered)__     |__0.901__  |__0.662__|__0.763__|
|__Flair MULTI FAST ner(filtered)__|__0.906__  |__0.638__|__0.749__|
| StanfordNERparser                |   0.885   |  0.635  |  0.739  |
|   CamCoderparser                 |   0.895   |  0.562  |  0.691  |
| TopoClusterparser                |   0.838   |  0.559  |  0.670  |
|  Edinburghparser                 |   0.819   |  0.538  |  0.650  |
|   DBpediaparser                  |   0.847   |  0.510  |  0.637  |
|    CLAVINparser                  |   0.909   |  0.394  |  0.549  |
|__SpaCy MULTI ner(filtered)__     |__0.564__  |__0.454__|__0.503__|
|   SpaCyNERparser                 |   0.561   |  0.389  |  0.460  |
|__Flair(all)__                    |__0.911__  |__0.293__|__0.443__|
|__Flair MULTI ner(all)__          |__0.912__  |__0.289__|__0.439__|
|__Flair MULTI FAST ner(all)__     |__0.912__  |__0.280__|__0.428__|
|__SpaCy MULTI ner(all)__          |__0.582__  |__0.200__|__0.298__|