In [1]:
def convert_predictions(prediction_label_ids, tokenized_input_ids):
    
    prediction_labels = [[label_list[p] for t, p in zip(tokens, pred) if t != 101 and t != 102] for tokens, pred in zip(tokenized_input_ids, prediction_label_ids)]
    tokens = [tokenizer.convert_ids_to_tokens(i, skip_special_tokens=True) for i in tokenized_input_ids]

    predictions = []

    for token_set, label_set in zip(tokens, prediction_labels):

        text = tokenizer.convert_tokens_to_string(token_set)

        pred = {'text': text, 'entities': []}

        adjust_start_pos = 0

        for idx in range(len(token_set)):
            if label_set[idx] == 'B-LOC' or label_set[idx] == 'I-LOC':


                if idx == len(label_set)-1:
                    pass

                # Case 1: B-LOC followed by I-LOC --> CONTINUE
                elif label_set[idx+1] == 'I-LOC':
                    adjust_start_pos += 1
                    continue

                # Case 2: B-LOC followed by other B-LOC (together) --> CONTINUE    
                elif label_set[idx+1] == 'B-LOC' and '#' in token_set[idx+1]:
                    adjust_start_pos += 1
                    continue

                current_pos = idx            
                toponym_tokens = tokenizer.convert_tokens_to_string(token_set[current_pos-adjust_start_pos:current_pos+1])
                sub_sentence = tokenizer.convert_tokens_to_string(token_set[:current_pos+1])
                end = len(sub_sentence)
                start = end - len(toponym_tokens)

                pred['entities'].append({'text': toponym_tokens, 'start_pos': start, 'end_pos': end})

                adjust_start_pos = 0


        predictions.append(pred)
        
    return predictions 

def process_pred_results(pred_results, original_text_inputs):
    
    final_results = [align_pred_and_original_text(pred_result, original_text) 
                     for pred_result, original_text in zip(pred_results, original_text_inputs)]
    
    
    return final_results

def align_pred_and_original_text(pred_result, original_text):
    
    pred_text = pred_result['text']
    
    idx = 0
    removed_indices = []
    add_indices = []

    while pred_text != original_text:
        
        char_post, char_original = pred_text[idx], original_text[idx]
        
        if char_post != char_original:
            
            if char_original == ' ':
                pred_text = pred_text[:idx] + ' ' + pred_text[idx:]
                
                add_indices.append(idx)
                
                continue
            
            pred_text = pred_text[:idx] + pred_text[idx+1:]

            removed_indices.append(idx)
            
            if idx > len(pred_text) - 1:
                break
            
            continue

        idx += 1

        if idx > len(pred_text) - 1:
            break
    
    pred_entities = pred_result['entities']
    
    for entity in pred_entities:
        for index in removed_indices:
            if index > entity['start_pos']:
                break
            else:  
                entity['start_pos'] -= 1
                entity['end_pos'] -= 1

    
    for entity in pred_entities:
        for index in add_indices:
            if index > entity['start_pos']:
                break
            else:  
                entity['start_pos'] += 1
                entity['end_pos'] += 1

    return {'text': pred_text, 'entities': pred_entities}
    

In [2]:
import copy

def calc_precision(tp, fp):
    return tp/(tp + fp)

def calc_recall(tp, fn):
    return tp/(tp + fn)

def calc_fscore(precision, recall):
    return 2 * (precision * recall) / (precision + recall)

def evaluate(gold_truth_labels, predictions):
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    for gold, pred in zip(gold_truth_labels, predictions):
        
        tp_tmp, fp_tmp, fn_tmp, fns_temp, fps_temp  = evaluate_one_article(gold, pred)
        
        tp += tp_tmp
        fp += fp_tmp
        fn += fn_tmp
        
        fns.extend(fns_temp)
        fps.extend(fps_temp) 
        
    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    f_score = calc_fscore(precision, recall)    
    
    print(f'fp: {fp} | tp: {tp} | fn: {fn}')
    print(f'precision: {precision:.3f} | recall: {recall:.3f} | f-score: {f_score:.3f}')
    
    return fps, fns  
    

def evaluate_one_article(gold_truth, prediction):
    
    gold = gold_truth['entities'].copy()
    pred = prediction['entities'].copy()
    
    # Counts of true positives, false positives & false negatives
    tp, fp, fn = 0, 0, 0
    
    # List with false positives and false negatives
    fps, fns = [], []
    
    
    i = 0
    
    while len(gold) > 0 and len(pred) > 0:
        i += 1

        # Check if the first two elements are the same
        if gold[0] == pred[0]:
            tp += 1
            gold.pop(0)
            pred.pop(0)
        
        else:
            # Grab the first appearing element
            element, source = (gold[0], 'gold') if gold[0]['start_pos'] < pred[0]['start_pos'] else (pred[0], 'pred')
            
            # Remove the element first appearing element
            if source == 'gold':
                fn += 1
                fns.append(element['text'])
                gold.remove(element)
            elif source == 'pred':
                fp += 1
                fps.append(element['text'])
                pred.remove(element)
    
    if len(gold) > 0:
        fn += 1
    elif len(pred) > 0:
        fp += 1
        
    return tp, fp, fn, fns, fps   

In [3]:
def load_file(file_path):
    """
    Loads file and returns all the articles
    """
    # Load the data
    tree = et.parse(file_path)
    root = tree.getroot()

    return root

def process_article(article, filtered, file_path):
    """
    Takes article and process into desired structure
    """
    if 'GeoWebNews' in file_path:
        if filtered:
            return {'text': re.sub(' +', ' ', article.find('text').text.replace('\n', ' ')),
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym') 
                                                     if top.find('latitude') != None and top.find('longitude') != None], key=lambda k: k['start_pos'])}
        
        else:
            return {'text': re.sub(' +', ' ', article.find('text').text.replace('\n', ' ')),
                    'entities': sorted([{'text': top.find('extractedName').text, 
                                                      'start_pos': int(top.find('start').text), 
                                                      'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')], key=lambda k: k['start_pos'])}
    
    
    elif not filtered:
        return {'text': re.sub(' +', ' ', article.find('text').text.replace('\n', ' ')),
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                                 ], key=lambda k: k['start_pos'])}
        
    else:
        return {'text': re.sub(' +', ' ', article.find('text').text.replace('\n', ' ')),
                'entities': sorted([{'text': top.find('phrase').text,
                            'start_pos': int(top.find('start').text),
                            'end_pos': int(top.find('end').text)} for top in article.findall('toponyms/toponym')
                             if top.find('gaztag/lat') != None and top.find('gaztag/lon') != None
                                 ], key=lambda k: k['start_pos'])}

def process_articles(root, filtered, file_path):
    """
    Takes articles and processes them into desired structure
    """
    data = []
    
    for article in root:
        
        data.append(process_article(article, filtered, file_path))
    
    return data

def prepare_data(file_path, filtered):
    
    root = load_file(file_path)
    
    data = process_articles(root, filtered, file_path)
    
    return data

## Loading Fine-tuned mBERT model and predictions for non-labelled dataset

In [4]:
model_path = '../models/ner-multilingual-bert-fine-tuned-conll-2003'

label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [5]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_path, num_labels=len(label_list))

## TR-News

### Loading the dataset

In [6]:
import os
import re
import xml.etree.ElementTree as et

In [7]:
# Get file path LGL dataset
file_path = '../../../data/TR-News/TR-News.xml'

data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

### Processing the data

In [8]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [9]:
# Format dataset into Hugginface Dataset structure
from datasets import Dataset

list_input_data = [i['text'] for i in data_all_toponyms]

TRN = Dataset.from_dict({'tokens': list_input_data})

In [10]:
TRN = TRN.map(tokenizer, input_columns='tokens', batched=True, fn_kwargs={'truncation': True})

TRN

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'token_type_ids', 'tokens'],
    num_rows: 118
})

### Prepare evaluation trainer for predictions

In [11]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [12]:
from transformers import Trainer

test_trainer = Trainer(model, 
                       data_collator=data_collator,
#                        compute_metrics=compute_metrics
                      )

In [13]:
raw_pred, _, _ = test_trainer.predict(TRN)

In [14]:
import numpy as np

predictions = np.argmax(raw_pred, axis=2)

In [15]:
results = convert_predictions(predictions, TRN['input_ids'])

# Filter out results mistaken erronous results
results = [{'text': result['text'], 'entities': [entity for entity in result['entities'] if '#' not in entity['text']]} for result in results]

processed_results = process_pred_results(results, TRN['tokens'])

In [16]:
# # Check mistakes

# mistakes = []

# for idx, test in enumerate(testing):
    
#     for entity in test['entities']:
        
#         if test['text'][entity['start_pos']:entity['end_pos']] != entity['text']:
#             print(idx)
#             mistakes.append(idx)
            
#             print('position', entity['start_pos'])
#             print('entity:   ', entity['text'])
#             print('location: ',test['text'][entity['start_pos']:entity['end_pos']])
        
        

### Evaluation TR-News

In [17]:
# filtered toponyms
fps, fns = evaluate(data_filtered_toponyms, processed_results)

fp: 537 | tp: 518 | fn: 495
precision: 0.491 | recall: 0.511 | f-score: 0.501


In [18]:
fps

['White House',
 'New York',
 'Queens',
 'New York City',
 'U',
 '.',
 'S',
 '.',
 'Rose Garden',
 'Southern Poverty Law Center',
 'Ronald Reagan Building',
 'U',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Southern',
 'United States',
 'Texas',
 'US',
 'Wisconsin',
 'Wisconsin',
 'Wisconsin',
 'Wisconsin',
 'Pennsylvania',
 'Michigan',
 'Washington',
 'South Carolina',
 'US',
 'Manhattan',
 'South Carolina',
 'Florida',
 'Texas',
 'US',
 'Cuba',
 'Cuba',
 'US',
 'Cuba',
 'America',
 'Michigan',
 'Pennsylvania',
 'Wisconsin',
 'US',
 'US',
 'Russia',
 'US',
 'Wisconsin',
 'Michigan',
 'Pennsylvania',
 'Redding',
 'California',
 'Wooster St.',
 'Cumberland Farms',
 'South Main St',
 'Torrington',
 'Calgary',
 'Calgary',
 '-',
 'North West',
 'Alberta',
 'Kremlin',
 'Kremlin',
 'Canada',
 'Vancouver',
 'Downtown Eastside',
 'US',
 'London City',
 'Heathrow',
 'Gatwick',

In [19]:
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Turkish',
 'Russian',
 'Granville County',
 'New York',
 'Queens',
 'WASHINGTON',
 'U.S.',
 'Texas',
 'Texas',
 'Texas',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'Xavier University',
 'British',
 'U.S.',
 'European',
 'U.S.',
 'U.S.',
 'U.S.',
 'DETROIT',
 'U.S.',
 'U.S.',
 'US',
 'Wisconsin',
 'Wisconsin',
 'Wisconsin',
 'New York',
 'Michigan',
 'Washington',
 'South Carolina',
 'US',
 'Manhattan',
 'US',
 'Cuba',
 'Cuba',
 'Cuban',
 'US',
 'Cuba',
 'Michigan',
 'Pennsylvania',
 'Wisconsin',
 'US',
 'US',
 'Russia',
 'US',
 'BANTAM',
 'Bantam',
 'New Milford',
 'TORRINGTON',
 'Torrington',
 'Calgary',
 'Calgary',
 'Russian',
 'Russian',
 'Russian',
 'Russia',
 'Russian',
 'Canada',
 'Vancouver',
 'London',
 'Heathrow',
 'Gatwick',
 'England',
 'Heathrow',
 'Heathrow',
 'London',
 'London',
 'London',
 'California',
 'California',
 'Kayseri',
 'Istanbul',
 'Kayseri',
 'Anatolia',
 'Istanbul',
 'German',
 'Iraqi'

In [20]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, processed_results)

fp: 512 | tp: 545 | fn: 511
precision: 0.516 | recall: 0.516 | f-score: 0.516


In [21]:
fps

['White House',
 'New York',
 'Queens',
 'New York City',
 'U',
 '.',
 'S',
 '.',
 'Rose Garden',
 'Southern Poverty Law Center',
 'Ronald Reagan Building',
 'U',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Islamic State',
 'U',
 '.',
 'S',
 '.',
 'U',
 '.',
 'S',
 '.',
 'Southern',
 'United States',
 'Texas',
 'US',
 'Wisconsin',
 'Wisconsin',
 'Wisconsin',
 'Wisconsin',
 'Pennsylvania',
 'Michigan',
 'Washington',
 'South Carolina',
 'US',
 'Manhattan',
 'South Carolina',
 'Florida',
 'Texas',
 'US',
 'Cuba',
 'Cuba',
 'US',
 'Cuba',
 'America',
 'Michigan',
 'Pennsylvania',
 'Wisconsin',
 'US',
 'US',
 'Russia',
 'US',
 'Wisconsin',
 'Michigan',
 'Pennsylvania',
 'Redding',
 'California',
 'Wooster St.',
 'Cumberland Farms',
 'South Main St',
 'Torrington',
 'Calgary',
 'Calgary',
 '-',
 'North West',
 'Alberta',
 'Kremlin',
 'Kremlin',
 'Canada',
 'Vancouver',
 'Downtown Eastside',
 'US',
 'London City',
 'Heathrow',
 'Gatwick',

In [22]:
fns

['Turkish',
 'Turkish',
 'Syrian',
 'Syrian',
 'U.S.',
 'Turkish',
 'Kurdish',
 'Turkish',
 'Russian',
 'Granville County',
 'New York',
 'Queens',
 'WASHINGTON',
 'U.S.',
 'Texas',
 'Texas',
 'Texas',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'U.S.',
 'Xavier University',
 'British',
 'U.S.',
 'European',
 'U.S.',
 'U.S.',
 'U.S.',
 'DETROIT',
 'U.S.',
 'U.S.',
 'US',
 'Wisconsin',
 'Wisconsin',
 'Wisconsin',
 'New York',
 'Michigan',
 'Washington',
 'South Carolina',
 'US',
 'Manhattan',
 'US',
 'Cuba',
 'Cuba',
 'Cuban',
 'US',
 'Cuba',
 'Michigan',
 'Pennsylvania',
 'Wisconsin',
 'US',
 'US',
 'Russia',
 'US',
 'BANTAM',
 'Bantam',
 'New Milford',
 'TORRINGTON',
 'Torrington',
 'Calgary',
 'Calgary',
 'Russian',
 'Russian',
 'Russian',
 'Russia',
 'Russian',
 'Canada',
 'Vancouver',
 'London',
 'Heathrow',
 'Gatwick',
 'England',
 'Heathrow',
 'Heathrow',
 'London',
 'London',
 'London',
 'California',
 'California',
 'Kayseri',
 'Istanbul',
 'Kayseri',
 'Anatolia',
 'Istanbul',
 'Germa

## LGL

### Loading the dataset

In [23]:
import os
import re
import xml.etree.ElementTree as et

In [24]:
# Get file path LGL dataset
file_path = '../../../data/LGL/LGL.xml'

data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

### Processing the data

In [25]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [26]:
# Format dataset into Hugginface Dataset structure
from datasets import Dataset

list_input_data = [i['text'] for i in data_all_toponyms]

LGL = Dataset.from_dict({'tokens': list_input_data})

In [27]:
LGL = LGL.map(tokenizer, input_columns='tokens', batched=True, fn_kwargs={'truncation': True})

LGL

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'token_type_ids', 'tokens'],
    num_rows: 588
})

### Prepare evaluation trainer for predictions

In [28]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [29]:
from transformers import Trainer

test_trainer = Trainer(model, 
                       data_collator=data_collator,
#                        compute_metrics=compute_metrics
                      )

In [30]:
raw_pred, _, _ = test_trainer.predict(LGL)

In [31]:
import numpy as np

predictions = np.argmax(raw_pred, axis=2)

In [32]:
results = convert_predictions(predictions, LGL['input_ids'])

# Filter out results mistaken erronous results
results = [{'text': result['text'], 'entities': [entity for entity in result['entities'] if '#' not in entity['text']]} for result in results]

processed_results = process_pred_results(results, LGL['tokens'])

In [33]:
# # Check mistakes

# mistakes = []

# for idx, test in enumerate(processed_results):
    
#     for entity in test['entities']:
        
#         if test['text'][entity['start_pos']:entity['end_pos']] != entity['text']:
#             print(idx)
#             mistakes.append(idx)
            
#             print('position', entity['start_pos'])
#             print('entity:   ', entity['text'])
#             print('location: ',test['text'][entity['start_pos']:entity['end_pos']])
#             print()
        
        

### Evaluation LGL

In [34]:
# filtered toponyms
fps, fns = evaluate(data_filtered_toponyms, processed_results)

fp: 1732 | tp: 2297 | fn: 1431
precision: 0.570 | recall: 0.616 | f-score: 0.592


In [35]:
fps

['Orchard St.',
 'Cottonport Fire Station',
 'Memphis St.',
 'Augusta St.',
 'St',
 '. James Youth Detention Center',
 'Minneapolis',
 'Minnesota',
 'Minnesota',
 'Otter Tail',
 'Highway',
 'Otter Tail / Grant',
 'Grant /',
 'Wilkin',
 'Lake Mary',
 'Benson',
 'Otter',
 'Tail',
 'Highway',
 '-',
 'Red River Valley',
 '40th Avenue South',
 'Hesco',
 'Oakport',
 'North Nokomis Street',
 'Darling Avenue',
 'Douglas County Hospital',
 'St',
 'County Road 109',
 'R',
 '-',
 'Van Dorn Street',
 'Ky',
 '.',
 'Conn',
 '.',
 'S',
 '.',
 'D',
 '.',
 'Ariz',
 '.',
 'Cora Kelly',
 'Old Town',
 'Old Town',
 'Washington, D. C.',
 'Old Town',
 'East End',
 'Southern Sudan',
 'Sub',
 'Sri',
 'Cairo Stadium',
 'Gulf',
 'Madrid',
 'Gulf',
 'US',
 'US',
 'US',
 'US',
 'Sharm El - Sheikh',
 'North Monticello',
 'St',
 ". John's Lutheran School",
 'Babcock Grove',
 'Sheldon Peck Homestead',
 'Woodfield',
 'Martingale',
 'Woodfield Shopping Center',
 'Streets of Woodfield',
 'Hudson',
 'Evergreen Lake',
 'D

In [36]:
fns

['Rapides Parish',
 'Cottonport',
 'Alexandria',
 'MANSFIELD',
 'Mansfield',
 'Shreveport',
 'Cook',
 'Minneapolis',
 'Minnesota',
 'Minnesota',
 'Otter Tail County',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Otter Tail',
 'Alexandria',
 'Oakport Township',
 'Douglas County',
 'Douglas County',
 'St. Cloud',
 'Alexandria',
 'Ky.',
 'Conn.',
 'S.D.',
 'Ariz.',
 'Alexandria',
 'Washington',
 'D.C.',
 'Virginia',
 'Sudanese',
 'Sudan',
 'Chinese',
 'Africa',
 'African',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Sri Lankan',
 'Sri Lankan',
 'Sinhalese',
 'Sinhalese',
 'Sinhalese',
 'Sri Lankan',
 'Egypt',
 'Egyptian',
 'US',
 'Egyptian',
 'US',
 'Egyptian',
 'Israeli',
 'Palestinian',
 'Israeli',
 'Palestinian',
 'Palestinian',
 'Palestinian',
 'Palestinian',
 'Syrians',
 'Syrians',
 'Palestinians',
 'Palestinians',
 'Iranian',
 'Israeli',
 'Sharm El-Sheikh',
 'Americans',
 'Israeli',
 'Lombard',
 'Roselle',
 'Roselle',
 'Illinois',
 'Roselle',
 'N

In [37]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, processed_results)

fp: 1422 | tp: 2704 | fn: 1554
precision: 0.655 | recall: 0.635 | f-score: 0.645


In [38]:
fps

['Orchard St.',
 'Cottonport Fire Station',
 'St',
 '. James Youth Detention Center',
 'Minneapolis',
 'Minnesota',
 'Minnesota',
 'Otter Tail',
 'Highway',
 'Otter Tail / Grant',
 'Grant /',
 'Wilkin',
 'Otter',
 'Tail',
 'Highway',
 '-',
 'Hesco',
 'Oakport',
 'Douglas County Hospital',
 'St',
 'R',
 '-',
 'Ky',
 '.',
 'Conn',
 '.',
 'S',
 '.',
 'D',
 '.',
 'Ariz',
 '.',
 'Cora Kelly',
 'City Hall',
 'Washington, D. C.',
 'East End',
 'Southern Sudan',
 'Sub',
 'Sri',
 'Gulf',
 'Madrid',
 'Gulf',
 'US',
 'US',
 'US',
 'US',
 'Sharm El - Sheikh',
 'St',
 ". John's Lutheran School",
 'Sheldon Peck Homestead',
 'Woodfield Shopping Center',
 'Streets of Woodfield',
 'Hudson',
 'Decatur',
 'U',
 '.',
 'S',
 '.',
 'Ill',
 '.',
 'Pe',
 'Tarrant',
 'Houston',
 'Big',
 'Law Enforcement Center',
 'City Hall',
 'E',
 '. Broad St.',
 'Mansfield Law Enforcement Center',
 'North Texas',
 'City Hall',
 'North Arlington',
 'Cooper Street',
 'Silkwood Trail',
 'South Arlington',
 'Sports Center',
 'E

In [39]:
fns

['Rapides Parish',
 'Cottonport',
 'Alexandria',
 'MANSFIELD',
 'Mansfield',
 'Shreveport',
 'Cook',
 'Minneapolis',
 'Minnesota',
 'Minnesota',
 'Highway 200',
 'Mahnomen County Road',
 'Mahnomen County Road',
 'Highway 10',
 'Otter Tail County',
 'Highway 108',
 'Otter Tail/Grant',
 'Grant/Wilkin',
 'County Road 43',
 'Highway 114',
 'Douglas County',
 'Minnesota',
 'Minnesota',
 'Otter Tail',
 'Alexandria',
 'Oakport Township',
 'Douglas County',
 'County Road 35',
 'County Road 56',
 'County Road 15',
 'County Road 96',
 'Nokomis',
 'Douglas County',
 'St. Cloud',
 'Alexandria',
 'Ky.',
 'Conn.',
 'S.D.',
 'Ariz.',
 'Alexandria',
 'Washington',
 'D.C.',
 'Virginia',
 'Sudanese',
 'Sudan',
 'Chinese',
 'Africa',
 'African',
 'Egyptian',
 'Egyptian',
 'Sudan',
 'Sudanese',
 'Sudanese',
 'Sudanese',
 'Godapitiya',
 'Sri Lankan',
 'Sri Lankan',
 'Sinhalese',
 'Sinhalese',
 'Sinhalese',
 'Sri Lankan',
 'Egypt',
 'Egyptian',
 'US',
 'Egyptian',
 'US',
 'Egyptian',
 'Israeli',
 'Madrid Co

## GeoWebNews

### Loading the dataset

In [40]:
import os
import re
import xml.etree.ElementTree as et

In [41]:
# Get file path LGL dataset
file_path = '../../../data/GeoWebNews/GeoWebNews.xml'

data_all_toponyms = prepare_data(file_path, filtered=False)
data_filtered_toponyms = prepare_data(file_path, filtered=True)

### Processing the data

In [42]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [43]:
# Format dataset into Hugginface Dataset structure
from datasets import Dataset

list_input_data = [i['text'] for i in data_all_toponyms]

GWN = Dataset.from_dict({'tokens': list_input_data})

In [44]:
GWN = GWN.map(tokenizer, input_columns='tokens', batched=True, fn_kwargs={'truncation': True})

GWN

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'token_type_ids', 'tokens'],
    num_rows: 200
})

### Prepare evaluation trainer for predictions

In [45]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [46]:
from transformers import Trainer

test_trainer = Trainer(model, 
                       data_collator=data_collator,
#                        compute_metrics=compute_metrics
                      )

In [47]:
raw_pred, _, _ = test_trainer.predict(GWN)

In [48]:
import numpy as np

predictions = np.argmax(raw_pred, axis=2)

In [49]:
results = convert_predictions(predictions, GWN['input_ids'])

# Filter out results mistaken erronous results
results = [{'text': result['text'], 'entities': [entity for entity in result['entities'] if '#' not in entity['text']]} for result in results]

processed_results = process_pred_results(results, GWN['tokens'])

In [50]:
# # Check mistakes

# mistakes = []

# for idx, test in enumerate(processed_results):
    
#     for entity in test['entities']:
        
#         if test['text'][entity['start_pos']:entity['end_pos']] != entity['text']:
#             print(idx)
#             mistakes.append(idx)
            
#             print('position', entity['start_pos'])
#             print('entity:   ', entity['text'])
#             print('location: ',test['text'][entity['start_pos']:entity['end_pos']])
#             print()
        
        

### Evaluation GeoWebnews

In [51]:
# filtered toponyms
fps, fns = evaluate(data_filtered_toponyms, processed_results)

fp: 806 | tp: 752 | fn: 1163
precision: 0.483 | recall: 0.393 | f-score: 0.433


In [52]:
fps

["Rue d'Enghein",
 'Lafayette',
 'Almonaster',
 'Franklin',
 'Marigny Plantation',
 'Press Street',
 'Franklin',
 "Faubourg D'Aunoy",
 'Chartres',
 'Franklin',
 'D',
 "'",
 'Aunoy',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 'Royal',
 'Elysian Fields',
 'Royal Street',
 'Bourbon',
 'Turkey',
 'Syria',
 'Turkey',
 'Qamishli',
 'Turkey',
 'Afrin',
 'Afrin',
 'Afrin',
 'Jamison',
 'Norbury',
 'Granville',
 'Somerville',
 'Jamison',
 'Norbury',
 'Norbury',
 'Lake Manassas',
 'Africa',
 'Africa',
 'Nigeria',
 'Africa',
 'Africa',
 'Russia',
 'Kremlin',
 'Russia',
 'COLUM',
 'Ohio',
 'Louisville',
 'St',
 ". Peter's Basilica",
 'White House',
 'White House',
 'White House',
 'Illinois',
 'Fort Salonga',
 'Bellmore',
 'Europe',
 'Europe',
 'New York City',
 'New Orleans',
 'Russia',
 'Commonwealth of Independent States',
 'Islamic Republic',
 'Iran',
 'Iran',
 'Iran',
 'CIS',
 'Iran',
 'Iran',
 'Emir',
 'North America',
 'Damascus',
 'Douma',
 'Ghouta',
 'Douma

In [53]:
fns

['Louisiana',
 'French',
 'German',
 'Irish',
 "Rue d'Enghein",
 'Lafayette',
 'Almonaster',
 'Franklin',
 'Marigny Plantation',
 'Press Street',
 'Franklin',
 "Faubourg D'Aunoy",
 'Chartres',
 'Franklin',
 'Methodist church',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 'Greek',
 'Elysian Fields',
 'Royal Street',
 'BEIRUT',
 'Kurdish',
 'Turkey',
 'Syria',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Turkey',
 'Qamishli',
 'Syrian',
 'Turkish',
 'Turkey',
 'Syrian',
 'Kurdish',
 'Afrin',
 'Turkish',
 'Afrin',
 'Britain',
 'Syrian',
 'Kurdish',
 'Turkish',
 'Syrian',
 'Washington',
 'Washington',
 'African',
 'African',
 'Africa',
 'African',
 'Africa',
 'Nigeria',
 'Africa',
 'African',
 'Russian',
 'Russian',
 'Russia',
 'Russian Higher School of Economics',
 'COLUMBUS',
 'Ohio',
 'Mississippi',
 "St. Peter's Basilica",
 'Nigerian',
 'Nigerian',
 'Muscat',
 'Omani',
 'Oman',
 'White House',
 'New York',
 'Mediterranean',
 'Fort Salonga',
 'France

In [54]:
# all toponyms
fps, fns = evaluate(data_all_toponyms, processed_results)

fp: 798 | tp: 763 | fn: 3440
precision: 0.489 | recall: 0.182 | f-score: 0.265


In [55]:
fps

["Rue d'Enghein",
 'Lafayette',
 'Almonaster',
 'Franklin',
 'Marigny Plantation',
 'Press Street',
 'Franklin',
 "Faubourg D'Aunoy",
 'Chartres',
 'Franklin',
 'D',
 "'",
 'Aunoy',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 'Royal',
 'Elysian Fields',
 'Royal Street',
 'Bourbon',
 'Turkey',
 'Syria',
 'Turkey',
 'Qamishli',
 'Turkey',
 'Afrin',
 'Afrin',
 'Afrin',
 'Jamison',
 'Norbury',
 'Norbury',
 'Lake Manassas',
 'Africa',
 'Africa',
 'Nigeria',
 'Africa',
 'Africa',
 'Russia',
 'Kremlin',
 'Russia',
 'COLUM',
 'Ohio',
 'Louisville',
 'St',
 ". Peter's Basilica",
 'White House',
 'White House',
 'White House',
 'Illinois',
 'Fort Salonga',
 'Bellmore',
 'Bethany',
 'Europe',
 'Europe',
 'New York City',
 'New Orleans',
 'Russia',
 'Commonwealth of Independent States',
 'Islamic Republic',
 'Iran',
 'Iran',
 'Iran',
 'CIS',
 'Iran',
 'Iran',
 'Emir',
 'North America',
 'Damascus',
 'Douma',
 'Ghouta',
 'Douma',
 'Damascus',
 'Ghouta',
 'Douma',
 'Mo

In [56]:
fns

['area',
 'plantation',
 'mansion',
 'substation',
 'Louisiana',
 'Louisiana Purchase',
 'parcel',
 'French',
 'plat',
 'squares',
 'neighborhood',
 'city',
 'community',
 'African Americans',
 'German',
 'Irish',
 'populations',
 'blocks',
 'residents',
 'blocks',
 'intersection',
 'mills',
 'plant',
 'stables',
 'factory',
 'barn',
 'streets',
 "Rue d'Enghein",
 'street',
 'Lafayette',
 'Almonaster',
 'Franklin',
 'avenue',
 'Marigny Plantation',
 'faubourg',
 'Press Street',
 'area',
 'Franklin',
 "Faubourg D'Aunoy",
 'neighbor',
 'building',
 'Chartres',
 'Franklin',
 'Methodist church',
 'restaurant',
 'street',
 'complex',
 'Royal Street',
 'Royal Street',
 'Rue Casa Calvo',
 'Faubourg Marigny',
 '2231 Royal',
 '2231 Royal',
 'townhouse',
 'basement',
 'Greek',
 'structures',
 'Elysian Fields',
 'Royal Street',
 'BEIRUT',
 'Kurdish',
 'Turkey',
 'city',
 'Syria',
 'Turkish',
 'Syrian',
 'Kurdish',
 'Turkey',
 'Qamishli',
 'towns',
 'Syrian',
 'Turkish',
 'campaign',
 'Turkey',
 '