In [1]:

from transformers import BertTokenizer
import torch
from whg_dataset_loader import WhgDataset
from spabert.datasets.usgs_os_sample_loader import USGS_MapDataset
from spabert.datasets.wikidata_sample_loader import Wikidata_Geocoord_Dataset, Wikidata_Random_Dataset
from spabert.models.spatial_bert_model import SpatialBertModel
from spabert.models.spatial_bert_model import SpatialBertConfig
from transformers.models.bert.modeling_bert import BertForMaskedLM
from spabert.models.spatial_bert_model import  SpatialBertForMaskedLM

from spabert.utils.find_closest import find_ref_closest_match, sort_ref_closest_match
from spabert.utils.common_utils import load_spatial_bert_pretrained_weights, get_spatialbert_embedding, get_bert_embedding, write_to_csv
from spabert.utils.baseline_utils import get_baseline_model


# Load our SpaBERT model.
# Prefer the GPU when one is available; otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

config = SpatialBertConfig()
model = SpatialBertModel(config)

model.to(device)
model.eval()

# Load the fine-tuned checkpoint.
# map_location=device remaps the stored tensors onto the device selected above,
# so a checkpoint that was saved on a GPU can still be restored on a CPU-only
# machine instead of failing during deserialization.
# NOTE(review): torch.load unpickles the file; only load checkpoints that come
# from a trusted source.
pre_trained_model = torch.load('data/fine-tuned_model.pth', map_location=device)

# The checkpoint stores the encoder weights under a 'bert.' prefix, while this
# bare SpatialBertModel uses un-prefixed keys. Copy every weight that has a
# prefixed counterpart and report the ones that do not.
cnt_layers = 0
model_keys = model.state_dict()
for key in model_keys:
    if 'bert.'+ key in pre_trained_model:
        model_keys[key] = pre_trained_model["bert."+key]
        cnt_layers += 1
    else:
        # Keys reported here keep the value they already had in the state dict.
        print("No weight for", key)
print(cnt_layers, 'layers loaded')

# Left as the last expression so the notebook displays the key-matching report.
model.load_state_dict(model_keys)

  from .autonotebook import tqdm as notebook_tqdm


No weight for pooler.dense.weight
No weight for pooler.dense.bias
198 layers loaded


<All keys matched successfully>

In [2]:
# Load the entity-linking datasets: the WHG queries and the Wikidata candidates.

sep_between_neighbors = False

# Accumulators for the candidate side; they start empty here and are filled
# by the entity-linking cell.
wikidata_dict_per_map = {
    'wikidata_emb_list': [],
    'wikidata_qid_list': [],
    'names': [],
}

# Query side.
whg_dataset = WhgDataset(
    data_file_path='data/whg.json',
    tokenizer=tokenizer,
    max_token_len=512,
    distance_norm_factor=25,
    spatial_dist_fill=100,
    sep_between_neighbors=sep_between_neighbors,
)

# Candidate side; same loader, but different distance normalisation and fill
# values.
wikidata_dataset = WhgDataset(
    data_file_path='data/wikidata.json',
    tokenizer=tokenizer,
    max_token_len=512,
    distance_norm_factor=50000,
    spatial_dist_fill=20,
    sep_between_neighbors=sep_between_neighbors,
)

# Materialise every candidate sample once so later cells can index a plain list.
matched_wikid_dataset = []
for sample_idx in range(len(wikidata_dataset)):
    emb = wikidata_dataset[sample_idx]
    matched_wikid_dataset.append(emb)
    # Overwritten on every pass, so only the last sample's maxima remain after
    # the loop; no visible cell reads them.
    max_dist_lng = max(emb['norm_lng_list'])
    max_dist_lat = max(emb['norm_lat_list'])


In [10]:
from spabert.experiments.entity_matching.data_processing import request_wrapper
import scipy.spatial as sp
import numpy as np
## ENTITY LINKING ##


# disambigufy
def disambiguify(model, model_name, usgs_dataset, wikidata_dict_list, candset_mode = 'all_map', if_use_distance = True, select_indices = None): 
    """Rank all candidate entities for every query sample by cosine similarity.

    Args:
        model: encoder handed to the embedding helper.
        model_name: 'spatial_bert-base' or 'spatial_bert-large' selects
            get_spatialbert_embedding; any other value selects
            get_bert_embedding.
        usgs_dataset: indexable, sized query dataset; every sample must
            provide a 'pivot_name' entry.
        wikidata_dict_list: dict holding 'wikidata_emb_list' (candidate
            embeddings) and 'wikidata_qid_list' (candidate ids; presumably
            aligned index-by-index with the embeddings - confirm with caller).
        candset_mode: must be 'all_map' or 'per_map'. It is only validated;
            the ranking below does not branch on it.
        if_use_distance: forwarded as use_distance to
            get_spatialbert_embedding.
        select_indices: accepted for backward compatibility; the function
            does not use it.

    Returns:
        A list with one dict per query sample containing 'pivot_name',
        'sorted_match_qid' and 'sorted_sim_matrix' (similarities in
        descending order).

    Raises:
        AssertionError: if candset_mode is not one of the accepted values.
    """
    assert(candset_mode in ['all_map','per_map'])
    wikidata_emb_list = wikidata_dict_list['wikidata_emb_list']
    wikidata_qid_list = wikidata_dict_list['wikidata_qid_list'] 

    # The candidate matrix is identical for every query, so build it once
    # instead of converting the whole list again on each iteration.
    candidate_embs = np.array(wikidata_emb_list)
    num_queries = len(usgs_dataset)
    use_spatial = model_name == 'spatial_bert-base' or model_name == 'spatial_bert-large'

    ret_list = []
    for i in range(num_queries):
        if (i % 1000) == 0:
            print("disambigufy at " + str((i/num_queries)*100)+"%")

        # Fetch the sample a single time; it is needed for both the embedding
        # and the pivot name.
        sample = usgs_dataset[i]
        if use_spatial:
            usgs_emb = get_spatialbert_embedding(sample, model, use_distance = if_use_distance)
        else:
            usgs_emb = get_bert_embedding(sample, model)

        # cdist returns cosine *distance*; 1 - distance is cosine similarity.
        # The result has one row per candidate and a single column.
        sim_matrix = 1 - sp.distance.cdist(candidate_embs, np.array([usgs_emb]), 'cosine')
        # presumably returns the candidate ids ordered by similarity, one
        # single-element row per candidate - confirm in spabert.utils.find_closest
        closest_match_qid = sort_ref_closest_match(sim_matrix, wikidata_qid_list)

        sorted_sim_matrix = np.sort(sim_matrix, axis = 0)[::-1] # descending order

        ret_dict = dict()
        ret_dict['pivot_name'] = sample['pivot_name']
        # Unwrap the single-element rows into flat lists.
        ret_dict['sorted_match_qid'] = [a[0] for a in closest_match_qid]
        ret_dict['sorted_sim_matrix'] = [a[0] for a in sorted_sim_matrix]

        ret_list.append(ret_dict)

    return ret_list 


candset_mode = 'all_map'

# Start from empty candidate lists every time this cell runs. The lists are
# created in the dataset-loading cell, so without this reset a second run of
# this cell would append another full copy of every candidate and the ranking
# would contain duplicated QIDs.
wikidata_dict_per_map['wikidata_emb_list'] = []
wikidata_dict_per_map['wikidata_qid_list'] = []
wikidata_dict_per_map['names'] = []

# Embed every Wikidata candidate once and record its QID and name alongside.
for i in range(0, len(matched_wikid_dataset)):
    if (i % 1000) == 0:
        print("processing at: "+ str(i/len(matched_wikid_dataset)*100) + "%")
    # matched_wikid_dataset[i] was read from wikidata_dataset[i] in the
    # dataset-loading cell, so taking the name from it avoids building the
    # same sample a second time.
    entity = matched_wikid_dataset[i]
    wikidata_emb = get_spatialbert_embedding(entity, model)
    wikidata_dict_per_map['wikidata_emb_list'].append(wikidata_emb)
    wikidata_dict_per_map['wikidata_qid_list'].append(entity['qid'])
    wikidata_dict_per_map['names'].append(entity['pivot_name'])

# Rank all candidates for every WHG query and write the rankings out; the
# evaluation cell reads them back from 'data/output.csv.json'.
ret_list = disambiguify(model, 'spatial_bert-base', whg_dataset, wikidata_dict_per_map, candset_mode=candset_mode, if_use_distance=True, select_indices=None)
write_to_csv('data/', "output.csv", ret_list)

processing at: 0.0%
processing at: 21.62629757785467%
processing at: 43.25259515570934%
processing at: 64.87889273356402%
processing at: 86.50519031141869%
['Q4073426', 'Q3313715', 'Q654318', 'Q2596889', 'Q3175234', 'Q4073426', 'Q3313715', 'Q654318', 'Q2596889', 'Q3175234', 'Q4073426', 'Q3313715', 'Q654318', 'Q2596889', 'Q3175234', 'Q4073426', 'Q3313715', 'Q654318', 'Q2596889', 'Q3175234', 'Q4073426', 'Q1017', 'Q25319', 'Q28520', 'Q36405', 'Q213154', 'Q242587', 'Q1519', 'Q6655437', 'Q181109', 'Q81398', 'Q416070', 'Q25247498', 'Q6487', 'Q43387', 'Q378940', 'Q21922661', 'Q13450', 'Q889', 'Q170525', 'Q170454', 'Q191396', 'Q200805', 'Q2011761', 'Q134762', 'Q530171', 'Q672990', 'Q163132', 'Q2125333', 'Q1164081', 'Q839456', 'Q310799', 'Q558643', 'Q558861', 'Q704257', 'Q1951', 'Q389205', 'Q3561', 'Q3612324', 'Q4736810', 'Q2722762', 'Q244451', 'Q219608', 'Q16641', 'Q83081', 'Q217414', 'Q727', 'Q12824780', 'Q489660', 'Q5783', 'Q175741', 'Q1155818', 'Q38380', 'Q1049', 'Q916', 'Q26906', 'Q134346'

In [6]:
# Evaluate entity linking
import os
import pandas as pd
import json

# Ground truth: a JSON-lines file; each record keeps the entity name and QID
# under its 'info' key. (Despite the variable name, this is a file path.)
gt_dir = os.path.abspath("data/wikidata.json")

# Predictions: the JSON-lines file holding one ranking per query.
prediction_path = os.path.abspath('data/output.csv.json')

# name -> QID lookup; a later record with the same name replaces an earlier one.
gt_dict = dict()
with open(gt_dir) as gt_file:
    for raw_line in gt_file:
        info = json.loads(raw_line)['info']
        gt_dict[info['name']] = info['qid']

rank_list = []
out_dict = {'title':[],'rank':[]}

with open(prediction_path) as pred_file:
    for raw_line in pred_file:
        pred_dict = json.loads(raw_line)
        pivot_name = pred_dict['pivot_name']
        sorted_matched_uri = pred_dict['sorted_match_qid']
        sorted_sim_matrix = pred_dict['sorted_sim_matrix']

        # Queries with no ground-truth entry are left out of every metric.
        if pivot_name not in gt_dict:
            continue

        # 1-based position of the correct QID in the ranked candidate list.
        rank = sorted_matched_uri.index(gt_dict[pivot_name]) + 1
        rank_list.append(rank)
        out_dict['title'].append(pivot_name)
        out_dict['rank'].append(rank)

# Hits@k: share of evaluated queries whose correct QID is ranked within the top k.
num_evaluated = len(rank_list)
hits_at_1 = sum(r == 1 for r in rank_list)/num_evaluated
hits_at_5 = sum(r <= 5 for r in rank_list)/num_evaluated
hits_at_10 = sum(r <= 10 for r in rank_list)/num_evaluated

for hit_ratio in (hits_at_1, hits_at_5, hits_at_10):
    print(hit_ratio)

# Per-query ranks, shown as the cell's output.
out_df = pd.DataFrame(out_dict)
out_df
        



0.6570069204152249
0.7182093425605537
0.7407006920415224


Unnamed: 0,title,rank
0,A Shau,1
1,Aachen,1
2,Aarhus,1
3,Abbeville,1
4,Aberdeen,1
...,...,...
4619,Budta,3275
4620,Munich,1
4621,Milan,1
4622,Depok,1


Mean Reciprocal Rank (MRR) is a statistical measure for evaluating processes that produce a list of possible responses to a query, ordered by probability of correctness.

First we obtain the rank from the ranked list shown above.

Next we calculate the reciprocal rank for each rank. The reciprocal rank is the multiplicative inverse of the rank: for a rank of 1 the reciprocal rank is 1/1, and for a rank of 2 the reciprocal rank is 1/2.

The mean reciprocal rank is the average of the reciprocal ranks. 

This measure gives us a general conceptualization of how well our model predicts entities based on their embeddings.

An in-depth description of Mean Reciprocal Rank can be found here https://en.wikipedia.org/wiki/Mean_reciprocal_rank

An important thing to keep in mind when calculating mean reciprocal rank is that it tends to scale inversely with the size of your candidate set.

Our candidate set has a length of 4624.

In [8]:
# calculating the mean reciprocal rank (MRR)
import numpy as np

# Reciprocal rank of every evaluated query (rank 1 -> 1.0, rank 2 -> 0.5, ...).
reciprocal_list = [1. / query_rank for query_rank in rank_list]

# MRR is the arithmetic mean of those reciprocal ranks.
MRR = np.mean(reciprocal_list)

print(MRR)


0.6860230113136482
