In [7]:
import math
import re
from collections import Counter
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians
from sklearn.metrics.pairwise import haversine_distances

# Files were cleaned and organized in Excel so the cleaned files were uploaded

osm = pd.read_excel('osm_edit.xlsx')
google = pd.read_excel('googlepoi.xlsx')
match = pd.read_excel('matching.xlsx')



osm_clean = osm # An edited copy

# I merged addresses on the OpenStreetMap dataset
osm_clean['address'] = osm_clean[osm_clean.columns[8:24]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

# Then I drop these columns, leaving only one address column
osm_clean.drop(["address_details_level", "address_house_nr", "address_street", "address_zip_code",
 "address_city", "address_country", "address_full", "address_region_neighborhood", 
 "address_region_suburb", "address_region_district", "address_region_province", "address_region_state", 
 "address_house_name", "address_place", "address_block", "address_details_level", "address_details_flats", 'address_details_unit'], axis=1, inplace=True)

WORD = re.compile(r"\w+")

# A function to calculate cosine distances using haversine_distances
def check_distance(osm_lat, osm_long, google_lat, google_long):
    osm_geodata = (osm_lat, osm_long)
    google_geodata = (google_lat, google_long)
    osm_in_radians = [radians(_) for _ in osm_geodata]
    google_in_radians = [radians(_) for _ in google_geodata]
    result = haversine_distances([osm_in_radians, google_in_radians])
    distance = result * 6371000/1000
    distance_km = distance.item(1)
    
    return distance_km

# A function to get cosine similarities
def calculate_cosine(vect1, vect2):
    joint_var = set(vect1.keys()) & set(vect2.keys())
    numerator = sum([vect1[x] * vect2[x] for x in joint_var])

    sum1 = sum([vect1[x] ** 2 for x in list(vect1.keys())])
    sum2 = sum([vect2[x] ** 2 for x in list(vect2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

    
# Change texts to vectors
def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

# Get unique dataframe objects for specific OSM POI
def get_unique_df(id):
    unique_df = osm_clean[osm_clean['osm_id']== id]
    return unique_df

# Get unique dataframes for Google POI
def google_df(id):
    iD = str(id)
    google_df = google[google['internal_id'] == iD]
    return google_df

# Turn dataframe objects to strings
def get_strings(df):
    dflist = df[['name','tags','categories', 'address']].values.tolist()
    dfstring = ' '.join([str(item) for item in dflist])
    purestring = " ".join(re.split("[^a-zA-Z]*", dfstring))
    finalstring = purestring.replace("n a n","")
    return finalstring

# Check cosine similarities
def check_similarity(text1, text2):
    text1 = text_osm
    text2 = text_google
    vect1 = text_to_vector(text1)
    vect2 = text_to_vector(text2)

    cosine = calculate_cosine(vect1, vect2)
    return cosine


sim_score = []
geoDist = []

# We iterate through every query, evaluate their geospatial distances using longitudinal data
for row in range(len(match)):
    _osm_id = match.loc[row, 'osm_id']
    _google_id = match.loc[row, 'internal_id']
    
    
    google_lat = google.loc[google['internal_id'] == _google_id, 'latitude']
    google_long = google.loc[google['internal_id'] == _google_id, 'longitude']
    osm_lat = osm.loc[osm['osm_id'] == _osm_id, 'latitude']
    osm_long = osm.loc[osm['osm_id'] == _osm_id, 'longitude']
    
    dist = check_distance(osm_lat, osm_long, google_lat, google_long)
    geoDist.append(dist)
 
 
    osm_search = get_unique_df(_osm_id)
    google_search = google_df(_google_id)
    text_osm = get_strings(osm_search)
    text_google = get_strings(google_search)
    similarities = check_similarity(text_osm, text_google)
    sim_score.append(similarities)
    
    
# To obtain confidence scores. 
pred = zip(geoDist, sim_score)
confidence_score = []
for x, y in pred: 
    cos_sim_scaled = (1 / (1 + x)) * 0.7 # I scaled OSM entries by this factor
    sim_score_scaled = y * 0.3 # and scaled text similarities by this factor. This is to give more weight to location data
    
    score = cos_sim_scaled + sim_score_scaled
    confidence_score.append(score)
    
match['confidence_score'] = confidence_score
match.to_csv("match_results.csv")
match.head(30) 

Unnamed: 0,osm_type,osm_id,internal_id,query,confidence_score
0,way,154470603,0x130e44cd6e20475f:0x671441b8dc03be60,Kalkara,0.690611
1,node,5896564791,0x130e45014da2fa03:0xf743a120b9194c06,"Spar, Triq TignÃ©",0.954448
2,node,2471609507,0x130e4501edfb329b:0x9edcdba888218c47,"Bayview Hotel & Apartments, The Strand 143, Sl...",0.944391
3,node,6222651588,0x130e450a6e7438bd:0x5af47e8f69212d52,"David Hardware Store, Triq Carlo Manche",0.963236
4,node,6635172974,0x130e451ad5f0c673:0x1973502978d5c025,"Chef Lee, Triq d'Argens",0.951962
5,relation,12634993,0x130e451d009c771d:0x1cf6ec5b812ebc75,"Consulate General of Austria, Vjal Sir Temi Za...",0.975533
6,node,7442975758,0x130e451fb816909b:0xd8a592326a69a444,"111 Art Gallery, Triq il-Gnien 38/40, GZR1411,...",0.967438
7,way,532280196,0x130e4524d863accb:0x59b5320a6f75ae7b,Triq TignÃ©,0.838723
8,node,6762940584,0x130e4525313284cd:0x74fab348121af42f,"HB mini market, Mattew Pulis Street, Sliema",0.964325
9,way,339461449,0x130e4525391b83bd:0x8e64cfd286b0304a,"Rocca Nettuno Suites, Mattew Pulis Street, SLM...",0.96584


In [3]:
match

Unnamed: 0,osm_type,osm_id,internal_id,query,confidence_score
0,way,154470603,0x130e44cd6e20475f:0x671441b8dc03be60,Kalkara,0.690611
1,node,5896564791,0x130e45014da2fa03:0xf743a120b9194c06,"Spar, Triq TignÃ©",0.927404
2,node,2471609507,0x130e4501edfb329b:0x9edcdba888218c47,"Bayview Hotel & Apartments, The Strand 143, Sl...",0.897853
3,node,6222651588,0x130e450a6e7438bd:0x5af47e8f69212d52,"David Hardware Store, Triq Carlo Manche",0.950928
4,node,6635172974,0x130e451ad5f0c673:0x1973502978d5c025,"Chef Lee, Triq d'Argens",0.920129
...,...,...,...,...,...
804,node,6644698685,0x80dd4ae5d53cf7db:0xc37525eaf8510bc8,"The Brew, The Strand",0.218432
805,node,4813878426,0x812611d8e83c1d33:0x413c6b46ee1830fa,"San Quintin Bakery, Triq ÄŠensu Borg",0.246063
806,way,342698116,0x8816d3501297c131:0x3c9f57ffc1d42ba5,"Police Garage, Il-Foss Ta'Notre Dame",0.226866
807,node,7043230414,0x88388a870b71620b:0xe94a08098b3ebeca,"L'Occitan en Provence, The Strand",0.172332
