In [1]:
## import data manipulation packages
from math import radians

import numpy as np
import pandas as pd

# DistanceMetric lives in sklearn.metrics since scikit-learn 1.0; the old
# sklearn.neighbors alias was removed in 1.3. Try the current location first
# and fall back to the old one so the notebook runs on either version.
try:
    from sklearn.metrics import DistanceMetric
except ImportError:
    from sklearn.neighbors import DistanceMetric

In [2]:
## load the world-cities workbook into 'cities', then keep only the rows
## that have a value in every column
## NOTE(review): how='any' inspects all columns, including the ones that are
## discarded in the next cell - confirm that filtering on those is intended
workbook_path = 'data/worldcities.xlsx'
cities = pd.read_excel(workbook_path)
ct = cities.dropna(how = 'any', axis = 'index')

In [3]:
## discard the columns that are not used further on
unused_columns = ['city_ascii', 'country', 'iso2', 'admin_name', 'capital', 'population']
ct = ct.drop(columns = unused_columns)

In [4]:
## conversion of the coordinates from degrees to radians
## (the columns are overwritten, so re-running this cell converts them again)
for coord in ('lat', 'lng'):
    ct[coord] = np.radians(ct[coord])

In [5]:
## get the haversine distance metric from scikit-learn's DistanceMetric
dist = DistanceMetric.get_metric("haversine")

In [6]:
## preview the [lat, lng] array that is used as input for the distance calculation
coord_columns = ['lat', 'lng']
ct.loc[:, coord_columns].to_numpy()

array([[ 0.62290277,  2.43808883],
       [-0.10846523,  1.86479878],
       [ 0.50021136,  1.34791778],
       ...,
       [-0.57810541, -1.12308098],
       [ 0.05539326, -1.14400398],
       [ 0.8546249 ,  1.68918725]])

In [7]:
## run the pairwise function to get the distances, then multiply by 6373,
## taken here as the Earth's spherical radius in km
coords = ct[['lat', 'lng']].to_numpy()
6373 * dist.pairwise(coords)

array([[    0.        ,  5787.90376514,  5835.24850156, ...,
        17802.44164785, 14963.27221178,  3763.88990049],
       [ 5787.90376514,     0.        ,  5011.26756513, ...,
        15548.07263324, 19112.74637668,  6215.36837225],
       [ 5835.24850156,  5011.26756513,     0.        , ...,
        16340.95395707, 14699.22965152,  2805.95968927],
       ...,
       [17802.44164785, 15548.07263324, 16340.95395707, ...,
            0.        ,  4039.25704802, 17665.78252814],
       [14963.27221178, 19112.74637668, 14699.22965152, ...,
         4039.25704802,     0.        , 13975.81579146],
       [ 3763.88990049,  6215.36837225,  2805.95968927, ...,
        17665.78252814, 13975.81579146,     0.        ]])

In [8]:
## create the distance matrix with the city ids as both row and column labels
city_ids = ct['id']
distance = pd.DataFrame(
    dist.pairwise(ct[['lat', 'lng']].to_numpy()) * 6373,
    index = city_ids,
    columns = city_ids,
)
## end the cell on the frame itself so Jupyter renders its rich (truncated)
## table instead of the plain-text dump produced by print()
distance

id            1392685764    1360771077    1356872604    1356226629  \
id                                                                   
1392685764      0.000000   5787.903765   5835.248502   6736.368099   
1360771077   5787.903765      0.000000   5011.267565   4662.287622   
1356872604   5835.248502   5011.267565      0.000000   1166.997459   
1356226629   6736.368099   4662.287622   1166.997459      0.000000   
1608618140   2997.194340   2790.717691   4755.337989   5135.012933   
...                  ...           ...           ...           ...   
1558125707  13121.910379  18749.404488  15165.430715  15817.752584   
1887613470   8894.127127   6573.440879   3059.249501   2331.235749   
1032552145  17802.441648  15548.072633  16340.953957  15498.485119   
1862255876  14963.272212  19112.746377  14699.229652  14845.181740   
1496403046   3763.889900   6215.368372   2805.959689   3965.565663   

id            1608618140    1156073548    1076532519    1410836482  \
id                 

In [9]:
## find the row for London (GBR) to read off its 'id'
is_london = ct['city'].eq('London')
is_gbr = ct['iso3'].eq('GBR')
ct.loc[is_london & is_gbr]

Unnamed: 0,city,lat,lng,iso3,id
34,London,0.89897,-0.002225,GBR,1826645935


In [10]:
## get the starting line (London, GBR) from the distance matrix
## the id is looked up in 'ct' instead of being pasted in by hand, so the cell
## keeps working if the dataset's ids change; .iloc[0] takes the first match
## and raises IndexError when no London/GBR row exists
london_mask = (ct['city'] == 'London') & (ct['iso3'] == 'GBR')
london_id = ct.loc[london_mask, 'id'].iloc[0]
distance.loc[[london_id]]

id,1392685764,1360771077,1356872604,1356226629,1608618140,1156073548,1076532519,1410836482,1484247881,1156237133,...,1156608652,1032937217,1032815504,1032163046,1032824407,1558125707,1887613470,1032552145,1862255876,1496403046
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1826645935,9561.720229,11722.277893,6711.101944,7200.011313,10738.339656,9205.257531,9500.397619,8859.502225,8931.505631,9498.768799,...,7894.957917,11302.078019,12371.501064,11018.855747,12266.712345,8618.987807,6065.321395,11300.324944,8056.532077,6365.946966
