In [1]:
import numpy as np
import pandas as pd
import ipaddress as ip

from multiprocessing import Pool

In [2]:
# free IP2Location data
# The CSV is read without a header row; column names are supplied here.
# NOTE(review): 'ios' looks like a typo for the ISO country code, but the name
# is kept unchanged because other cells may refer to it.
# 'start'/'end' are read as explicit 64-bit integers: np.int64 exists on both
# Python 2 and 3 (the builtin `long` is Python 2 only) and is wide enough for
# unsigned 32-bit address values.
ipv4 = pd.read_csv(
    'IP2LOCATION-LITE-DB9.CSV',
    names=['start', 'end', 'ios', 'country', 'state', 'city', 'lat', 'lon', 'zipcode'],
    dtype={'start': np.int64, 'end': np.int64, 'ios': str, 'country': str,
           'state': str, 'city': str, 'lat': float, 'lon': float, 'zipcode': str})

In [3]:
# Range-start values from the IP table as a plain Python list, used by
# searchIndex below. searchIndex assumes this list is in ascending order --
# presumably the CSV is already sorted by 'start'; verify against the data.
nums = ipv4['start'].tolist()
# binary search for ip index
def searchIndex(x, starts=None):
    """Return the position of the last range start that is <= x.

    Parameters
    ----------
    x : int
        Integer value to locate (ip2LatLong passes an IPv4 address as int).
    starts : list of int, optional
        Range starts in ascending order. Defaults to the module-level
        ``nums`` list, looked up at call time.

    Returns
    -------
    int or None
        * 0 when ``x <= 0``;
        * the last position when ``x`` is >= the final start;
        * otherwise the largest position ``i`` with ``starts[i] <= x``;
        * None when no such position exists (empty table, or ``x`` is
          below the first start).
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from bisect import bisect_right

    if starts is None:
        starts = nums
    if x <= 0:
        return 0
    if not starts:
        return None
    last = len(starts) - 1
    if x >= starts[last]:
        return last
    # bisect_right gives the insertion point after any entries equal to x,
    # so the entry just before it is the last start that is <= x.
    pos = bisect_right(starts, x) - 1
    return pos if pos >= 0 else None
            
def ip2LatLong(s):
    """Look up the (latitude, longitude) recorded for an IPv4 address string.

    Parameters
    ----------
    s : str
        Dotted-quad IPv4 address. Anything containing ':' is treated as
        not resolvable here and is not looked up.

    Returns
    -------
    tuple
        ``(lat, lon)`` taken from the ``ipv4`` table at the position chosen
        by ``searchIndex``, or the sentinel ``(0.0, 0.0)`` when the input is
        not a usable IPv4 address or no table position is found.
    """
    not_found = (0.0, 0.0)
    try:
        # ipaddress needs text: on Python 2 a plain `str` is bytes and would
        # be misread as a packed address, so decode it first.
        text = s.decode('ascii') if isinstance(s, bytes) else s
        if ':' in text:
            return not_found
        val = int(ip.IPv4Address(text))
        index = searchIndex(val)
        if index is None:
            return not_found
        return ipv4['lat'].iloc[index], ipv4['lon'].iloc[index]
    except (ValueError, TypeError, IndexError):
        # ValueError: malformed address (AddressValueError, decode errors).
        # TypeError: non-string input such as None/NaN.
        # IndexError: position outside the table.
        # Other exceptions (e.g. a missing column) are deliberately not
        # swallowed so real programming errors stay visible.
        return not_found
        
def parseChunk(chunk):
    """Add looked-up coordinates to one chunk of input rows.

    Works on a copy of ``chunk`` re-indexed 0..n-1, applies ``ip2LatLong``
    to every value of the 'ip' column, stores the results as 'lat_c' and
    'lon_c', and returns only the columns needed for the comparison.
    """
    frame = chunk.copy()
    frame.index = range(frame.shape[0])

    # ip2LatLong yields one (lat, lon) pair per row; transpose the pairs
    # into two parallel sequences.
    pairs = frame['ip'].apply(ip2LatLong)
    lat_values, lon_values = zip(*pairs)
    frame['lat_c'] = lat_values
    frame['lon_c'] = lon_values

    keep = ['ip', 'latitude', 'longitude', 'lat_u', 'lon_u', 'lat_c', 'lon_c']
    return frame[keep]

In [4]:
# attach the city Latitude and Longitude
nbatch = 100
# Open the input before starting workers, so a missing or unreadable file
# does not leave idle worker processes behind.
dframe = pd.read_table('part-home.dat', sep='\t', names=['ip', 'lat', 'lon', 'latitude', 'longitude', 'lat_u', 'lon_u', 'lat_s', 'lon_s'],
                  dtype={'ip':str, 'lat':float, 'lon':float, 'latitude':float, 'longitude':float, 
                         'lat_u':float, 'lon_u':float, 'lat_s':float, 'lon_s':float}, chunksize=nbatch)

pool = Pool(20)
multiprocessing_jobs = []

try:
    # Submit every chunk of nbatch rows to the pool; results are collected
    # after all workers have finished.
    for i, chunk in enumerate(dframe):
        if(((i + 1) * nbatch)%1000 == 0):
            print('Processing {:d} lines...'.format((i + 1) * nbatch))
        multiprocessing_jobs.append( pool.apply_async(parseChunk, ( chunk, )))
    # No more work will be submitted; workers exit once the queue drains.
    pool.close()
except BaseException:
    # Reading or submitting failed (or the cell was interrupted): stop the
    # workers immediately instead of leaking them in the kernel.
    pool.terminate()
    raise
finally:
    pool.join()

# job.get() re-raises any exception that occurred inside a worker.
jobs = [job.get() for job in multiprocessing_jobs]
data = pd.concat(jobs)

Processing 1000 lines...
Processing 2000 lines...
Processing 3000 lines...
Processing 4000 lines...
Processing 5000 lines...


In [5]:
# Distance between two (lat,lon)'s
def convertLatLongtoGreatCircle(ll):
    """Great-circle distance, in miles, between two points on a sphere.

    Parameters
    ----------
    ll : sequence of four numbers
        Degrees, ordered ``(lon1, lat1, lon2, lat2)``. May be a list, tuple,
        array, or a DataFrame row; values are taken by position.

    Returns
    -------
    float or None
        Distance in miles (kilometres on a sphere of radius 6371 km,
        multiplied by 0.621371). None when ``ll`` does not hold exactly four
        values or any value is None.
    """
    Rearth = 6371.0  # earth's radius in km
    km_to_miles = 0.621371
    if len(ll) != 4:
        return None

    # Unpack by iteration rather than ll[0]..ll[3]: integer subscripts on a
    # label-indexed pandas row are deprecated positional fallbacks.
    lon1_deg, lat1_deg, lon2_deg, lat2_deg = ll
    if lon1_deg is None or lat1_deg is None or lon2_deg is None or lat2_deg is None:
        return None

    factor = np.pi / 180.0  # degrees -> radians
    long1 = lon1_deg * factor
    lat1 = lat1_deg * factor
    long2 = lon2_deg * factor
    lat2 = lat2_deg * factor

    dlong = long1 - long2

    # Central angle via arctan2(numerator, denominator).
    numerator = (np.cos(lat2) * np.sin(dlong)) * (np.cos(lat2) * np.sin(dlong))
    numerator += (np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlong))**2
    numerator = np.sqrt(numerator)
    denominator = (np.sin(lat1) * np.sin(lat2) + np.cos(lat1) * np.cos(lat2) * np.cos(dlong))
    dsigma = np.arctan2(numerator, denominator)
    human_dist = Rearth * dsigma  # kilometres
    return human_dist * km_to_miles

In [6]:
# Keep only rows whose IP lookup produced coordinates. ip2LatLong reports a
# failed lookup as the sentinel (0.0, 0.0), so that exact pair is dropped;
# filtering on lat_c > 0 alone would also discard every valid location on or
# south of the equator. Rows with a missing lat_c are dropped as before.
located = data['lat_c'].notnull() & ((data['lat_c'] != 0.0) | (data['lon_c'] != 0.0))
# reset_index() (not in place) returns a fresh frame, so the column
# assignments below do not write into a slice of the original. As before,
# the old index is kept as an 'index' column.
data = data[located].reset_index()
# Miles between the IP-derived point and each of the two reference points.
data['distance'] = data[['lon_c', 'lat_c', 'lon_u', 'lat_u']].apply(convertLatLongtoGreatCircle, axis=1)
data['dist'] = data[['lon_c', 'lat_c', 'longitude', 'latitude']].apply(convertLatLongtoGreatCircle, axis=1)

In [7]:
# compare the percentiles
# Same grid for both columns: 0.01, 0.02, ... in steps of 0.01.
percentile_grid = np.arange(0.01, 1.01, 0.01)
distance_pcts = data['distance'].quantile(q=percentile_grid)
dist_pcts = data['dist'].quantile(q=np.arange(0.01, 1.01, 0.01))
pcts = pd.DataFrame({'distance': distance_pcts,
                    'dist': dist_pcts})
# Lift the display limits so every percentile row is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(pcts)

             dist     distance
0.01     0.084688     0.335968
0.02     0.242909     0.471848
0.03     0.509945     0.609817
0.04     0.738707     0.702591
0.05     0.813883     0.779314
0.06     0.977232     0.874991
0.07     0.999840     0.961952
0.08     1.084163     1.070212
0.09     1.184792     1.169801
0.10     1.253703     1.265934
0.11     1.396322     1.350952
0.12     1.518982     1.439406
0.13     1.681781     1.529875
0.14     1.821925     1.610457
0.15     1.947867     1.693459
0.16     2.044967     1.775499
0.17     2.098927     1.860820
0.18     2.259159     1.971970
0.19     2.350505     2.081383
0.20     2.498062     2.162683
0.21     2.507778     2.252469
0.22     2.588675     2.339709
0.23     2.786798     2.460259
0.24     2.856566     2.541265
0.25     2.903873     2.627683
0.26     3.064366     2.741691
0.27     3.204023     2.844551
0.28     3.247449     2.931861
0.29     3.290729     3.039284
0.30     3.327431     3.158285
0.31     3.397361     3.254024
0.32    