In [2]:
import pandas as pd
import h3
from timeit import repeat as timeit_repeat
from numpy import arange as np_arange

# Fixed Resolution and Variable Sample Size

In [2]:
# H3 resolution held fixed while the sample size varies in this experiment;
# group_by_h3 passes it to h3.geo_to_h3 as the `resolution` argument.
h3_resolution = 8

In [3]:
# CSV that pd.read_csv loads as the trips table. The URL embeds a commit hash
# rather than a branch name, so the referenced file should not change over time.
TRIPS_PATH = 'https://raw.githubusercontent.com/IsamAljawarneh/datasets/1c2a6af7dea7aa93105ac1d1d0118d07bd681d8a/data/NYC_Pilot2_PM_Part1.csv'

In [4]:
# Load the trips CSV and keep only rows where neither coordinate is exactly 0.
trips = pd.read_csv(TRIPS_PATH).loc[
    lambda frame: frame['latitude'].ne(0) & frame['longitude'].ne(0)
]

In [5]:
def group_by_h3(df, resolution=None):
    """Label every row of ``df`` with an H3 cell and group the rows by that cell.

    The cell is computed row by row with ``h3.geo_to_h3`` from the row's
    ``latitude`` and ``longitude`` columns.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns. It is
            modified in place: an ``h3`` column is added, or overwritten if it
            already exists.
        resolution: H3 resolution to index at. When omitted, the notebook-level
            ``h3_resolution`` is read at call time, so existing one-argument
            calls behave exactly as before.

    Returns:
        The ``DataFrameGroupBy`` produced by ``df.groupby('h3')``.
    """
    if resolution is None:
        # Looked up on every call (not bound as a default) so that re-running
        # the configuration cell takes effect without redefining this function.
        resolution = h3_resolution
    df['h3'] = df.apply(
        lambda row: h3.geo_to_h3(row.latitude, row.longitude, resolution=resolution),
        axis=1,
    )
    return df.groupby('h3')

In [6]:
# Benchmark group_by_h3 at a fixed H3 resolution over growing sample sizes.
# Seeding the sample makes every Restart & Run All time the same rows.
SAMPLING_SEED = 42
# Fractions 0.1 .. 1.0 are built from integer tenths. A float-step arange has a
# length and end point that depend on floating-point rounding, and a fraction
# even slightly above 1.0 would make DataFrame.sample raise.
for sampling_fraction in np_arange(1, 11) / 10:
    sampled_trips = trips.sample(frac=sampling_fraction, random_state=SAMPLING_SEED)
    # Five repeats of a single call each; the minimum is the reported figure.
    run_times = timeit_repeat(
        "group_by_h3(sampled_trips)",
        globals=globals(),  # resolve names directly in the notebook namespace
        repeat=5, number=1
    )
    print("Best of 5 runs for sampling fraction %.1f:" % sampling_fraction, min(run_times))

Best of 5 runs for sampling fraction 0.1: 0.1599410999999975
Best of 5 runs for sampling fraction 0.2: 0.32364209999999716
Best of 5 runs for sampling fraction 0.3: 0.4809246999999992
Best of 5 runs for sampling fraction 0.4: 0.6434975999999999
Best of 5 runs for sampling fraction 0.5: 0.813217599999998
Best of 5 runs for sampling fraction 0.6: 0.9770040000000009
Best of 5 runs for sampling fraction 0.7: 1.1390791999999976
Best of 5 runs for sampling fraction 0.8: 1.3212085999999985
Best of 5 runs for sampling fraction 0.9: 1.4777729999999991
Best of 5 runs for sampling fraction 1.0: 1.6231030999999945


# Fixed Sample Size and Variable Resolution

In [1]:
# Bounds of the H3 resolutions swept by this experiment. Both ends are included:
# the benchmark loop iterates up to h3_resolution_max + 1 (exclusive).
h3_resolution_min = 0
h3_resolution_max = 15

In [5]:
# CSV that pd.read_csv loads as the trips table for this experiment. The URL
# embeds a commit hash rather than a branch name.
TRIPS_PATH = 'https://raw.githubusercontent.com/IsamAljawarneh/datasets/1c2a6af7dea7aa93105ac1d1d0118d07bd681d8a/data/NYC_Pilot2_PM_Part1.csv'

In [6]:
# Read the trips CSV, then discard every row whose latitude or longitude is
# exactly 0 (rows with a missing coordinate are not affected by this filter).
trips = pd.read_csv(TRIPS_PATH)
trips = trips[~((trips['latitude'] == 0) | (trips['longitude'] == 0))]

In [7]:
def group_by_h3(df, h3_resolution):
    """Assign an H3 cell to each row of ``df`` and group the rows by it.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns. Mutated in
            place: the computed cells are stored in its ``h3`` column.
        h3_resolution: Resolution forwarded to ``h3.geo_to_h3``.

    Returns:
        The result of ``df.groupby('h3')``.
    """
    def cell_for(row):
        # One h3 lookup per row; this per-row cost is what the benchmark times.
        return h3.geo_to_h3(row.latitude, row.longitude, resolution=h3_resolution)

    cells = df.apply(cell_for, axis=1)
    df['h3'] = cells
    return df.groupby('h3')

In [8]:
# Fixed sample shared by every resolution in this experiment.
sampling_fraction = 0.6
# Seed the draw so that a Restart & Run All benchmarks the same rows each time.
SAMPLING_SEED = 42
sampled_trips = trips.sample(frac=sampling_fraction, random_state=SAMPLING_SEED)

In [9]:
# Sweep every H3 resolution from h3_resolution_min to h3_resolution_max
# (inclusive) over the same sample and report the fastest of 5 timings.
for h3_resolution in np_arange(h3_resolution_min, h3_resolution_max + 1):
    # The loop variable has to be a notebook-level name: the setup string
    # imports it from __main__ before each measurement.
    timings = timeit_repeat(
        stmt="group_by_h3(sampled_trips, h3_resolution)",
        setup="from __main__ import group_by_h3, sampled_trips, h3_resolution",
        number=1,
        repeat=5,
    )
    fastest = min(timings)
    print("Best of 5 runs for h3 resolution %d:" % h3_resolution, fastest)

Best of 5 runs for h3 resolution 0: 1.4642936999443918
Best of 5 runs for h3 resolution 1: 1.3968135002069175
Best of 5 runs for h3 resolution 2: 1.4393537999130785
Best of 5 runs for h3 resolution 3: 1.4435300999321043
Best of 5 runs for h3 resolution 4: 1.4559528999961913
Best of 5 runs for h3 resolution 5: 1.4532270000781864
Best of 5 runs for h3 resolution 6: 1.4520791999530047
Best of 5 runs for h3 resolution 7: 1.4551937999203801
Best of 5 runs for h3 resolution 8: 1.481142499949783
Best of 5 runs for h3 resolution 9: 1.4600108000449836
Best of 5 runs for h3 resolution 10: 1.500545599963516
Best of 5 runs for h3 resolution 11: 1.511336199939251
Best of 5 runs for h3 resolution 12: 1.4940585000440478
Best of 5 runs for h3 resolution 13: 1.5129698000382632
Best of 5 runs for h3 resolution 14: 1.5571924999821931
Best of 5 runs for h3 resolution 15: 1.52674450003542
