In [6]:
import pandas as pd
from s2 import s2
from timeit import repeat as timeit_repeat
from numpy import arange as np_arange

# Fixed Resolution and Variable Sample Size

In [2]:
# Level that group_by_s2 passes to s2.geo_to_s2; kept constant in this
# experiment while the sample size is varied.
s2_resolution = 14

In [3]:
# Location of the CSV that pd.read_csv loads into `trips`. The URL addresses
# the file by commit hash rather than by branch name.
TRIPS_PATH = 'https://raw.githubusercontent.com/IsamAljawarneh/datasets/1c2a6af7dea7aa93105ac1d1d0118d07bd681d8a/data/NYC_Pilot2_PM_Part1.csv'

In [4]:
# Read the trips CSV, then keep only the rows in which neither coordinate is
# exactly 0.
trips = pd.read_csv(TRIPS_PATH)
trips = trips.loc[trips['latitude'].ne(0) & trips['longitude'].ne(0)]

In [5]:
def group_by_s2(df, resolution=None):
    """Group the rows of ``df`` by the value ``s2.geo_to_s2`` returns for each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``latitude`` and ``longitude`` columns. It is not
        modified: the ``s2`` key column is added to a derived frame.
    resolution : int, optional
        Level passed as the third argument of ``s2.geo_to_s2``. When omitted,
        the module-level ``s2_resolution`` is read at call time, so existing
        ``group_by_s2(df)`` calls behave as before.

    Returns
    -------
    pandas GroupBy object
        ``df`` plus an ``s2`` column, grouped by that column.
    """
    level = s2_resolution if resolution is None else resolution
    cell_ids = df.apply(
        lambda row: s2.geo_to_s2(row.latitude, row.longitude, level),
        axis=1,
    )
    # assign() builds a new frame instead of writing into the caller's one, so
    # repeated (timed) calls all start from the same, unmodified input.
    return df.assign(s2=cell_ids).groupby('s2')

In [6]:
# Time group_by_s2 on progressively larger random samples of `trips`.
# The fractions are derived from the integers 1..10 so the sweep always has
# exactly ten steps and can never drift past 1.0 through float stepping.
for tenths in range(1, 11):
    sampling_fraction = tenths / 10
    # Fixed seed: every rerun benchmarks the same rows for a given fraction.
    sampled_trips = trips.sample(frac=sampling_fraction, random_state=42)
    # globals() gives timeit this namespace directly, so the statement does
    # not depend on the code being importable from __main__.
    run_times = timeit_repeat(
        "group_by_s2(sampled_trips)",
        globals=globals(),
        repeat=5, number=1
    )
    print("Best of 5 runs for sampling fraction %.1f:" % sampling_fraction, min(run_times))

Best of 5 runs for sampling fraction 0.1: 0.39333940000000034
Best of 5 runs for sampling fraction 0.2: 0.7920401000000012
Best of 5 runs for sampling fraction 0.3: 1.1719258000000004
Best of 5 runs for sampling fraction 0.4: 1.5676227000000011
Best of 5 runs for sampling fraction 0.5: 1.965718299999999
Best of 5 runs for sampling fraction 0.6: 2.3710004
Best of 5 runs for sampling fraction 0.7: 2.780898399999998
Best of 5 runs for sampling fraction 0.8: 3.1408685000000105
Best of 5 runs for sampling fraction 0.9: 3.5363192000000083
Best of 5 runs for sampling fraction 1.0: 3.9809781999999956


# Fixed Sample Size and Variable Resolution

In [1]:
# Inclusive lower and upper bounds of the resolution sweep: the benchmark
# loop runs while s2_resolution <= s2_resolution_max.
s2_resolution_min = 0
s2_resolution_max = 30

In [4]:
# Location of the CSV that pd.read_csv loads into `trips`. The URL addresses
# the file by commit hash rather than by branch name.
TRIPS_PATH = 'https://raw.githubusercontent.com/IsamAljawarneh/datasets/1c2a6af7dea7aa93105ac1d1d0118d07bd681d8a/data/NYC_Pilot2_PM_Part1.csv'

In [7]:
# Read the trips CSV, then keep only the rows in which neither coordinate is
# exactly 0.
trips = pd.read_csv(TRIPS_PATH)
trips = trips.loc[trips['latitude'].ne(0) & trips['longitude'].ne(0)]

In [8]:
def group_by_s2(df, resolution=None):
    """Group the rows of ``df`` by the value ``s2.geo_to_s2`` returns for each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``latitude`` and ``longitude`` columns. It is not
        modified: the ``s2`` key column is added to a derived frame.
    resolution : int, optional
        Level passed as the third argument of ``s2.geo_to_s2``. When omitted,
        the module-level ``s2_resolution`` is read at call time, so existing
        ``group_by_s2(df)`` calls behave as before.

    Returns
    -------
    pandas GroupBy object
        ``df`` plus an ``s2`` column, grouped by that column.
    """
    level = s2_resolution if resolution is None else resolution
    cell_ids = df.apply(
        lambda row: s2.geo_to_s2(row.latitude, row.longitude, level),
        axis=1,
    )
    # assign() builds a new frame instead of writing into the caller's one, so
    # repeated (timed) calls all start from the same, unmodified input.
    return df.assign(s2=cell_ids).groupby('s2')

In [9]:
# One sample of `trips`, reused for every resolution in the sweep. The seed
# makes the draw repeatable, so reruns time the same rows.
sampling_fraction = 0.6
sampled_trips = trips.sample(frac=sampling_fraction, random_state=42)

In [10]:
# Time group_by_s2 on the same sample at every resolution from
# s2_resolution_min to s2_resolution_max (both inclusive).
# The loop variable must be the module-level name `s2_resolution`:
# group_by_s2 reads that global on each call.
for s2_resolution in range(s2_resolution_min, s2_resolution_max + 1):
    # globals() gives timeit this namespace directly, so the statement does
    # not depend on the code being importable from __main__.
    run_times = timeit_repeat(
        "group_by_s2(sampled_trips)",
        globals=globals(),
        repeat=5, number=1
    )
    print("Best of 5 runs for s2 resolution %d:" % s2_resolution, min(run_times))

Best of 5 runs for s2 resolution 0: 3.7777863999363035
Best of 5 runs for s2 resolution 1: 3.7273387000896037
Best of 5 runs for s2 resolution 2: 3.748213700018823
Best of 5 runs for s2 resolution 3: 3.752453599823639
Best of 5 runs for s2 resolution 4: 3.739897499792278
Best of 5 runs for s2 resolution 5: 3.718327600043267
Best of 5 runs for s2 resolution 6: 3.755277000134811
Best of 5 runs for s2 resolution 7: 3.752497499808669
Best of 5 runs for s2 resolution 8: 3.7401654000859708
Best of 5 runs for s2 resolution 9: 3.7474414000753313
Best of 5 runs for s2 resolution 10: 3.710450500017032
Best of 5 runs for s2 resolution 11: 3.735961399972439
Best of 5 runs for s2 resolution 12: 3.766419199993834
Best of 5 runs for s2 resolution 13: 3.70429449994117
Best of 5 runs for s2 resolution 14: 3.7508956999517977
Best of 5 runs for s2 resolution 15: 3.7018271998967975
Best of 5 runs for s2 resolution 16: 3.8282753999810666
Best of 5 runs for s2 resolution 17: 3.822458199923858
Best of 5 runs