In [3]:
import pandas as pd
import pygeohash as gh
from timeit import repeat as timeit_repeat
from numpy import arange as np_arange

In [4]:
geohash_precision = 6

In [5]:
TRIPS_PATH = "../../data/NYC_Pilot2_PM_Part1.csv"

In [6]:
def group_by_geohash(df):
    df['geohash'] = df.apply(lambda x: gh.encode(x.latitude, x.longitude, precision=geohash_precision), axis=1)
    return df.groupby('geohash')

In [7]:
trips = pd.read_csv(TRIPS_PATH)
trips = trips[
    (trips['latitude']  != 0) &
    (trips['longitude'] != 0)
]

In [9]:
for sampling_fraction in np_arange(0.1, 1.1, 0.1):
    sampled_trips = trips.sample(frac=sampling_fraction)
    print("Best of 5 runs for sampling fraction %.1f:" % sampling_fraction, min(timeit_repeat(
        "group_by_geohash(sampled_trips)",
        setup="from __main__ import group_by_geohash, sampled_trips",
        repeat=5, number=1
    )))

Best of 5 runs for sampling fraction 0.1: 0.4239414995536208
Best of 5 runs for sampling fraction 0.2: 0.8740012999624014
Best of 5 runs for sampling fraction 0.3: 1.2761177998036146
Best of 5 runs for sampling fraction 0.4: 1.7552982000634074
Best of 5 runs for sampling fraction 0.5: 2.078783600125462
Best of 5 runs for sampling fraction 0.6: 2.489771000109613
Best of 5 runs for sampling fraction 0.7: 3.0897793998010457
Best of 5 runs for sampling fraction 0.8: 3.677657399792224
Best of 5 runs for sampling fraction 0.9: 3.6898659002035856
Best of 5 runs for sampling fraction 1.0: 4.030581200029701
