In [1]:
import pandas as pd
import geopandas as gpd
from timeit import repeat as timeit_repeat
from numpy import arange as np_arange

In [2]:
# Geohash precision setting. No cell visible in this part of the notebook reads
# it — presumably it belongs to a geohash-based variant of this benchmark
# (TODO confirm, or remove if unused).
geohash_precision = 6

In [3]:
# Remote inputs, fetched over HTTP from raw.githubusercontent.com. Both URLs
# embed the same commit hash in their path, so the data is pinned to that
# revision of the IsamAljawarneh/datasets repository.
# CSV that is expected to provide 'latitude' and 'longitude' columns (see the load cell).
TRIPS_PATH = 'https://raw.githubusercontent.com/IsamAljawarneh/datasets/1c2a6af7dea7aa93105ac1d1d0118d07bd681d8a/data/NYC_Pilot2_PM_Part1.csv'
# GeoJSON polygon layer used as the right-hand side of the spatial join.
POLYGON_PATH = 'https://raw.githubusercontent.com/IsamAljawarneh/datasets/1c2a6af7dea7aa93105ac1d1d0118d07bd681d8a/data/nyc_polygon.geojson'

In [4]:
# Load the trip records and keep only rows where neither coordinate is exactly 0
# (presumably 0 is a placeholder for a missing position fix — TODO confirm).
trips = (
    pd.read_csv(TRIPS_PATH)
    .loc[lambda frame: (frame['latitude'] != 0) & (frame['longitude'] != 0)]
)

In [5]:
# Read the polygon layer once at cell level; group_by_sjoin() below uses this
# module-level frame as the right-hand side of every join.
neighborhoods_original = gpd.read_file(POLYGON_PATH)
def group_by_sjoin(df):
    """Spatially join ``df`` against the module-level ``neighborhoods_original`` layer.

    Args:
        df: Left-hand frame handed to ``gpd.sjoin`` (the benchmark cell passes a
            GeoDataFrame of trip points).

    Returns:
        The frame produced by ``gpd.sjoin`` with ``predicate="within"``, i.e. the
        rows of ``df`` whose geometry lies within a geometry of
        ``neighborhoods_original``, combined with that matching row's columns.
    """
    return gpd.sjoin(df, neighborhoods_original, predicate="within")

In [6]:
# Benchmark group_by_sjoin() on growing random samples of the trips (10% .. 100%).

# Fixed seed so every run of this cell times the same sampled rows.
RANDOM_SEED = 42

# Building the point geometries and assigning the CRS does not depend on the
# sampling fraction, so do it once instead of on every loop iteration.
gdf_trips = gpd.GeoDataFrame(
    trips,
    geometry=gpd.points_from_xy(trips.longitude, trips.latitude),
).set_crs('epsg:4326')

# Integer range divided by 10 instead of a float-step arange: a float step can
# yield an unexpected element count or an end value slightly above 1.0, which
# would make DataFrame.sample(frac=...) raise.
for sampling_fraction in np_arange(1, 11) / 10:
    sampled_trips = gdf_trips.sample(frac=sampling_fraction, random_state=RANDOM_SEED)

    # Pass a callable rather than a statement string with a
    # "from __main__ import ..." setup, so the timing does not depend on this
    # code living in the __main__ namespace. Each of the 5 repeats runs the join
    # once; the minimum is reported as the least-disturbed measurement.
    best_seconds = min(timeit_repeat(
        lambda: group_by_sjoin(sampled_trips),
        repeat=5, number=1
    ))
    print("Best of 5 runs for sampling fraction %.1f:" % sampling_fraction, best_seconds)

Best of 5 runs for sampling fraction 0.1: 0.04740239982493222
Best of 5 runs for sampling fraction 0.2: 0.12092790007591248
Best of 5 runs for sampling fraction 0.3: 0.16723629995249212
Best of 5 runs for sampling fraction 0.4: 0.2576520999427885
Best of 5 runs for sampling fraction 0.5: 0.2895650998689234
Best of 5 runs for sampling fraction 0.6: 0.3510345001704991
Best of 5 runs for sampling fraction 0.7: 0.4008855998981744
Best of 5 runs for sampling fraction 0.8: 0.46813639998435974
Best of 5 runs for sampling fraction 0.9: 0.5154222999699414
Best of 5 runs for sampling fraction 1.0: 0.5864599999040365
