In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import box

### Auxiliary functions 

In [25]:
def compute_bounding_boxes(bbox: tuple[float, float, float, float], step: int, crs: str) -> "gpd.GeoDataFrame":
    """Split a bounding box into a regular ``step`` x ``step`` grid of rectangular cells.

    Parameters
    ----------
    bbox
        The area to partition, as ``(min_lon, min_lat, max_lon, max_lat)``.
        Any 4-element sequence that can be unpacked works.
    step
        Number of subdivisions along EACH axis (not a cell size); the grid
        therefore contains ``step * step`` cells. Must be a whole number >= 1.
    crs
        Coordinate reference system assigned to the returned grid.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per cell with a single ``geometry`` column. Rows are ordered
        longitude-major: the row index of the cell in longitude column ``i``
        and latitude row ``j`` is ``i * step + j``.

    Raises
    ------
    ValueError
        If ``step`` is not a whole number or is smaller than 1.
    """
    if step != int(step) or step < 1:
        raise ValueError(f"step must be a whole number >= 1, got {step!r}")
    n_cells = int(step)

    min_lon, min_lat, max_lon, max_lat = bbox

    # Compute every cell edge once. Adjacent cells then share the very same
    # edge value, and the outermost edges are exactly the bbox limits; adding
    # the cell width repeatedly could leave the last cell a rounding error
    # short of max_lon / max_lat.
    lon_edges = np.linspace(min_lon, max_lon, n_cells + 1)
    lat_edges = np.linspace(min_lat, max_lat, n_cells + 1)

    # Longitude is the outer loop so that row indices keep the
    # ``i * step + j`` layout described in the docstring.
    cells = [
        box(lon_edges[i], lat_edges[j], lon_edges[i + 1], lat_edges[j + 1])
        for i in range(n_cells)
        for j in range(n_cells)
    ]

    return gpd.GeoDataFrame({"geometry": cells}, crs=crs)

### Main code

In [3]:
# Create a GeoDataFrame for the trajectory dataset.
path_dataset = './dataset_simulator_trajectories.compressed.parquet'
gdf = pd.read_parquet(path_dataset)
# Turn the lng/lat columns into point geometries, then drop the now-redundant columns.
gdf = gpd.GeoDataFrame(gdf, geometry=gpd.points_from_xy(gdf.lng, gdf.lat), crs="EPSG:4326")
del gdf['lng'], gdf['lat']
# info() prints its summary itself and returns None, so it is called directly
# (wrapping it in display() only adds a stray "None" to the output).
gdf.info()

# Create a GeoDataFrame for the stops dataset.
# The stops file sits next to the trajectory file and shares its name as a prefix.
path_stops = f'{path_dataset}.stops.parquet'
gdf_stops = pd.read_parquet(path_stops)
gdf_stops = gpd.GeoDataFrame(gdf_stops, geometry=gpd.points_from_xy(gdf_stops.lng, gdf_stops.lat), crs="EPSG:4326")
del gdf_stops['lng'], gdf_stops['lat']
gdf_stops.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 20500075 entries, 0 to 20500074
Data columns (total 3 columns):
 #   Column    Dtype         
---  ------    -----         
 0   datetime  datetime64[ns]
 1   uid       int64         
 2   geometry  geometry      
dtypes: datetime64[ns](1), geometry(1), int64(1)
memory usage: 469.2 MB


None

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4734819 entries, 0 to 4734818
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   datetime          datetime64[ns]
 1   uid               int64         
 2   leaving_datetime  datetime64[ns]
 3   duration_secs     float64       
 4   geometry          geometry      
dtypes: datetime64[ns](2), float64(1), geometry(1), int64(1)
memory usage: 180.6 MB


None

In [29]:
# Lay a regular grid over the extent of all trajectory points.
# total_bounds yields (minx, miny, maxx, maxy) for the whole GeoDataFrame;
# step=20 means 20 subdivisions per axis, i.e. 20 x 20 = 400 cells.
bbox_trajs = gdf.total_bounds
grid = compute_bounding_boxes(bbox=bbox_trajs, step=20, crs=gdf.crs)
display(grid)

Unnamed: 0,geometry
0,"POLYGON ((-84.40993 33.72206, -84.40993 33.724..."
1,"POLYGON ((-84.40993 33.72408, -84.40993 33.726..."
2,"POLYGON ((-84.40993 33.72611, -84.40993 33.728..."
3,"POLYGON ((-84.40993 33.72814, -84.40993 33.730..."
4,"POLYGON ((-84.40993 33.73016, -84.40993 33.732..."
...,...
395,"POLYGON ((-84.36416 33.75244, -84.36416 33.754..."
396,"POLYGON ((-84.36416 33.75447, -84.36416 33.756..."
397,"POLYGON ((-84.36416 33.7565, -84.36416 33.7585..."
398,"POLYGON ((-84.36416 33.75852, -84.36416 33.760..."


In [33]:
# Tag every stop with the id of the grid cell that contains it.
# A left join keeps all stops; the matched grid row index arrives as
# "index_right" and is renamed to "cell_id".
joined = (
    gpd.sjoin(gdf_stops, grid[["geometry"]], how="left", predicate="within")
    .rename(columns={"index_right": "cell_id"})
)
joined

Unnamed: 0,datetime,uid,leaving_datetime,duration_secs,geometry,cell_id
0,2019-07-01 00:00:00,0,2019-07-01 07:40:00,27600.0,POINT (-84.39973 33.73471),106
1,2019-07-01 07:40:00,0,2019-07-01 16:55:00,33300.0,POINT (-84.36582 33.75353),395
2,2019-07-01 17:05:00,0,2019-07-01 19:00:00,6900.0,POINT (-84.40177 33.73556),86
3,2019-07-01 19:10:00,0,2019-07-01 20:05:00,3300.0,POINT (-84.40177 33.73556),86
4,2019-07-01 20:15:00,0,2019-07-01 21:20:00,3900.0,POINT (-84.40177 33.73556),86
...,...,...,...,...,...,...
4734814,2019-07-08 07:40:00,99999,2019-07-08 16:40:00,32400.0,POINT (-84.3929 33.74812),172
4734815,2019-07-08 16:40:00,99999,2019-07-09 07:40:00,54000.0,POINT (-84.36958 33.75491),356
4734816,2019-07-09 07:40:00,99999,2019-07-09 16:40:00,32400.0,POINT (-84.3929 33.74812),172
4734817,2019-07-09 16:40:00,99999,2019-07-10 07:40:00,54000.0,POINT (-84.36958 33.75491),356


In [None]:
# Now, we compute some statistics concerning the cells.
