In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

from src.stop_explorer import StopExplorer
from src.grid_partitioning import Grid

### Main code

In [2]:
# Create a GeoDataFrame for the stops dataset.
# Each row is one stop of a user (`uid`): it starts at `datetime`, ends at
# `leaving_datetime`, lasts `duration_secs` seconds and is located at the point
# in `geometry`. `hour_start`/`hour_end` hold the hour of the two timestamps;
# `weekday` and `weekend` are small integer calendar flags.
path_stops = './data_simulator/huge_dataset/dataset_simulator_trajectories.compressed.parquet.stops.parquet'
stop_explorer = StopExplorer(path_stops)
df_stops = stop_explorer.get_df_stops()

# Show a (truncated) preview of the rows, then the column dtypes and memory footprint.
display(df_stops)
df_stops.info()

Unnamed: 0,datetime,uid,leaving_datetime,duration_secs,geometry,hour_start,hour_end,weekday,weekend
0,2019-07-01 00:00:00,0,2019-07-01 13:52:00,49920.0,POINT (-84.36998 33.75245),0,13,0,0
1,2019-07-01 13:52:00,0,2019-07-01 15:48:00,6960.0,POINT (-84.37004 33.75206),13,15,0,0
2,2019-07-01 15:52:00,0,2019-07-01 16:58:00,3960.0,POINT (-84.36825 33.75266),15,16,0,0
3,2019-07-01 16:58:00,0,2019-07-01 18:04:00,3960.0,POINT (-84.36998 33.75245),16,18,0,0
4,2019-07-01 18:06:00,0,2019-07-01 19:28:00,4920.0,POINT (-84.36998 33.75245),18,19,0,0
...,...,...,...,...,...,...,...,...,...
4393268,2019-07-09 20:58:00,99999,2019-07-09 22:54:00,6960.0,POINT (-84.37187 33.75968),20,22,1,0
4393269,2019-07-09 22:54:00,99999,2019-07-10 07:52:00,32280.0,POINT (-84.36952 33.7616),22,7,1,0
4393270,2019-07-10 07:52:00,99999,2019-07-10 16:36:00,31440.0,POINT (-84.38731 33.76169),7,16,2,0
4393271,2019-07-10 16:36:00,99999,2019-07-10 22:44:00,22080.0,POINT (-84.36952 33.7616),16,22,2,0


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4393273 entries, 0 to 4393272
Data columns (total 9 columns):
 #   Column            Dtype         
---  ------            -----         
 0   datetime          datetime64[ns]
 1   uid               int64         
 2   leaving_datetime  datetime64[ns]
 3   duration_secs     float64       
 4   geometry          geometry      
 5   hour_start        uint8         
 6   hour_end          uint8         
 7   weekday           uint8         
 8   weekend           uint8         
dtypes: datetime64[ns](2), float64(1), geometry(1), int64(1), uint8(4)
memory usage: 184.3 MB


### Materialize a uniform grid, whose cells have a given side length, over the bounding box enclosing the stop segments.

In [3]:
# Create a grid whose cells have a side of 100 meters and compute it over the
# stops. The value returned by the second call is the last expression of the
# cell, so it is what gets displayed (one polygon per grid cell).
grid = Grid(grid_cell_length_meters = 100)
grid.compute_grid_over_geodata(df_stops)

# Optional visual check, left disabled ("mappa" = "map"); presumably renders the
# grid on a map -- TODO confirm what `generate_grid_map` returns.
# mappa = grid.generate_grid_map()
# mappa

Unnamed: 0,geometry
0,"POLYGON ((-84.41088 33.72895, -84.41085 33.729..."
1,"POLYGON ((-84.41085 33.72985, -84.41082 33.730..."
2,"POLYGON ((-84.41082 33.73076, -84.4108 33.7316..."
3,"POLYGON ((-84.4108 33.73166, -84.41077 33.7325..."
4,"POLYGON ((-84.41077 33.73256, -84.41074 33.733..."
...,...
1711,"POLYGON ((-84.3638 33.75861, -84.36378 33.7595..."
1712,"POLYGON ((-84.36378 33.75951, -84.36375 33.760..."
1713,"POLYGON ((-84.36375 33.76041, -84.36372 33.761..."
1714,"POLYGON ((-84.36372 33.76131, -84.36369 33.762..."


### Compute the spatial join between the stop segments and the grid cells.

In [4]:
# Join the stops with the grid computed above. The result keeps every column of
# `df_stops` and adds a `cell_id` column (presumably the id of the grid cell that
# contains the stop point -- TODO confirm the join predicate used by `Grid`).
join = grid.compute_join_other_geodata(df_stops)
join

Unnamed: 0,datetime,uid,leaving_datetime,duration_secs,geometry,hour_start,hour_end,weekday,weekend,cell_id
0,2019-07-01 00:00:00,0,2019-07-01 13:52:00,49920.0,POINT (-84.36998 33.75245),0,13,0,0,1509
1,2019-07-01 13:52:00,0,2019-07-01 15:48:00,6960.0,POINT (-84.37004 33.75206),13,15,0,0,1508
2,2019-07-01 15:52:00,0,2019-07-01 16:58:00,3960.0,POINT (-84.36825 33.75266),15,16,0,0,1548
3,2019-07-01 16:58:00,0,2019-07-01 18:04:00,3960.0,POINT (-84.36998 33.75245),16,18,0,0,1509
4,2019-07-01 18:06:00,0,2019-07-01 19:28:00,4920.0,POINT (-84.36998 33.75245),18,19,0,0,1509
...,...,...,...,...,...,...,...,...,...,...
4393268,2019-07-09 20:58:00,99999,2019-07-09 22:54:00,6960.0,POINT (-84.37187 33.75968),20,22,1,0,1438
4393269,2019-07-09 22:54:00,99999,2019-07-10 07:52:00,32280.0,POINT (-84.36952 33.7616),22,7,1,0,1519
4393270,2019-07-10 07:52:00,99999,2019-07-10 16:36:00,31440.0,POINT (-84.38731 33.76169),7,16,2,0,855
4393271,2019-07-10 16:36:00,99999,2019-07-10 22:44:00,22080.0,POINT (-84.36952 33.7616),16,22,2,0,1519


In [None]:
# Build the (user, grid cell) table: one row per `(uid, cell_id)` pair that
# occurs in the join, carrying summary statistics of the stops of that user in
# that cell.
#
# The stop duration is first expressed in minutes. The new column is written
# directly into `join`, and later cells rely on it being there.
join['duration_mins'] = join['duration_secs'].div(60)

# Output column name -> (source column, aggregation function).
# NOTE(review): the two hour columns are averaged arithmetically, so stops that
# span midnight (e.g. start at 22, end at 7) pull the means in a way that may
# not be meaningful -- confirm this is intended.
stats_pairs_cell_uid = {
    'num_stops': pd.NamedAgg('uid', 'size'),
    'mean_duration_mins': pd.NamedAgg('duration_mins', 'mean'),
    'mean_hour_start': pd.NamedAgg('hour_start', 'mean'),
    'mean_hour_end': pd.NamedAgg('hour_end', 'mean'),
}
agg_cell_uid = (
    join
    .groupby(by=['uid', 'cell_id'])
    .agg(**stats_pairs_cell_uid)
)

agg_cell_uid

Unnamed: 0_level_0,Unnamed: 1_level_0,num_stops,mean_duration_mins,mean_hour_start,mean_hour_end
uid,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1508,10,102.000000,14.400000,16.200000
0,1509,38,313.105263,16.947368,16.394737
0,1548,8,106.750000,17.750000,19.750000
0,1586,4,91.000000,16.500000,18.500000
1,606,14,518.857143,10.857143,12.500000
...,...,...,...,...,...
99998,1589,1,144.000000,17.000000,19.000000
99999,855,6,524.000000,7.000000,16.000000
99999,1438,25,91.200000,15.920000,15.480000
99999,1481,9,61.111111,13.222222,14.555556


In [8]:
# For each pair (uid, cell_id), compute the fraction of its stops that happened
# during the weekend, i.e. count(stops with weekend == 1) / count(all stops).
#
# The mean of the boolean mask `weekend == 1` within each group is exactly that
# ratio. Pairs whose stops all fall on weekdays get 0.0 and pairs whose stops
# all fall on the weekend get 1.0, so there are no missing
# (uid, cell_id, weekend) combinations to reindex or fill afterwards. This
# replaces the value_counts -> Python-built MultiIndex -> reindex -> fillna ->
# droplevel sequence with a single vectorized group-by.
#
# Note that this is a fraction of the *number of stops*, not of the time spent:
# stop durations are not taken into account.
weekend_analysis = (
    join['weekend'].eq(1)
    .groupby([join['uid'], join['cell_id']])
    .mean()
    .rename('proportion')
)
display(weekend_analysis)

# The series is indexed by (uid, cell_id) like `agg_cell_uid`, so the assignment
# aligns on the index and every pair receives its own value.
agg_cell_uid['frac_time_weekend'] = weekend_analysis
agg_cell_uid.head(20)

uid    cell_id
0      1508       0.000000
       1509       0.394737
       1548       0.125000
       1586       0.250000
1      606        0.142857
                    ...   
99998  1589       0.000000
99999  855        0.333333
       1438       0.160000
       1481       0.000000
       1519       0.076923
Name: proportion, Length: 385017, dtype: float64

Unnamed: 0_level_0,Unnamed: 1_level_0,num_stops,mean_duration_mins,mean_hour_start,mean_hour_end,frac_time_weekend
uid,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1508,10,102.0,14.4,16.2,0.0
0,1509,38,313.105263,16.947368,16.394737,0.394737
0,1548,8,106.75,17.75,19.75,0.125
0,1586,4,91.0,16.5,18.5,0.25
1,606,14,518.857143,10.857143,12.5,0.142857
1,643,13,110.153846,16.923077,15.076923,0.384615
1,646,14,94.285714,18.071429,14.642857,0.285714
1,817,8,526.0,8.0,17.0,0.0
2,1115,3,114.0,13.0,14.666667,0.666667
2,1155,10,859.2,14.4,7.0,0.2


In [7]:
# Per-cell statistics: for every grid cell that received at least one stop,
# count the stops and the distinct users, and summarize the stop durations.
#
# NOTE: this cell reads the `duration_mins` column that the
# (user, cell) aggregation cell adds to `join`, so that cell has to run first.
# The saved execution counts show this cell was run out of notebook order.
stats_config = {
    'num_stops': pd.NamedAgg('uid', 'size'),
    'num_users': pd.NamedAgg('uid', 'nunique'),
    'mean_duration_mins': pd.NamedAgg('duration_mins', 'mean'),
    'median_duration_mins': pd.NamedAgg('duration_mins', 'median'),
}
stats_cells = (
    join
    .groupby(by='cell_id')
    .agg(**stats_config)
)

stats_cells

Unnamed: 0_level_0,num_stops,num_users,mean_duration_mins,median_duration_mins
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,409,34,672.977995,634.0
5,311,24,562.218650,500.0
8,1806,231,523.334441,520.0
14,3711,367,313.343034,476.0
16,2139,258,526.517064,518.0
...,...,...,...,...
1681,60,6,794.666667,791.0
1682,45,4,727.155556,766.0
1683,114,11,535.526316,173.0
1684,31013,1672,93.164931,92.0
