In [None]:
import geopandas as gpd
import pandas as pd
from datetime import date
import numpy as np
import os

In [None]:
# Directory and base name that identify the dataset used in this notebook.
gpx_path, gpx_name = './Traiettorie Parigi', 'paris_centre'

# The input is the '<gpx_name>.processed.parquet' file inside gpx_path.
traj_filename = os.path.join(gpx_path, gpx_name + '.processed.parquet')

# Load the file into a (Geo)DataFrame and print a summary of its columns.
traj_df = gpd.read_parquet(traj_filename)
traj_df.info()

### Segment detection from traces

In [None]:
# Minimum time gap between two consecutive samples of the same uid for the
# second sample to be considered the start of a new segment.
threshold = pd.Timedelta(minutes=20)

# Segment detection, expressed as a single pipeline:
#   1. order the samples by uid and, within each uid, by timestamp;
#   2. time_diff: time elapsed since the previous sample of the same uid
#      (missing for the first sample of each uid);
#   3. new_trajectory: True where that gap is greater than or equal to the
#      threshold (a missing gap compares as False);
#   4. tid: running count of the flags within each uid, i.e. a per-uid
#      segment id starting from 0;
#   5. keep only the columns of interest, dropping the helper columns.
traj_df = (
    traj_df
    .sort_values(by=['uid', 'time'])
    .assign(
        time_diff=lambda df: df.groupby('uid', observed=True)['time'].diff(),
        new_trajectory=lambda df: df['time_diff'] >= threshold,
        tid=lambda df: (
            df.groupby('uid', observed=True)['new_trajectory']
            .cumsum()
            .astype(np.int32)
        ),
    )
    .loc[:, ['time', 'geometry', 'uid', 'tid']]
)
traj_df.info()

In [None]:
# Write the segmented trajectories to '<gpx_name>.processed.segmented.parquet'
# inside gpx_path. Note that traj_filename now refers to this output file.
traj_filename = os.path.join(gpx_path, gpx_name + '.processed.segmented.parquet')
traj_df.to_parquet(path=traj_filename)

### Compute some basic statistics about users and trajectories

In [None]:
# Number of segments of each trace (uid), computed as the largest segment id
# of the trace plus one.
# NOTE(review): this assumes the tid values of every uid start from 0 and are
# contiguous - confirm against the cell that assigns them.
stats_trajs_uid = traj_df.groupby('uid', observed=True)['tid'].max().add(1)

# Summary figures, followed by the per-trace counts in ascending order.
print(f"Number of traces: {stats_trajs_uid.count()}")
print(f"Average number of segments per trace: {stats_trajs_uid.mean()}")
print(f"Maximum number of segments of a trace: {stats_trajs_uid.max()}")
print(f"Statistics about the distribution segments: {stats_trajs_uid.describe()}")
display(stats_trajs_uid.sort_values())

In [None]:
# Find the time intervals spanned by single traces: for every uid, the
# difference between its latest and its earliest timestamp.
min_time_user = traj_df.groupby('uid', observed = True)['time'].min()
max_time_user = traj_df.groupby('uid', observed = True)['time'].max()

# sort_values(inplace=True) returns None, so passing its result to display()
# would render "None" instead of the data. Rebind the sorted series instead:
# span_time_user still ends up sorted in ascending order, and the sorted
# values are what gets displayed.
span_time_user = (max_time_user - min_time_user).sort_values()
display(span_time_user)
display(span_time_user.describe())

In [None]:
# Find the time intervals spanned by single segments: for every (uid, tid)
# pair, the difference between its latest and its earliest timestamp.
min_time_traj = traj_df.groupby(['uid', 'tid'], observed = True)['time'].min()
max_time_traj = traj_df.groupby(['uid', 'tid'], observed = True)['time'].max()

# sort_values(inplace=True) returns None, so passing its result to display()
# would render "None" instead of the data. Rebind the sorted series instead:
# span_time_traj still ends up sorted in ascending order, and the sorted
# values are what gets displayed.
span_time_traj = (max_time_traj - min_time_traj).sort_values()
display(span_time_traj)
display(span_time_traj.describe())

In [None]:
# Plot the distribution of the time intervals spanned by single segments.
# TODO.