In [None]:
import geopandas as gpd
import pandas as pd
from datetime import date
import numpy as np
import os

In [None]:
# Input/output locations. Both parquet files live inside gpx_path and share
# the gpx_name stem: the '.processed' file is the input of this notebook and
# the '.final.processed' file is where the segmented result is written.
gpx_path = './Traiettorie Parigi'
gpx_name = 'paris_centre'

traj_filename = os.path.join(gpx_path, f'{gpx_name}.processed.parquet')
final_traj_filename = os.path.join(gpx_path, f'{gpx_name}.final.processed.parquet')

### Segment detection from traces

In [None]:
# Load the pre-processed traces and print the frame summary (columns, dtypes,
# non-null counts) as a quick sanity check before segmenting.
traj_df = gpd.read_parquet(path=traj_filename)
traj_df.info()

In [None]:
# Two consecutive points of the same user that are at least this far apart in
# time are considered to belong to different segments of the trace.
threshold = pd.Timedelta(minutes=20)

# Order every user's points chronologically so that consecutive rows are
# consecutive observations.
traj_df = traj_df.sort_values(by=['user', 'time'])

# A row opens a new segment when the gap from the previous row of the same
# user reaches the threshold. The first row of each user has no predecessor
# (NaT gap), which compares as False, so segment ids start at 0.
is_segment_start = traj_df.groupby('user', observed=True)['time'].diff() >= threshold

# Running count of segment starts within each user = per-user segment id.
segment_ids = (
    is_segment_start
    .groupby(traj_df['user'], observed=True)
    .cumsum()
    .astype(np.int32)
)

# Keep only the columns needed downstream and attach the segment id.
traj_df = traj_df.loc[:, ['time', 'geometry', 'user']].assign(traj_id=segment_ids)
traj_df.info()

In [None]:
# Checkpoint the segmented traces on disk.
traj_df.to_parquet(path=final_traj_filename)
# To resume from this checkpoint without re-running the segmentation above:
# traj_df = gpd.read_parquet(final_traj_filename)

### Compute some basic statistics about users and trajectories (optional)

In [None]:
# Segment ids are zero-based within each user, so the largest id plus one is
# the number of segments of that user's trace.
last_segment_id = traj_df.groupby('user', observed=True)['traj_id'].max()
stats_trajs_uid = last_segment_id + 1

print(f"Number of traces: {stats_trajs_uid.count()}")
print(f"Average number of segments per trace: {stats_trajs_uid.mean()}")
print(f"Maximum number of segments of a trace: {stats_trajs_uid.max()}")
print(f"Statistics about the distribution segments: {stats_trajs_uid.describe()}")

# Per-trace segment counts, smallest first.
display(stats_trajs_uid.sort_values())

In [None]:
# Time interval spanned by each trace: last timestamp minus first timestamp
# of every user.
times_by_user = traj_df.groupby('user', observed = True)['time']
min_time_user = times_by_user.min()
max_time_user = times_by_user.max()

# Series.sort_values(inplace=True) returns None, so passing it straight to
# display() shows nothing useful. Rebind the sorted series and display it.
span_time_user = (max_time_user - min_time_user).sort_values()
display(span_time_user)
display(span_time_user.describe())

In [None]:
# Time interval spanned by each segment: last timestamp minus first timestamp
# of every (user, traj_id) pair.
times_by_segment = traj_df.groupby(['user', 'traj_id'], observed = True)['time']
min_time_traj = times_by_segment.min()
max_time_traj = times_by_segment.max()

# Series.sort_values(inplace=True) returns None, so passing it straight to
# display() shows nothing useful. Rebind the sorted series and display it.
span_time_traj = (max_time_traj - min_time_traj).sort_values()
display(span_time_traj)

display(span_time_traj.describe())
print(f"Number of trajectories that last more than 20 minutes: {(span_time_traj > pd.Timedelta(minutes=20)).sum()}")

In [None]:
# Summarise the sampling rate of the trajectories: for each (user, traj_id)
# segment, the mean time gap between consecutive points.
# The gaps are kept in a standalone Series instead of a temporary column, so
# traj_df is never modified by this cell (not even if it fails halfway).
time_diff = traj_df.groupby(['user', 'traj_id'], observed = True)['time'].diff()
sampling_rate_trajs = (
    time_diff
    .groupby([traj_df['user'], traj_df['traj_id']], observed = True)
    .mean()
    .rename('time_diff')
)
del time_diff

display(sampling_rate_trajs.describe())
# The comparison is inclusive (<=), so the label says so as well.
print(f"Number of trajectories with a sampling rate less than or equal to 10 seconds: {(sampling_rate_trajs <= pd.Timedelta(seconds=10)).sum()}")

In [None]:
# A segment qualifies when it is both long enough and sampled densely enough.
# The two Series share the (user, traj_id) index and are combined label-wise.
lasts_long_enough = span_time_traj >= pd.Timedelta(minutes=20)
sampled_densely = sampling_rate_trajs <= pd.Timedelta(seconds=10)
cond = lasts_long_enough & sampled_densely
print(f"Number of trajectories that last at least 20 mins and have a sampling rate less or equal than 10 sec.: {cond.sum()}")