In [None]:
import pandas as pd
import numpy as np

In [None]:
# Locations of the cleaned trajectory tables, one parquet file per city.
path_paris = './paris/traj_cleaned.parquet'
path_nyc = './nyc/traj_cleaned.parquet'

# Read both tables (Paris first, then New York City).
paris_trajs, nyc_trajs = (
    pd.read_parquet(parquet_path) for parquet_path in (path_paris, path_nyc)
)

In [None]:
# Per-trajectory time span for Paris: earliest and latest `datetime` of every
# (uid, tid) group. The aggregation yields two-level column labels,
# ('datetime', 'min') and ('datetime', 'max').
paris_by_traj = paris_trajs.groupby(['uid', 'tid'])
duration_trajs_paris = paris_by_traj.agg({'datetime': ['min', 'max']})

# Duration = last timestamp - first timestamp, kept both as the raw
# difference and converted to minutes.
paris_first_seen = duration_trajs_paris[('datetime', 'min')]
paris_last_seen = duration_trajs_paris[('datetime', 'max')]
duration_trajs_paris['duration'] = paris_last_seen - paris_first_seen
duration_trajs_paris['duration_mins'] = (
    duration_trajs_paris['duration'].dt.total_seconds() / 60
)

duration_trajs_paris

In [None]:
# Per-trajectory time span for New York City: earliest and latest `datetime`
# of every (uid, tid) group. The aggregation yields two-level column labels,
# ('datetime', 'min') and ('datetime', 'max').
nyc_by_traj = nyc_trajs.groupby(['uid', 'tid'])
duration_trajs_nyc = nyc_by_traj.agg({'datetime': ['min', 'max']})

# Duration = last timestamp - first timestamp, kept both as the raw
# difference and converted to minutes.
nyc_first_seen = duration_trajs_nyc[('datetime', 'min')]
nyc_last_seen = duration_trajs_nyc[('datetime', 'max')]
duration_trajs_nyc['duration'] = nyc_last_seen - nyc_first_seen
duration_trajs_nyc['duration_mins'] = (
    duration_trajs_nyc['duration'].dt.total_seconds() / 60
)

duration_trajs_nyc

In [None]:
import matplotlib.pyplot as plt

# Overlaid histograms of trajectory duration (in minutes) for Paris and
# New York City, drawn on shared bins with a log-scaled y-axis and saved
# to 'trajectory_duration_histogram.pdf'.

# Text sizes are configured before the axes are created so every label picks
# them up. NOTE: this mutates the global rcParams and therefore also affects
# any figure drawn after this cell.
plt.rcParams.update({
    'font.size': 12,        # Base font size.
    'axes.titlesize': 14,   # Title font size.
})

# Reference durations, in minutes, marked with vertical lines.
MINUTES_PER_DAY = 24 * 60
MINUTES_PER_WEEK = 7 * MINUTES_PER_DAY

# Missing durations cannot be binned; dropping them keeps the 1/N
# normalisation below consistent with what is actually drawn.
paris = duration_trajs_paris['duration_mins'].dropna()
nyc = duration_trajs_nyc['duration_mins'].dropna()

if paris.empty or nyc.empty:
    raise ValueError('Both cities need at least one trajectory duration to plot.')

# Common [min, max] interval so both cities are binned identically.
min_d = min(paris.min(), nyc.min())
max_d = max(paris.max(), nyc.max())

# Equal-width bins spanning the full common range.
bins = 30
bin_edges = np.linspace(min_d, max_d, bins + 1)

fig, ax = plt.subplots(figsize=(8, 4))

# Each trajectory is weighted by 1/N so that a bar's height is the fraction
# of that city's trajectories falling in the bin, which is what the y-axis
# label states. (`density=True` would instead divide by the bin width and
# plot a probability density in 1/minutes.)
paris_weights = np.full(len(paris), 1.0 / len(paris))
nyc_weights = np.full(len(nyc), 1.0 / len(nyc))

# Paris histogram: solid fill.
ax.hist(paris,
        bins=bin_edges,
        weights=paris_weights,
        alpha=1,
        label='Paris',
        color='orange')

# NYC histogram: semi-transparent white fill with a black hatched outline so
# it stays distinguishable when drawn on top of the Paris bars.
ax.hist(nyc,
        bins=bin_edges,
        weights=nyc_weights,
        alpha=0.3,
        label='New York City',
        color='white',
        edgecolor='black',
        hatch='xxx')

ax.axvline(MINUTES_PER_DAY,
           color='blue',
           linestyle='--',
           linewidth=1.5,
           label='1 day duration')

ax.axvline(MINUTES_PER_WEEK,
           color='blue',
           linestyle=':',
           linewidth=1.5,
           label='1 week duration')

# Log-scale on the y-axis; the x-axis starts at zero minutes.
ax.set_yscale('log')
ax.set_xlim(left=0)

ax.set_xlabel('Trajectory duration (minutes)')
ax.set_ylabel('Fraction of trajectories (log scale)')
ax.set_title(f'Trajectory duration distribution (#bins = {bins})')
ax.legend()

fig.savefig('trajectory_duration_histogram.pdf',
            format='pdf',
            bbox_inches='tight')  # trims extra whitespace