In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import datetime

import warnings
# Install a blanket "ignore" filter: from here on no warning of any category is shown.
warnings.simplefilter("ignore")

#### Set up global variables

In [None]:
# Paths of the available trajectory datasets.
fname_paris = "./paris_trajectories/paris_centre.processed.parquet"
fname_nyc = "./nyc_trajectories/nyc.processed.parquet"

# Dataset analysed in this run; assign `fname_nyc` instead to switch input.
fname = fname_paris

# Load the selected parquet file through geopandas.
gdf = gpd.read_parquet(fname)

# Analysis part

In [None]:
# Report how many distinct values the 'user' column holds (one per trajectory),
# then print the frame's structural summary.
n_trajectories = gdf['user'].nunique()
print(f"Number of unique trajectories: {n_trajectories}")
gdf.info()

#### Analyses on the general characteristics of the trajectories

In [None]:
# Group the observations by the 'user' column (one group per trajectory).
gb = gdf.groupby('user')

# Per-group statistics: latest and earliest 'time', plus the number of rows.
test = gb.agg({'time': ['max', 'min'], 'user': 'count'})

# Flatten the two-level column index into 'time_max', 'time_min', 'user_count'.
test.columns = ['_'.join(col) for col in test.columns.values]
test.info()

# Give the row count a descriptive name. The aggregation above never produces a
# 'user_first' column, so no mapping for it is needed; the user id is the index.
test = test.rename(columns={'user_count': 'num_obs'})

# Time interval covered by each trajectory.
test['time_span'] = test['time_max'] - test['time_min']

# Average time between observations, approximated as span / number of observations.
# NOTE(review): n observations delimit n - 1 intervals, so this slightly
# underestimates the true mean gap — confirm whether that is intended.
test['update_rate'] = test['time_span'] / test['num_obs']

display(test)

##### Preliminary analysis on the duration of trajectories

In [None]:
# Duration thresholds used to bucket and filter the trajectories.
day = datetime.timedelta(hours=24)
week = 7 * day
month = 4 * week  # 28 days
lb = datetime.timedelta(seconds=600)  # 10 minutes
min_sampling_rate = datetime.timedelta(seconds=120)  # 2 minutes

In [None]:
# Selection criteria, computed once and reused for both the report and the mask.
long_enough = test['time_span'] >= lb
dense_enough = test['update_rate'] <= min_sampling_rate

# Threshold labels use total_seconds(): timedelta.seconds only holds the sub-day
# remainder and would print a wrong value for thresholds of one day or more.
print(f"Number of trajectories with duration >= {lb.total_seconds()/60} min: {long_enough.sum()}")
print(f"Number of trajectories with duration >= 1 day: {(test['time_span'] >= day).sum()}")
print(f"Number of trajectories with duration >= 7 days: {(test['time_span'] >= week).sum()}")
print(f"Number of trajectories with duration >= 28 days: {(test['time_span'] >= month).sum()}")
print(f"Number of trajectories with average update rate <= {min_sampling_rate.total_seconds()/60} min: {dense_enough.sum()}")

# Mask used to select the trajectories satisfying the chosen criteria:
# duration of at least `lb` and average update rate of at most `min_sampling_rate`.
mask = long_enough & dense_enough

final_dataset = test.loc[mask]

### Print some statistics about the final dataset.

In [None]:
# Compare the number of trajectories before and after filtering.
print(f"Initial number of trajectories: {test.shape[0]}")
print(f"Final number of trajectories: {final_dataset.shape[0]}")

# Duration buckets of the filtered dataset. The threshold label uses
# total_seconds(): timedelta.seconds only holds the sub-day remainder and would
# print a wrong value for thresholds of one day or more.
final_span = final_dataset['time_span']
print(f"Number of trajectories with duration >= {lb.total_seconds()/60} min: {(final_span >= lb).sum()}")
print(f"Number of trajectories with duration >= 1 day: {(final_span >= day).sum()}")
print(f"Number of trajectories with duration >= 1 week: {(final_span >= week).sum()}")

### Save the preprocessed trajectory dataset

In [None]:
# Keep only the observations whose 'user' value is in the index of the
# filtered per-trajectory table.
is_selected_user = gdf['user'].isin(final_dataset.index)
final_gdf = gdf.loc[is_selected_user]

# The output path is the input path with a suffix appended.
# NOTE(review): `fname` already ends in '.parquet', so the resulting name carries
# that extension twice — confirm downstream readers expect this.
output_path = f'{fname}.preprocessed.parquet'
final_gdf.to_parquet(output_path)