In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import datetime

import skmob

import warnings
warnings.simplefilter("ignore")

#### Setup global vars

In [None]:
# Path of the input parquet file that the analysis below reads.
fname = "rome.gpx.parquet"

# Analysis part

In [None]:
# Load the input parquet file named by `fname` into a GeoDataFrame.
gdf = gpd.read_parquet(path=fname)

In [None]:
# Report how many distinct values each identifier column holds, then show the
# frame's schema. The two counts used to be printed under the same label, which
# made the output ambiguous; each label now names the column it refers to.
# The rest of the notebook groups rows by 'id' and treats each group as one
# trajectory, so 'id' is the trajectory identifier.
print(f"Number of unique track_fid values: {gdf['track_fid'].nunique()}")
print(f"Number of unique trajectories (id): {gdf['id'].nunique()}")
gdf.info()

#### Preparazione dataframe scikit-mobility

In [None]:
# Build the scikit-mobility trajectory frame from the GeoDataFrame, telling
# TrajDataFrame which source column plays each role.
tdf = skmob.TrajDataFrame(
    gdf,
    latitude='lat',
    longitude='long',
    datetime='time',
    user_id='id',
)
tdf.head()

In [None]:
# Compute the distance travelled by each trajectory and expose the result
# keyed by 'id' (scikit-mobility reports the key column as 'uid').
from skmob.measures.individual import distance_straight_line
md_df = distance_straight_line(tdf).rename(columns={'uid': 'id'})

# tdf.plot_trajectory(zoom=12, weight=3, opacity=0.9, tiles='Stamen Toner')

#### Analyses on the general characteristics of the trajectories

In [None]:
# One summary row per 'id': last/first timestamp, number of rows and the first
# 'track_fid' value seen in the group.
gb = gdf.groupby('id')
test = gb.agg({'time' : ['max', 'min'], 'id' : 'count', 'track_fid' : 'first'})

# Flatten the two-level column labels into single names such as 'time_max'.
test.columns = [f"{column}_{stat}" for column, stat in test.columns.values]
test.info()

test = (
    test
    .rename(columns={'id_count': 'num_obs', 'track_fid_first': 'user_id'})
    # Time interval covered by each trajectory.
    .assign(time_span=lambda frame: frame['time_max'] - frame['time_min'])
    # Average time per observation (span divided by the number of observations).
    # NOTE(review): n observations delimit n - 1 gaps, so dividing by num_obs
    # slightly underestimates the mean gap between fixes — confirm this is intended.
    .assign(update_rate=lambda frame: frame['time_span'] / frame['num_obs'])
    # Smallest update rate first.
    .sort_values(by='update_rate', ascending=True)
)

display(test.head(30))

In [None]:
# Attach the per-trajectory straight-line distance to the summary table.
#
# This cell is written so that it can be re-run safely: previously a second run
# reset the index again (adding a spurious 'index' column) and re-merged md_df,
# which turned 'distance_straight_line' into '_x'/'_y' suffixed duplicates and
# broke the cells that follow.
if test.index.name == 'id':
    # First run: 'id' is still the groupby index; turn it into a regular column.
    test = test.reset_index()

# Drop any columns left over from a previous merge before merging again.
stale_columns = md_df.columns.difference(['id'])
test = test.drop(columns=stale_columns, errors='ignore').merge(md_df, on='id', how='left')

In [None]:
# Average speed: straight-line distance divided by the trajectory duration
# expressed in hours. A zero-length time span yields inf/NaN rather than an error.
elapsed_hours = test['time_span'] / pd.Timedelta(hours=1)
test['speed'] = test['distance_straight_line'].div(elapsed_hours)

##### Preliminary analysis on the duration of trajectories

In [None]:
# Thresholds used by the summary counts below.
day = datetime.timedelta(days=1)
lb = datetime.timedelta(minutes=30)
one_minute = datetime.timedelta(seconds=60)

span = test['time_span']
rate = test['update_rate']

# Each count is the number of rows satisfying the corresponding boolean condition.
print(f"Numero totale traiettorie: {len(test)}")
print(f"Numero traiettorie con durata [30 min, 1 day]: {span.between(lb, day).sum()}")
print(f"Numero traiettorie con durata > 1 day: {(span > day).sum()}")
print(f"Numero traiettorie con update rate medio <= 60 sec: {(rate <= one_minute).sum()}")
print(f"Numero traiettorie con piu' di 20 osservazioni: {(test['num_obs'] > 20).sum()}")

# Final selection: duration between 8 and 24 hours (both inclusive) and an
# average update rate of at most 60 seconds.
mask = span.between(datetime.timedelta(hours=8), datetime.timedelta(hours=24)) & (rate <= one_minute)
final_dataset = test[mask]
print(f"Traiettorie finali considerate: {final_dataset.shape[0]}")
display(final_dataset)

**REMARKS:** 
- There are a few very large trajectories that appear to contain distinct sub-trajectories. We could try MovingPandas' trajectory splitter to extract these.
- The vast majority of the trajectories have a duration of less than 30 minutes.

# Calcola e visualizza ulteriori statistiche con MovingPandas + Folium

#### Plotting start locations

#### Plotting end locations

#### Plot singola traiettoria

In [None]:
# MovingPandas code (currently disabled): inspect and plot a single trajectory.
# NOTE(review): `traj_collection` is not defined in the visible part of this
# notebook — confirm where it comes from before re-enabling these lines.
#my_traj = traj_collection.trajectories[55]
#print(my_traj.df)

# my_traj.plot(column="speed", linewidth=5, capstyle='round', legend=True)
#my_traj.hvplot(geo=True, tiles='OSM', line_width=4, frame_width=1000, frame_height=600)

#### Salvataggio set finale traiettorie a disco. 

In [None]:
# Disabled: plot the single trajectory with id 24637 and write only its rows
# to 'trajectory.parquet'.
# tdf[tdf["uid"] == 24637].plot_trajectory(zoom=15, weight=3, opacity=0.9, tiles='Stamen Toner')
# gdf.loc[gdf["id"] == 24637].to_parquet('trajectory.parquet')

In [None]:
# Keep only the raw rows whose 'id' belongs to the selected trajectories and
# write them to disk.
selected_ids = final_dataset['id']
out = gdf[gdf['id'].isin(selected_ids)]
out.to_parquet('trajectory.parquet')