# GPX to GeoPandas translation

In [None]:
import geopandas as gpd
import pandas as pd
from datetime import date
import numpy as np

In [None]:
# Path of the GPX file to translate. It is read below (layers 'tracks' and
# 'track_points') and the output Parquet file name is derived from it.
fname = 'test.gpx'

In [None]:
# Read the metadata associated with the trajectories in the currently considered GPX.
meta_gdf = gpd.read_file(fname, layer='tracks')
# display(meta_gdf)
display(meta_gdf.info())

# Read the GPX -- do it in blocks of `step` points, so that we can also manage huge GPXs.
# There is no fixed upper bound on the number of points: the loop ends as soon as a
# block comes back empty.
list_gdf = []
step = int(2e6)
i = 0
while True:
    print(f"Processing points in the range: [{i} -- {i + step})")

    # Read the spatio-temporal information of the trajectories from the current block.
    gdf = gpd.read_file(fname, layer='track_points', rows=slice(i, i + step))

    # Early exit: when there are no more rows to read, terminate the loop.
    if len(gdf) == 0:
        print("No more points to process. Exiting the loop...")
        break

    display(gdf)
    display(gdf.info())

    # Select the columns of interest (trajectory identifier, timestamp, coordinates).
    # Take an independent copy, so that the column assignments below act on our own
    # frame rather than on a slice of `gdf` (avoids pandas' SettingWithCopyWarning).
    print("Filtering useless columns...")
    selection = gdf[['track_fid', 'track_seg_id', 'track_seg_point_id', 'time', 'geometry']].copy()

    # Use a more compact representation for the 64-bit integer columns.
    for column in ('track_fid', 'track_seg_id', 'track_seg_point_id'):
        selection[column] = selection[column].astype(np.int32)

    selection.info()

    # Drop the rows with missing or nonsensical timestamps.
    # NOTE(review): the upper bound is today's date without a time component, so points
    # stamped later today appear to be excluded as well -- confirm this is intended.
    print("Filtering rows with missing or wrong timestamps...")
    selection = selection.dropna(subset=['time'])
    in_range = (selection['time'] > '1990-01-01') & (selection['time'] <= str(date.today()))
    selection = selection.loc[in_range].copy()
    selection['time'] = pd.to_datetime(selection['time'])
    selection.info()

    # Associate a true unique identifier with trajectories.
    # 'track_fid' represents the identifier of a trajectory within a GPX file. If, however,
    # a trajectory is split across multiple GPXs, we cannot use it to reconstruct the
    # trajectory. To solve the problem, we use the information from the 'link' element in
    # a GPX file: this is the combination of a user ID and an ID that OSM associates with
    # a trace. This information is available from meta_gdf, so we merge it into selection
    # (matching 'track_fid' against the index of meta_gdf).
    print('Merging meta information with the trajectories...')
    selection = selection.merge(meta_gdf['link1_href'], left_on='track_fid', right_index=True)
    selection = selection.rename(columns={'link1_href': 'track_uid'})
    selection.info()

    # Keep this block. `selection` is rebound to a fresh frame on every iteration, so no
    # extra copy is needed before storing it.
    list_gdf.append(selection)

    i += step

In [None]:
# Show the last block of points that was processed: `selection` is whatever the
# final non-empty iteration of the reading loop left behind.
display(selection)

In [None]:
# Stack the per-block frames into one table with a fresh 0..n-1 index, then wrap the
# result as a GeoDataFrame carrying the CRS of the first block.
final = pd.concat(list_gdf, ignore_index=True)
final = gpd.GeoDataFrame(final, crs=list_gdf[0].crs)
final.info()

# The per-block frames are no longer needed once they have been concatenated.
del list_gdf

In [None]:
# Each 'track_fid' may contain more than one trajectory (identified by 'track_seg_id').
# Therefore, generate a unique ID for every trajectory: build the textual key
# '<track_fid>_<track_seg_id>' and replace it with its integer category code.
trajectory_key = final['track_fid'].astype(str).str.cat(final['track_seg_id'].astype(str), sep='_')
final['id'] = trajectory_key.astype('category').cat.codes
del trajectory_key
final.info()
print(f"Numero traiettorie: {final['id'].nunique()}, tmin: {final['time'].min()}, tmax: {final['time'].max()}")

In [None]:
# Explicitly store latitude and longitude in separate columns (some libraries, e.g., scikit-mobility, may need this)
# final['lat'], final['long'] = final['geometry'].y, final['geometry'].x

In [None]:
# Persist the preprocessed GeoDataFrame next to the input, as '<fname>.parquet'.
final.to_parquet(f"{fname}.parquet")