# Single GPX file to Geopandas translation

In this notebook, we translate the trajectories and associated metadata contained in a single GPX file, such as those downloaded with JOSM, into a GeoPandas GeoDataFrame.

In [None]:
import geopandas as gpd
import pandas as pd
from datetime import date
import numpy as np
import os

#### Aux functions

In [None]:
def read_metadata_gpx(fname_gpx : str) -> gpd.GeoDataFrame :
    '''
    Load the per-trajectory metadata stored in the 'tracks' layer of the 'fname_gpx' GPX file.

    Only the 'name', 'desc', 'link1_href' and 'geometry' columns are kept in the returned frame.

    NOTE: the values of the index are meant to correspond to the 'track_fid' values of the track points,
          and are used later on to merge the metadata into them.
    NOTE 2: "on_invalid='ignore'" is passed so that trajectories with less than 2 points do not raise
            an exception while the file is being read.
    '''

    kept_columns = ['name', 'desc', 'link1_href', 'geometry']
    tracks = gpd.read_file(fname_gpx, layer = 'tracks', on_invalid = 'ignore')
    return tracks.loc[:, kept_columns]

In [None]:
def gpx_to_gdf(fname_gpx : str, step : int = 2_000_000) -> gpd.GeoDataFrame :
    '''
    Read the actual trajectories (the 'track_points' layer) from the 'fname_gpx' GPX file.

    The points are read in consecutive blocks of 'step' rows until a block comes back empty. For each block
    only the 'track_fid', 'time' and 'geometry' columns are kept; the blocks are then concatenated.

    Parameters:
        fname_gpx: path of the GPX file to read.
        step: number of rows requested per read. Must be a positive integer.

    Returns:
        A GeoDataFrame with the columns 'track_fid', 'time' and 'geometry', a fresh 0-based index, and the
        CRS of the first block that was read. A summary of the frame is printed via '.info()'.

    Raises:
        ValueError: if 'step' is not positive, or if no track points could be read from the file.
    '''

    if step <= 0 :
        raise ValueError(f"'step' must be a positive number of rows, got {step}")

    columns_of_interest = ['track_fid', 'time', 'geometry']
    blocks = []
    start = 0

    # Keep reading until an empty block is returned, instead of stopping at a hard-coded maximum number
    # of points (which would silently truncate very large files).
    while True :
        stop = start + step
        print(f"Processing trajectories in the block of points [{start} -- {stop})")

        gdf = gpd.read_file(fname_gpx, layer = 'track_points', on_invalid = 'ignore', rows = slice(start, stop))

        # Early exit: when there are no more rows to read, terminate the loop.
        if len(gdf) == 0 :
            print("No more points to process. Exiting the loop...")
            break

        # Select the columns of interest (trajectory identifier within a GPX, timestamp, coordinates).
        # '.loc' with a list of columns already returns a new object, so no extra deep copy is made.
        blocks.append(gdf.loc[:, columns_of_interest])
        start = stop

    # Fail with an explicit message rather than with the generic error pd.concat raises on an empty list.
    if not blocks :
        raise ValueError(f"No track points could be read from '{fname_gpx}'")

    # Concatenate the blocks read above, preserving the CRS of the first one.
    final = gpd.GeoDataFrame(pd.concat(blocks, ignore_index = True), crs = blocks[0].crs)
    final.info()
    return final

In [None]:
def process_dataset(metadata_df : gpd.GeoDataFrame, trajs_df : gpd.GeoDataFrame) -> gpd.GeoDataFrame :
    '''
    Attach a file-independent trajectory identifier ('uid') to the track points and clean them up.

    Steps, in order:
      1. merge the 'link1_href' column of 'metadata_df' into 'trajs_df', matching 'track_fid' against the
         index of 'metadata_df', and rename it to 'uid';
      2. turn 'uid' into a categorical column and drop 'track_fid';
      3. drop the rows without a 'uid';
      4. drop the rows without a 'time', then convert 'time' with pd.to_datetime;
      5. drop the duplicated rows and renumber the index.

    A summary of the frame is printed (via '.info()') after each step. The input frames are not modified:
    all the operations are applied to the frame produced by the merge.

    Parameters:
        metadata_df: per-trajectory metadata; must contain 'link1_href' and be indexed by 'track_fid' values.
        trajs_df: track points; must contain the 'track_fid' and 'time' columns.

    Returns:
        The cleaned frame, with 'uid' in place of 'track_fid'.
    '''

    # Associate a true unique identifier with trajectories.
    # 'track_fid' represents the identifier of a trajectory within a GPX file. If, however, a trajectory is
    # split across multiple GPXs, we cannot use it to reconstruct the trajectory. To solve the problem, we use
    # the information from the 'link' element in a GPX file: this is the combination of a user ID AND an ID
    # that OSM associates with a trace. This information is available from the metadata, so we merge it in.
    # NOTE: merge() performs an inner join by default, so points whose 'track_fid' is not in the metadata
    #       index are discarded here.
    selection = (
        trajs_df
        .merge(metadata_df['link1_href'], left_on = 'track_fid', right_index = True)
        .rename(columns = {'link1_href' : 'uid'})
    )
    # '.info()' prints its summary and returns None, so it is called directly: wrapping it in display()
    # would only add a spurious 'None' to the output.
    selection.info()

    # Turn the 'uid' column into categorical, thereby compressing the trajectory identifiers.
    # Drop also the 'track_fid' column, which was only required to merge the metadata.
    selection = (
        selection
        .assign(uid = lambda df : df['uid'].astype('category'))
        .drop(columns = 'track_fid')
    )
    selection.info()

    # Drop the rows of trajectories for which it is not possible to understand the user behind them.
    selection = selection.dropna(subset = ['uid'])
    selection.info()

    # Drop the rows with missing timestamps, then parse the remaining ones.
    # NOTE: no range check (e.g., timestamps before 1990 or in the future) is applied here.
    # '.assign' works on a copy, which avoids chained-assignment warnings on the frame returned by dropna().
    selection = (
        selection
        .dropna(subset = ['time'])
        .assign(time = lambda df : pd.to_datetime(df['time']))
    )
    selection.info()

    # Remove duplicate rows, i.e., those having same timestamp, geometry, and uid.
    # GPX files downloaded from JOSM have LOTS of duplicated trajectories, so we need to take care of them.
    selection = selection.drop_duplicates(ignore_index = True)
    selection.info()

    return selection

### Main code

#### Read the metadata and the trajectory information within a GPX

In [None]:
# Location and base name of the GPX file to process.
gpx_path = './Traiettorie Parigi'
gpx_name = 'paris_centre'

# The GPX file and the three parquet files derived from it share the same folder and base name,
# and differ only in their suffix.
gpx_filename, meta_geodf_filename, geodf_filename, trajdf_filename = (
    os.path.join(gpx_path, gpx_name + suffix)
    for suffix in ('.gpx', '.meta.parquet', '.parquet', '.processed.parquet')
)

In [None]:
# Extract metadata from a GPX file, or read an existing parquet with such info.
# The parquet file acts as a cache: the GPX is parsed only when the parquet does not exist yet.
if os.path.isfile(meta_geodf_filename) :
    print('Reading metadata from an existing parquet file...')
    meta_geo_df = gpd.read_parquet(meta_geodf_filename)
else :
    print('Parsing metadata from GPX...')
    meta_geo_df = read_metadata_gpx(gpx_filename)
    meta_geo_df.to_parquet(meta_geodf_filename)

# '.info()' prints its summary and returns None, so it is called directly rather than wrapped in print(),
# which would add a stray 'None' line to the output.
meta_geo_df.info()

In [None]:
# Convert a single big GPX to a GeoPandas GeoDataFrame, or read an existing parquet with the trajectories.
# The parquet file acts as a cache: the GPX is parsed only when the parquet does not exist yet.
if os.path.isfile(geodf_filename) :
    print('Reading trajectories from an existing parquet file...')
    geo_df = gpd.read_parquet(geodf_filename)
else :
    print('Parsing trajectories from GPX...')
    geo_df = gpx_to_gdf(gpx_filename)
    geo_df.to_parquet(geodf_filename)

# '.info()' prints its summary and returns None, so it is called directly rather than wrapped in print(),
# which would add a stray 'None' line to the output.
geo_df.info()

In [None]:
# Show the metadata first, then the track points.
display(meta_geo_df, geo_df)

#### Preprocess the metadata and the trajectories

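Associate the metadata with the trajectories, drop the columns that are no longer needed, drop the points of trajectories without a user, drop the points with missing timestamps (the range check for impossible timestamps is currently disabled in the code), and remove duplicated points.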

In [None]:
# Merge the metadata into the track points and clean the result.
final_dataset = process_dataset(metadata_df = meta_geo_df, trajs_df = geo_df)

In [None]:
# Persist the processed GeoDataFrame as a parquet file at 'trajdf_filename'.
print('Writing GeoDataFrame to {}...'.format(trajdf_filename))
final_dataset.to_parquet(path = trajdf_filename)