# GPX to GeoPandas translation

In [None]:
import os
import tempfile
from datetime import date

import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm

#### Aux functions

In [None]:
def correct_GPX_url_tags(fname, fname_corrected) :
    """Copy a GPX file, renaming its <url> elements to <link1_href>.

    The file at 'fname' is read as UTF-8 text, every "<url>" / "</url>" tag is
    replaced by "<link1_href>" / "</link1_href>", and the result is written as
    UTF-8 text to 'fname_corrected' (which is created or overwritten).
    """
    # Opening/closing tag pairs, applied one after the other.
    tag_substitutions = (
        ("<url>", "<link1_href>"),
        ("</url>", "</link1_href>"),
    )

    # Load the whole GPX document as a single string.
    with open(fname, "r", encoding="utf-8") as gpx_in:
        content = gpx_in.read()

    for old_tag, new_tag in tag_substitutions:
        content = content.replace(old_tag, new_tag)

    # Store the rewritten document at the requested destination.
    with open(fname_corrected, "w", encoding="utf-8") as gpx_out:
        gpx_out.write(content)

In [None]:
def list_subdirectories(directory):
    """Return the paths of the directories located directly inside 'directory'.

    Each returned path is 'directory' joined with the entry name; entries that
    are not directories are left out.
    """
    subdirectories = []
    for name in os.listdir(directory):
        candidate = os.path.join(directory, name)
        # Keep only the entries that are themselves directories.
        if os.path.isdir(candidate):
            subdirectories.append(candidate)
    return subdirectories


def list_files(directory):
    """Return the paths of the regular files located directly inside 'directory'.

    Each returned path is 'directory' joined with the entry name; entries that
    are not files (e.g., subdirectories) are left out.
    """
    # Build the full path of every entry, then keep the ones that are files.
    entry_paths = (os.path.join(directory, name) for name in os.listdir(directory))
    return [path for path in entry_paths if os.path.isfile(path)]

In [None]:
def gpx_to_gdf(fname_gpx : str) -> gpd.GeoDataFrame :
    """Load the track points of a GPX file into a GeoDataFrame.

    Parameters
    ----------
    fname_gpx : str
        Path to the GPX file to read. The file itself is not modified: a corrected
        copy is written to a temporary directory, which is removed again even if
        reading fails.

    Returns
    -------
    gpd.GeoDataFrame
        One row per track point, with the columns 'time', 'geometry' and 'track_uid'
        (categorical trajectory identifier taken from the track's <url> tag).
        Rows with a missing timestamp, with a timestamp outside the range
        1990-01-01 < time <= today's date, or without a 'track_uid' are dropped.

    Raises
    ------
    AssertionError
        If the number of tracks in the GPX metadata differs from the number of
        distinct 'track_fid' values found among the track points.
    """

    # 'correct_GPX_url_tags' replaces the <url> tags, which do not belong to the GPX standard but contain the trajectory IDs,
    # with <link1_href>, so that GeoPandas can pick up that information.
    # The corrected copy lives in a private temporary directory: this avoids name clashes between concurrent runs
    # and guarantees the copy is deleted even when one of the reads below raises.
    with tempfile.TemporaryDirectory() as tmp_dir :
        fname_temporary_corrected = os.path.join(tmp_dir, 'corrected_file.gpx')
        correct_GPX_url_tags(fname_gpx, fname_temporary_corrected)

        # Read the metadata associated with the trajectories in the currently considered GPX.
        # NOTE: the index's values correspond to the 'track_fid' values in the main GeoPandas dataframe below, and will be used
        #       to merge the metadata.
        # NOTE 2: we use "on_invalid='ignore'" so that trajectories with less than 2 points do not raise an exception.
        meta_gdf = gpd.read_file(fname_temporary_corrected, layer = 'tracks', on_invalid='ignore')

        # Read the spatio-temporal information (one row per track point) of the trajectories.
        gdf = gpd.read_file(fname_temporary_corrected, layer = 'track_points', on_invalid='ignore')

    assert meta_gdf.shape[0] == gdf['track_fid'].nunique(), "Error, different number of trajectories detected between GPX metadata and actual data!"

    # Select the columns of interest (trajectory identifier within a GPX, timestamp, coordinates).
    selection = gdf.loc[:, ['track_fid', 'time', "geometry"]]

    # Use a compacter representation for 64-bit integer columns.
    selection['track_fid'] = selection['track_fid'].astype(np.int32)

    # Drop the rows with missing or nonsensical timestamps.
    selection = selection.dropna(subset=['time'])
    plausible_time = (selection['time'] > '1990-01-01') & (selection['time'] <= str(date.today()))
    # Copy so that the column assignment below operates on an independent frame rather than on a filtered slice.
    selection = selection.loc[plausible_time].copy()
    selection['time'] = pd.to_datetime(selection['time'])

    # Associate a true unique identifier with trajectories.
    # 'track_fid' represents the identifier of a trajectory within a GPX file. If, however, a trajectory is split across multiple GPXs,
    # we cannot use it to reconstruct the trajectory. To solve the problem, we use the information from the 'link' element in a GPX file:
    # this is the combination of a user ID AND a ID that OSM associates with a trace.
    # This information is available from meta_gdf, so we perform a merge to put it into selection.
    selection = selection.merge(meta_gdf['link1_href'], left_on = 'track_fid', right_index = True)
    selection = selection.rename(columns={'link1_href':'track_uid'})

    # Drop the track_fid column, which was only required to merge the metadata from 'meta_gdf'.
    selection = selection.drop(columns='track_fid')

    # Drop the rows that have NaN values in the 'track_uid' column -- there are a few trajectories that do not have a 'username + id_traj'.
    selection = selection.dropna(subset=['track_uid'])

    # Finally, turn the 'track_uid' column into categorical, thereby compressing the trajectory identifiers.
    selection['track_uid'] = selection['track_uid'].astype('category')

    return selection

#### Main code

In [None]:
# Root directory of the GPX traces: each of its subdirectories holds the GPX files of one bbox,
# and the parquet file generated for each bbox is written directly into this directory.
bbox_path = './gpx_traces/'

In [None]:
# Convert every bbox directory found under 'bbox_path' into one parquet file of trajectories.
list_bbox_dirs = list_subdirectories(bbox_path)
for bbox_dir in list_bbox_dirs :

    # Path to the parquet file that will store the trajectories associated with this bbox:
    # it is named after the bbox directory and placed directly inside 'bbox_path'.
    # Built with os.path so that it does not depend on 'bbox_path' ending with a separator.
    outfile_path = os.path.join(bbox_path, os.path.basename(bbox_dir) + '.parquet')

    # Check if this bbox's parquet has already been generated. If so, skip to the next bbox.
    if os.path.isfile(outfile_path) :
        print(f'{outfile_path} already exist, hence skipping.')
        continue

    # A bbox directory without files has nothing to convert; concatenating an empty list
    # of dataframes below would raise, so skip it explicitly.
    list_bbox_files = list_files(bbox_dir)
    if not list_bbox_files :
        print(f'No files found in {bbox_dir}, hence skipping.')
        continue

    # Process the GPXs within bbox_dir...
    list_gdfs = [
        gpx_to_gdf(fname)
        for fname in tqdm(list_bbox_files, desc=f"Processing files in {bbox_dir}")
    ]

    # Concatenate the geodataframes related to the current bbox...
    print('Preparing the final concatenated dataframe...')
    combined = gpd.GeoDataFrame(pd.concat(list_gdfs, ignore_index=True), crs=list_gdfs[0].crs)
    # Ensure that the 'track_uid' is categorical.
    combined['track_uid'] = combined['track_uid'].astype('category')
    # Finally, remove the small fraction of duplicate rows (i.e., those having same timestamp, location, and ID)
    # that might be present in the final Dataframe.
    combined.drop_duplicates(ignore_index = True, inplace = True)

    # Write the GeoDataFrame into a parquet.
    print(f'Writing GeoDataFrame to {outfile_path}...')
    combined.to_parquet(outfile_path)