In [None]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import numpy as np
import glob

In [None]:
# 1 - Build the ordered list of TSV logs that will be converted to a dataframe.
path_logs = './data_simulator/'


def _agent_table_index(filename):
    """Return the integer found between 'AgentStateTable-' and '.tsv' in `filename`."""
    after_prefix = filename.split("AgentStateTable-")[1]
    return int(after_prefix.split(".tsv")[0])


# Order the logs by their numeric index (an integer comparison, not a string one).
files = glob.glob(f"{path_logs}AgentStateTable-*.tsv")
files.sort(key=_agent_table_index)
files

In [None]:
from joblib import Parallel, delayed

# 2 - Parse the TSVs: retrieve the positions of the agents. (PARALLEL VERSION)
def process_file(f):
    """Load the agent positions stored in one tab-separated log file.

    Only the columns at positions 1, 2 and 3 of the file are read; they are
    renamed to ``timestamp``, ``geometry`` and ``ID`` respectively.

    Parameters
    ----------
    f : path of the TSV file to parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` (converted with ``pd.to_datetime``), ``geometry``
        (each WKT string converted with ``wkt.loads``) and ``ID`` (``np.uint32``).
    """
    column_names = ["timestamp", "geometry", "ID"]
    # The dtype keys are positions in the file: 2 is read as text, 3 as uint32.
    column_dtypes = {2: str, 3: np.uint32}

    positions = pd.read_csv(f, sep="\t", usecols=[1, 2, 3], dtype=column_dtypes)
    positions.columns = column_names

    parsed_times = pd.to_datetime(positions['timestamp'])
    parsed_geometries = positions['geometry'].apply(wkt.loads)
    positions['timestamp'] = parsed_times
    positions['geometry'] = parsed_geometries
    return positions

# Fail early with an explicit message when there is nothing to parse: otherwise the
# pipeline only breaks at pd.concat with the opaque "No objects to concatenate".
if not files:
    raise ValueError("No AgentStateTable-*.tsv files to process: the list `files` is empty.")

# Parse the TSVs in parallel: one joblib task per file, n_jobs=-1 requests all available CPUs.
print('Parallel processing the TSVs...')
list_df = Parallel(n_jobs=-1, verbose=10)(delayed(process_file)(f) for f in files)

# Concatenate the per-file dataframes into a single one; ignore_index=True renumbers the rows.
print('Concatenating the various geodataframes...')
df = pd.concat(list_df, ignore_index=True)

In [None]:
# Create a GeoDataFrame with the CRS initially set to the one used by the authors of the simulator "Patterns of Life"
# for all the maps, i.e., EPSG:26916.
print('Creating Geopandas dataframe...')
original_crs = "EPSG:26916"
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs=original_crs)
# Only `gdf` is used from here on. NOTE: because `df` is deleted, re-running this cell alone
# raises NameError; re-run the parsing cell first.
del df

# Project the coordinates to WGS84 (EPSG:4326), in place.
print('Reprojecting coordinates to WGS84...')
gdf.to_crs(epsg=4326, inplace=True)
display(gdf)
# DataFrame.info() prints its summary and returns None, so it is called directly:
# wrapping it in display() would additionally render a stray `None`.
gdf.info()

# Now store the longitude (x) and latitude (y) of each location in the appropriate columns,
# then drop the geometry column.
gdf['lon'] = gdf.geometry.x
gdf['lat'] = gdf.geometry.y
# NOTE(review): `gdf` is still a GeoDataFrame object after this, just without a geometry
# column - confirm the parquet written below is read back as expected.
del gdf['geometry']
display(gdf)
gdf.info()

# Write the final dataframe to disk.
print('Storing the uncompressed trajectories to disk...')
gdf.to_parquet('./dataset_simulator_trajectories.parquet')

In [None]:
import skmob
from skmob.preprocessing import filtering, compression

# Last step: produce a compressed version of the simulated trajectories. The real aim is to shrink the locations
# recorded during the users' stays, which take up the vast majority of the space in the uncompressed dataframe.
#
# How the compression works: every point lying within spatial_radius_km kilometers of a given initial point is
# merged into one point, which gets the median coordinates of all those points and the time of the initial point.

spatial_radius_km = 1/1000

# Tell scikit-mobility which column of `gdf` plays which role.
column_roles = dict(latitude='lat', longitude='lon', datetime='timestamp', user_id='ID')
tdf = skmob.TrajDataFrame(gdf, **column_roles)

print(f'Compressing the trajectories (compression level is {spatial_radius_km*1000} meters)...')
ctdf = compression.compress(tdf, spatial_radius_km=spatial_radius_km)

compressed_path = './dataset_simulator_trajectories.compressed.parquet'
print('Storing the compressed trajectories to disk...')
ctdf.to_parquet(compressed_path)