In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

from shapely.geometry import LineString

In [None]:
# Notebook configuration: file-system locations used by the cells below.

# Input: GeoParquet file holding the original (point-level) trajectory dataset.
path_traj_dataset = "./paris traj/paris_centre.final.processed.parquet"

# Output: pickle file intended for the synthetic labeled trajectory dataset.
out_synth_dataset = "./synth_traj_dataset.pkl"

#### Read the original trajectory dataset

In [None]:
# Load the original trajectory dataset from the Parquet file into a GeoDataFrame,
# then print a summary of its columns, dtypes and memory usage.
data = gpd.read_parquet(path = path_traj_dataset)
data.info()

#### Group by the trajectories

In [None]:
# Step 1 - collect, for every (user, traj_id) pair, the list of its point geometries.
# The rows are first sorted by user, trajectory and time, so each list ends up in temporal order.
# NOTE: the sort is done in place, i.e., `data` itself is reordered by this cell.
data.sort_values(by = ['user', 'traj_id', 'time'], inplace = True)
points_per_traj = data.groupby(['user', 'traj_id'], observed = True)['geometry'].apply(list)
grouped = points_per_traj.reset_index()
display(grouped)

# Step 2 - keep only the trajectories made of at least 2 points (a LineString cannot be built from
# fewer), then discard the identifier columns and the helper column holding the point count.
grouped['size'] = grouped['geometry'].str.len() # .str.len() also works on lists: number of points per trajectory.
has_enough_points = grouped['size'] > 1
grouped = grouped.loc[has_enough_points].drop(columns = ['user', 'traj_id', 'size'])
display(grouped)

# Step 3 - replace every list of points with the corresponding LineString, and wrap the result in a
# GeoDataFrame that reuses the CRS of the original dataset and has a fresh 0..n-1 index.
grouped['geometry'] = grouped['geometry'].apply(LineString)
grouped = gpd.GeoDataFrame(grouped, crs = data.crs).reset_index(drop = True)
display(grouped)

#### Simulate a fair trajectory classifier

We assume that the labels are drawn independently from an underlying Bernoulli distribution, i.e., every trajectory receives a positive label with the same probability.

In [None]:
# Associate a synthetic binary label (0 or 1) with every trajectory. Each label is an independent
# Bernoulli draw (binomial with n=1) whose success probability is `frac_pos`; the draw only depends
# on the number of trajectories, not on their geometries.
frac_pos = 0.6 # Probability of assigning the positive label (1) to a trajectory.
seed = 42      # Fixed seed: re-running the notebook from a fresh kernel regenerates the same labels.

# Use a dedicated, seeded generator rather than NumPy's global (unseeded) random state.
rng = np.random.default_rng(seed)
grouped['label'] = rng.binomial(n = 1, p = frac_pos, size = len(grouped))
display(grouped)

# Sanity check: the empirical fraction of positive labels should be close to `frac_pos`.
print(f"Fraction of positive labels: {grouped['label'].mean():.3f}")

### Make a specific spatial region spatially unfair

**TODO.**

Here, the idea is to partition the space with a uniform grid, select a specific cell, and relabel the trajectories traversing that cell. This simulates a trajectory classifier that was unfair w.r.t. the trajectories traversing that cell.

### Write the dataset to disk

**NOTE**: we use the pickle format as Parquet cannot handle LineStrings.

In [None]:
# Persist the labeled trajectories as a pickle file. This concludes the preparation of a synthetic
# dataset of labeled trajectories, meant for an auditing algorithm that evaluates the spatial
# fairness of trajectory classifiers.
# NOTE(review): the output path is hard-coded here and differs from the `out_synth_dataset`
# variable defined in the configuration cell - confirm which of the two is intended.
grouped.to_pickle(path = 'synth_traj_dataset_fair.pkl')