In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [None]:
# Input/output locations used by the rest of the notebook.
# Source: Parquet file holding the trajectory dataset to be labeled.
path_traj_dataset = "./dataset_simulator_trajectories.parquet"
# Destination: Parquet file that receives the dataset once labels are attached.
out_synth_dataset = "./dataset_simulator_trajectories_fair.parquet"

#### Read the original trajectory dataset

In [None]:
# Load the trajectory dataset from the Parquet file at `path_traj_dataset` with GeoPandas,
# then print a summary of the loaded frame (columns, dtypes, non-null counts, memory usage)
# as a quick sanity check before labels are generated.
data = gpd.read_parquet(path_traj_dataset)
data.info()

#### Simulate a fair trajectory classifier

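We assume that the classifier's output follows an underlying Bernoulli distribution: each ID independently receives the positive label with probability `frac_pos`, irrespective of its trajectories.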

In [None]:
# Collect the distinct IDs found in the 'ID' column. One label is drawn per ID (not per
# row), so every row sharing an ID ends up with the same label.
list_ids = data['ID'].unique()

# Parameters of the simulated classifier.
frac_pos = 0.6  # probability that an ID is assigned the positive label (1)
seed = 42       # fixed seed so that re-running the notebook reproduces the same labels

# Draw one Bernoulli(frac_pos) sample in {0, 1} per ID. A dedicated, seeded generator is
# used instead of the global NumPy random state, so the outcome does not depend on
# whatever else has consumed random numbers in this kernel.
rng = np.random.default_rng(seed)
labels = rng.binomial(n=1, p=frac_pos, size=len(list_ids))

# Build a mapping dict ID -> label.
id_to_label = dict(zip(list_ids, labels))

# Attach to every row the label that was drawn for its ID.
data['label'] = data['ID'].map(id_to_label)

display(data)

### Write the dataset to disk

In [None]:
# Write the labeled dataset to a Parquet file at `out_synth_dataset`. This concludes the
# preparation of a synthetic dataset of labeled trajectories for an auditing algorithm that
# evaluates the spatial fairness of trajectory classifiers.
data.to_parquet(out_synth_dataset)