# Auditing a Trajectory Classifier for Spatial Fairness

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import folium

from src.functions import *
from src.traj_functions import *

## Load the output of a trajectory classifier

Here we load a dataset that should contain the labels given by a classifier to trajectories. In the dataset, we expect to find:

1. the labels given to each trajectory;
2. the sequence of points associated with each trajectory.

In [2]:
# Load the labelled trajectory dataset into a GeoDataFrame and preview its first rows.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so only
#               point dataset_path at files from a trusted source.
dataset_path = './synth_traj_dataset_fair.pkl'
gdf_trajs = gpd.GeoDataFrame(pd.read_pickle(dataset_path))
display(gdf_trajs.head())

# Name of the column containing the labels given to the trajectories.
label = 'label'
# N and P are reported below as the total count and the number of positives.
N, P = get_stats(gdf_trajs, label)

# Per the original author's note, get_true_types remaps labels different from "1" to "0",
# reducing multi-class classification to binary classification (defined in the project
# helpers, not visible here).
true_types = get_true_types(gdf_trajs, label)
# print(true_types[:30])

# Print some general statistics about the labels. Here, the positive label is "1"; all other labels are considered negative.
# NOTE(review): the message says "points", but each row of the preview is one trajectory
#               (a LINESTRING), so N presumably counts trajectories — confirm in get_stats.
print(f'N={N} points, P={P} positives, frac.positives={P/N:.2f}')

Unnamed: 0,geometry,label
0,"LINESTRING (2.52375 48.96159, 2.52416 48.9611,...",0
1,"LINESTRING (2.54396 48.96144, 2.54069 48.9604,...",1
2,"LINESTRING (2.33953 48.87304, 2.33957 48.87295...",1
3,"LINESTRING (2.32876 48.87064, 2.32874 48.87055...",0
4,"LINESTRING (2.33999 48.86894, 2.34002 48.86897...",1


N=2738 points, P=1635 positives, frac.positives=0.60


In [3]:
# NOTE(review): this whole cell is disabled (commented out). It is a point-level folium
#               preview that reads a DataFrame named `df` with 'lat'/'lon' columns and
#               draws one circle per row: green for label 1, red for label 0.
#               No `df` is defined in the visible cells of this notebook (the loaded data
#               is `gdf_trajs`), so the code must be adapted before it can be re-enabled.

# lat_max = df['lat'].values.max()
# lat_min = df['lat'].values.min()
# lon_max = df['lon'].values.max()
# lon_min = df['lon'].values.min()

# mapit = folium.Map(location=[37.09, -95.71], zoom_start=5, tiles="Stamen Toner")

# for index, row in df.iterrows():
#     if row[label] == 1:
#         folium.CircleMarker( location=(row['lat'], row['lon']), color='#00FF00', fill_color='#00FF00', fill=True, opacity=0.4, fill_opacity=0.4, radius=2 ).add_to( mapit )
#     elif row[label] == 0:
#         folium.CircleMarker( location=(row['lat'], row['lon']), color='#FF0000', fill_color='#FF0000', fill=True, opacity=0.4, fill_opacity=0.4, radius=2 ).add_to( mapit )


# mapit.fit_bounds([(lat_min, lon_min), (lat_max, lon_max)])

# mapit

# 2. Run Experiments

There are two experiments:
- (TODO) Unrestricted regions: runs **our approach** on unrestricted regions.
- Regions are cells of a uniform grid: evaluates **our approach** over a single partitioning.


## One Partitioning evaluation

Instead of creating regions from clusters of trajectories/points, here we use a more common approach.
We superimpose a uniform grid over the space: the grid's cells will be the regions.

In [4]:
### Create the partitioning (grid) and its partitions (regions).
# Number of grid slices along longitude and latitude. Presumably this yields
# num_slices_lon * num_slices_lat cells (20 x 20 = 400) — confirm in create_traj_partitioning.
num_slices_lon = 20
num_slices_lat = 20
# `grid_info` and `regions` are consumed by later cells; `grid_loc2_idx` is not used in
# the visible cells. The call also prints four coordinates, which look like the
# lon/lat bounding box of the data — TODO confirm.
grid_info, grid_loc2_idx, regions = create_traj_partitioning(gdf_trajs, num_slices_lon, num_slices_lat)

2.0067567, 48.71535, 2.599733, 48.9782447


### Audit the classifier's output with our approach

In [5]:
# Score every region of the grid against the global counts N and P.
# Going by the names, the call returns the best-scoring region, its test statistic
# (`max_likeli`) and the per-region `statistics` used in the cells below — TODO confirm
# in scan_regions. With verbose=True it presumably prints the range of the statistics
# and the maximum likelihood.
best_region, max_likeli, statistics = scan_regions(regions, true_types, N, P, verbose=True)

range 0.0 5.145969938050712
max likelihood 5.145969938050712


In [None]:
# Determine the significance threshold for a desired significance level (signif_level).
# Example: if we conduct 200 simulations and have a signif_level of 0.005, then we are interested in
#          regions whose test statistic is larger than the top 200*0.005 = 1st test statistic obtained
#          in the simulations.
# NOTE(review): the simulations are presumably randomised and no seed is set in the
#               visible cells, so the threshold may change between runs — confirm in
#               get_signif_threshold.
n_alt_worlds = 200
signif_level = 0.005
signif_thresh = get_signif_threshold(signif_level, n_alt_worlds, regions, N, P)
print(signif_thresh)

9.762502034416457


In [None]:
# Sanity check: how many regions were scored and the type of a single score.
# (The original message had a stray ")" after the count.)
print(f'Number of regions: {len(statistics)} {type(statistics[0])}')

# Sort only once: `order` lists the region indexes by ascending test statistic, and
# the sorted values are derived from it instead of running a second, separate sort.
order = np.argsort(statistics)
sorted_statistics = np.asarray(statistics)[order]

# Count the regions whose test statistic reaches the significance threshold.
# searchsorted (default side='left') gives the position of the first sorted value
# >= signif_thresh, so every value from there to the end is significant.
top_k = len(statistics) - np.searchsorted(sorted_statistics, signif_thresh)
print(top_k, 'significant regions')

# Retrieve the indexes of the significant regions (highest statistic first),
# then retrieve their details.
indexes = order[::-1][:top_k]
significant_regions = [regions[i] for i in indexes]


Number of regions: 400) <class 'numpy.float64'>
0 significant regions
Argsort: [ 16  17  18 380  44 360  77  48 365 339 340 284 181 180 160 161 384  15
 392 391 394 393 242 241 200  12 139  65  57 110 359  42  80 326 100 105
 214 395 396 288 121 213 290 272 195   8 353  46 261 303 146 197 135 296
  28 159 358 319 109  50 271  14  36 304 108 348 285 370 141 256  27 334
 163 268 210 273 267 291 130 311 344  63  43  64 259 218 102 266 283 318
 276  37 123 250  13 137  71   1  40 390   0 382 317  26 245 134 229 253
 192 294 125 157 111 270 131 274  31   9 282 120 179 293 251 196 362 118
  21  20 237 224 238 113 147 235  67 103 107 183 158 305 337 151  85 328
  51 133 209  25 138  95 286 112 309 292 275 314  30 366  29  90 115 338
 376  75 172 162 333 247 132 122  47  45 280  19 219  35 297 281  41  89
 116  55  91 227  33 299 211 265 225 142 374 182 212 330  53 144  54 236
 216 308   7  86 232  10 316 287  49 104 260 355  87 336  56 243 295  76
 298  11 397 301  24 168 343 323 321 341 258 

In [None]:
# Display all statistically significant regions in one call. A shallow copy of the
# list is passed, so the helper never receives the notebook's own list object.
# Single-region alternative kept for reference (`df` is not defined in the visible cells):
#   show_grid_region(df, grid_info, true_types, best_region)
show_grid_regions(gdf_trajs, grid_info, true_types, list(significant_regions))
