# Auditing a Trajectory Classifier for Spatial Fairness

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import folium

from src.functions import *
from src.traj_functions import *

## Load the output of a trajectory classifier

Here we load a dataset that should contain the labels given by a classifier to trajectories. In the dataset, we expect to find:

1. the labels given to each trajectory;
2. the sequence of points associated with each trajectory.

In [None]:
# Read the labelled trajectory dataset from disk and wrap it in a GeoDataFrame.
# NOTE(review): pickle files can execute arbitrary code when loaded; only point
# 'dataset_path' at files from a trusted source.
dataset_path = './synth_traj_dataset_unfair.pkl'
_pickled_frame = pd.read_pickle(dataset_path)
gdf_trajs = gpd.GeoDataFrame(_pickled_frame)

# Quick visual sanity check of the first rows.
display(gdf_trajs.head())

# Name of the column holding the label the classifier assigned to each trajectory.
label = 'label'

# N and P are reported below as the total count and the count of positives
# (semantics defined by get_stats -- confirm there).
N, P = get_stats(gdf_trajs, label)

# Binarised labels: per the helper's intent, every label different from "1" is mapped
# to "0", reducing a multi-class problem to a binary one.
true_types = get_true_types(gdf_trajs, label)
# print(true_types[:30])

# Summary statistics; the positive label is "1", every other label counts as negative.
print(f'N={N} points, P={P} positives, frac.positives={P/N:.2f}')

In [None]:
# --- Disabled cell: optional folium preview of labelled points. ---
# Every line below is commented out, so this cell currently does nothing.
# NOTE(review): the snippet refers to a DataFrame named 'df' with 'lat'/'lon' columns;
# no such variable is defined in the visible cells (the loaded data is 'gdf_trajs'),
# so it must be adapted before being re-enabled.
# NOTE(review): the "Stamen Toner" tile set may not be available in recent folium
# releases -- confirm before re-enabling.
# Intended behaviour: green markers for rows labelled 1, red markers for rows labelled 0,
# with the map fitted to the bounding box of the data.

# lat_max = df['lat'].values.max()
# lat_min = df['lat'].values.min()
# lon_max = df['lon'].values.max()
# lon_min = df['lon'].values.min()

# mapit = folium.Map(location=[37.09, -95.71], zoom_start=5, tiles="Stamen Toner")

# for index, row in df.iterrows():
#     if row[label] == 1:
#         folium.CircleMarker( location=(row['lat'], row['lon']), color='#00FF00', fill_color='#00FF00', fill=True, opacity=0.4, fill_opacity=0.4, radius=2 ).add_to( mapit )
#     elif row[label] == 0:
#         folium.CircleMarker( location=(row['lat'], row['lon']), color='#FF0000', fill_color='#FF0000', fill=True, opacity=0.4, fill_opacity=0.4, radius=2 ).add_to( mapit )


# mapit.fit_bounds([(lat_min, lon_min), (lat_max, lon_max)])

# mapit

# 2. Run Experiments

There are two experiments:
- (TODO) Unrestricted regions: runs **our approach** on unrestricted regions, which should be derived from the clusters produced by a trajectory clustering algorithm.
- Regions are cells of a uniform grid: evaluates **our approach** over a single partitioning.


## One Partition Auditing

Instead of creating regions from clusters of trajectories/points, here we use a more common approach.
We superimpose a uniform grid over the space: the grid's cells will be the regions.

In [None]:
### Create the partitioning (a uniform grid) and its partitions (the regions to audit).
# Number of slices of the grid along the longitude and latitude axes.
num_slices_lon = 20
num_slices_lat = 20
# 'regions' is what the audit scans and 'grid_info' is handed to the map display further
# down; 'grid_loc2_idx' is not used by any other visible cell.
# NOTE(review): presumably this yields num_slices_lon * num_slices_lat cells -- confirm
# in create_traj_partitioning.
grid_info, grid_loc2_idx, regions = create_traj_partitioning(gdf_trajs, num_slices_lon, num_slices_lat)

### Audit the classifier's output with our approach

In [None]:
# Compute the test statistic of every region using the real (observed) labels.
# 'statistics' is indexed in parallel with 'regions' by the selection cell below.
# 'best_region' and 'max_likeli' are not used by any other visible cell; presumably they
# are the top-scoring region and its statistic -- confirm in scan_regions.
best_region, max_likeli, statistics = scan_regions(regions, true_types, N, P, verbose=True)

In [None]:
# Conduct 'n_alt_worlds' simulations.
# In each simulation, the labels of the trajectories are shuffled and the test statistic
# of every region is recomputed.
# The distribution of the test statistics obtained across the simulations is then used to
# determine the significance threshold: 'signif_level' is the desired quantile over that
# distribution.
#
# Example: with 200 simulations and a signif_level of 0.005, we are interested in regions
#          whose test statistic is larger than the top 200*0.005 = 1st test statistic
#          obtained in the simulations.
#
# NOTE(review): if the shuffling is not seeded inside get_signif_threshold, the threshold
# will vary slightly from run to run -- confirm.
n_alt_worlds = 200
signif_level = 0.005
signif_thresh = get_signif_threshold(signif_level, n_alt_worlds, regions, N, P)
print(signif_thresh)

In [None]:
print(f'Number of regions to evaluate: {len(statistics)}')

### Identify regions with test statistic above the statistical significance threshold ###

# Sort 'statistics' in ascending order.
sorted_statistics = np.sort(statistics)

# Count the regions whose test statistic is STRICTLY greater than 'signif_thresh'.
# With side='right', 'np.searchsorted' returns the insertion index placed after any
# entries equal to 'signif_thresh', i.e. the number of values <= signif_thresh.
# Subtracting that from len(statistics) leaves the number of values > signif_thresh.
# (The default side='left' would also count values equal to the threshold, so e.g. a
# threshold of 0 with all-zero statistics would flag every region as unfair.)
top_k = len(statistics) - np.searchsorted(sorted_statistics, signif_thresh, side='right')

# 'np.argsort(statistics)' returns the indices that would sort the statistics array in ascending order.
# '[::-1]' reverses the array, so that the highest values come first.
# '[:top_k]' selects the indexes of the top_k regions whose test statistic is larger than
# signif_thresh: these represent the 'unfair' regions. The remaining indexes are the 'fair' ones.
desc_sorted_indexes = np.argsort(statistics)[::-1]
indexes_unfair_regions = desc_sorted_indexes[:top_k]
indexes_fair_regions = desc_sorted_indexes[top_k:]

# Materialise the two groups of regions; 'statistics' and 'regions' share the same indexing.
significant_regions = [regions[i] for i in indexes_unfair_regions]
normal_regions = [regions[i] for i in indexes_fair_regions]

print(f'Indexes of unfair regions: {indexes_unfair_regions}')
print(f'{len(significant_regions)} significant (unfair) regions found')
print(f'{len(normal_regions)} fair regions found')

In [None]:
# Display on a map the grid and the regions with the test statistics above the significance threshold.
# The label column is passed via the notebook-level 'label' variable (set in the loading
# cell) rather than a hard-coded string, so changing the column name there is honoured here.
show_traj_grid_regions(gdf_trajs, grid_info, true_types, normal_regions, significant_regions, label_trajs=label)