# Auditing a Trajectory Classifier for Spatial Fairness

In [3]:
import pandas as pd
import numpy as np
import folium

import sys
sys.path.append('./src')    
from functions import *

## Load the output of a trajectory classifier

Here we load a dataset that should contain the labels given by a classifier to trajectories. In the dataset, we expect to find:

1. the labels given to each trajectory;
2. the sequence of points associated with each trajectory.

**NOTE**: for now we use the LAR (`data/LAR.csv`) for development purposes, which refers to points and not trajectories. It contains the modified Loan/Application Register records in the US for Bank of America for the year 2021; the dataset is created by `data/LAR/create_LAR.ipynb`.

**TODO**: adapt the code to a dataset of trajectories.

In [7]:
## Load the classifier output and preview the first rows.
df = load_data('./data/LAR.csv')
display(df.head())

# Name of the column holding the label assigned by the classifier.
label = 'action_taken'
# N and P are reported below as the number of points and the number of positives.
N, P = get_stats(df, label)

# General statistics about the points and labels. The positive label is "1";
# every other label value is considered negative.
print(f'N={N} points')
print(f'P={P} positives')

Unnamed: 0,action_taken,census_tract,location,lat,lon
0,1,10003015200,"(39.7070504, -75.5832416)",39.70705,-75.583242
1,1,6059086502,"(33.8503559, -117.9121351)",33.850356,-117.912135
2,1,26163596100,"(42.1014482, -83.1602786)",42.101448,-83.160279
3,1,9009165600,"(41.3558996, -72.9323558)",41.3559,-72.932356
4,3,36061001200,"(40.7159065, -73.9820936)",40.715907,-73.982094


N=206418 points
P=127286 positives


In [None]:
# Build the spatial index over the points of 'df'; it is reused by every range query
# in the notebook (seed creation, region creation, grid partitioning).
rtree = create_rtree(df)
# Alternative index builder, kept for reference:
# rtree = create_rtree_v2(df)

In [None]:
# (Disabled) Quick-look map: draws every point with folium, green for label 1 and red for label 0,
# then fits the view to the bounding box of the data.
# NOTE(review): before re-enabling, be aware that negatives in this dataset are "any label other
# than 1" (e.g. 3 in the preview above), so the `== 0` branch below would draw none of them.
# Iterating over all rows with iterrows() is also slow for a dataset of this size.

# lat_max = df['lat'].values.max()
# lat_min = df['lat'].values.min()
# lon_max = df['lon'].values.max()
# lon_min = df['lon'].values.min()

# mapit = folium.Map(location=[37.09, -95.71], zoom_start=5, tiles="Stamen Toner")

# for index, row in df.iterrows():
#     if row[label] == 1:
#         folium.CircleMarker( location=(row['lat'], row['lon']), color='#00FF00', fill_color='#00FF00', fill=True, opacity=0.4, fill_opacity=0.4, radius=2 ).add_to( mapit )
#     elif row[label] == 0:
#         folium.CircleMarker( location=(row['lat'], row['lon']), color='#FF0000', fill_color='#FF0000', fill=True, opacity=0.4, fill_opacity=0.4, radius=2 ).add_to( mapit )


# mapit.fit_bounds([(lat_min, lon_min), (lat_max, lon_max)])

# mapit

# 2. Run Experiments

There are three experiments:
- Unrestricted regions: runs **our approach** on unrestricted regions.
- One Partitioning: runs **our approach** against **MeanVar** on regions from a single partitioning.
- Multiple Partitionings: runs **MeanVar** on multiple partitionings.


## Unrestricted regions (corresponds to Sec. 4.3 of the paper)

In [None]:
# Cluster the points (locations) and take the clusters' centroids. Then, for each centroid,
# find the nearest location in the r-tree: that point is used as a seed.
# The last argument (100) is presumably the number of clusters/seeds -- confirm in create_seeds.
# NOTE(review): if the clustering is randomized, no seed is fixed in this notebook, so the
# seeds may change between runs -- confirm in create_seeds.
seeds = create_seeds(df, rtree, 100)
print(len(seeds), seeds[:10])

In [None]:
# Create the candidate regions from the seed point IDs and a list of radii.
# For each (seed, radius) pair, create_regions queries the spatial index (using query_range)
# for all points inside the square centered on the seed with side 2*radius. That square is
# one region; its information is packaged into a dictionary.
# Purpose: to generate many regions whose fairness (or lack thereof) can be audited.

# Radii from 0.05 to 1.00 in steps of 0.05; the stop value 1.01 keeps 1.00 in the list.
# Units are those of the lat/lon columns (presumably degrees -- confirm).
radii = np.arange(0.05, 1.01, 0.05)
regions = create_regions(df, rtree, seeds, radii)

print(len(regions), 'regions')

In [None]:
# Direction of the test passed to scan_regions; the alternatives are kept below for quick switching.
# Presumably 'less_in' / 'less_out' restrict the scan to regions with a lower positive rate
# inside / outside the region -- confirm in functions.scan_regions.
direction = 'both'
# direction = 'less_in'
# direction = 'less_out'

# Scan every candidate region. The call returns the best region, its statistic and the list of
# per-region statistics ('statistics' is indexed like 'regions' in the cells below).
# NOTE(review): 'true_types' is not defined in any cell visible in this notebook. Unless it is
# exported by `from functions import *`, this line raises NameError on a fresh kernel.
best_region, max_likeli, statistics = scan_regions(regions, true_types, N, P, direction=direction, verbose=True)

# statistics.sort(key=lambda x: -x)
# print(statistics)

In [None]:
## Determine the significance threshold for the desired signif_level.
# n_alt_worlds is the number of simulated ("alternate") worlds; signif_level is the target
# significance level passed to get_signif_threshold.
# NOTE(review): no random seed is set anywhere in the notebook, so if get_signif_threshold
# draws random worlds the threshold will differ slightly between runs -- confirm.
n_alt_worlds = 200
signif_level = 0.005

signif_thresh = get_signif_threshold(signif_level, n_alt_worlds, regions, N, P)
print(signif_thresh)

In [None]:
## Identify the regions whose test statistic reaches the statistical significance threshold.

# Sort the per-region statistics computed on the real data in ascending order.
sorted_statistics = np.sort(statistics)

# Use the threshold computed from the simulations to count the regions, 'top_k', for which
# the null hypothesis is rejected at significance level 'signif_level'.
# np.searchsorted (default side='left') returns the position of the first sorted value that is
# >= signif_thresh, so every statistic from that position to the end is significant.
# In other words, 'top_k' is the number of regions where the classifier *is not spatially fair*.
top_k = len(statistics) - np.searchsorted(sorted_statistics, signif_thresh)
print(top_k, 'significant regions')

# np.argsort performs an indirect sort: it returns the indices that would sort 'statistics'
# in ascending order. Reversing them and keeping the first 'top_k' gives the indices of the
# significant regions, ordered from the largest statistic down.
indexes = np.argsort(statistics)[::-1][:top_k]

# Select the 'top_k' regions that are not spatially fair (same order as 'indexes').
significant_regions = [ regions[i] for i in indexes ]

In [None]:
def intersects(regionA, regionB):
    """Tell whether the square footprints of two regions overlap.

    Each region is a dict with a 'center' (a point id, resolved to coordinates through
    ``id2loc(df, ...)``) and a 'radius' (half the side of its axis-aligned square).
    Squares that merely touch along an edge or at a corner count as intersecting.
    """
    def bounds(region):
        # Lower and upper corners of the square centered on the region's seed point.
        middle = np.array(id2loc(df, region['center']))
        half_side = np.array([region['radius'], region['radius']])
        return middle - half_side, middle + half_side

    lo_a, hi_a = bounds(regionA)
    lo_b, hi_b = bounds(regionB)

    # Two axis-aligned squares are disjoint exactly when a gap separates them on some axis.
    gap_axis0 = hi_a[0] < lo_b[0] or lo_a[0] > hi_b[0]
    gap_axis1 = hi_a[1] < lo_b[1] or lo_a[1] > hi_b[1]

    return not (gap_axis0 or gap_axis1)



# Greedy selection of mutually non-overlapping regions. 'significant_regions' is ordered by
# decreasing statistic, so a region is kept only if it overlaps none of the
# higher-ranked regions that were already kept.
non_olap_regions = []
centers = []
for region in significant_regions:
    # Cheap shortcut: a seed that already owns a kept region is skipped right away.
    if region['center'] in centers:
        continue

    # any() stops at the first kept region that overlaps the candidate.
    if any(intersects(region, kept) for kept in non_olap_regions):
        continue

    centers.append(region['center'])
    non_olap_regions.append(region)

print(len(non_olap_regions), 'non-overlapping regions')

In [None]:
## Find the smallest and the largest non-overlapping regions.
#
# 'region_min_radius' is the region with the smallest radius (fewest points on ties);
# 'region_max_radius' is the region with the largest radius (most points on ties).
# When several regions tie on both criteria, the first one in 'non_olap_regions' wins.

# Reset every result first: with an empty 'non_olap_regions' the cell must not fail with a
# NameError, nor silently reuse values left in the kernel by a previous run.
min_radius, max_radius = np.inf, -np.inf
min_points, max_points = np.inf, -np.inf
region_min_radius = None
region_max_radius = None

if non_olap_regions:
    min_radius = min(region['radius'] for region in non_olap_regions)
    max_radius = max(region['radius'] for region in non_olap_regions)

    # min()/max() return the first extreme element, matching a strict '<' / '>' scan.
    region_min_radius = min(
        (region for region in non_olap_regions if region['radius'] == min_radius),
        key=lambda region: len(region['points']),
    )
    region_max_radius = max(
        (region for region in non_olap_regions if region['radius'] == max_radius),
        key=lambda region: len(region['points']),
    )
    min_points = len(region_min_radius['points'])
    max_points = len(region_max_radius['points'])

    print(len(region_min_radius['points']), len(region_max_radius['points']))
else:
    print('no non-overlapping regions: smallest/largest regions are undefined')

In [None]:
# Visualize the first five non-overlapping regions, i.e. those with the highest statistics.
# NOTE(review): 'true_types' is not defined in any cell visible in this notebook -- confirm
# where it comes from before running on a fresh kernel.
show_circular_regions(df, true_types, non_olap_regions[:5])

# Alternative view: only the smallest and the largest region found in the previous cell.
# show_circular_regions(df, true_types, [region_min_radius, region_max_radius])

## One Partitioning evaluation

Instead of creating regions from clusters of trajectories/points, here we use a more common approach.
We superimpose a uniform grid over the space: the grid's cells will be the regions.

#### Aux functions

In [None]:
def create_partitioning(df, rtree, lon_min: float, lon_max: float, lat_min: float, lat_max: float, lon_n: int, lat_n: int):
    """Split a bounding box into a uniform grid and collect the points of every cell.

    Parameters
    ----------
    df, rtree
        Passed unchanged to ``query_range_box`` (the data and its spatial index).
    lon_min, lon_max, lat_min, lat_max : float
        Bounding box to partition.
    lon_n, lat_n : int
        Number of cells along the longitude / latitude axis. They are used with
        ``range()``, so they must be integers.

    Returns
    -------
    grid_info : dict
        The bounding box and the grid resolution ('lon_min', 'lon_max', 'lat_min',
        'lat_max', 'lat_n', 'lon_n').
    grid_loc2_idx : dict
        Maps a grid location ``(j, i)`` (longitude index, latitude index) to the
        position of that cell in ``partitions``.
    partitions : list of dict
        One dict per cell with keys 'grid_loc' and 'points', in row-major order
        (all longitude cells of the first latitude row, then the next row, ...).

    NOTE(review): whether a point lying exactly on an edge shared by two cells is
    reported by one or by both of them depends on ``query_range_box`` -- confirm.
    """
    grid_info = {
        'lon_min': lon_min,
        'lon_max': lon_max,
        'lat_min': lat_min,
        'lat_max': lat_max,
        'lat_n': lat_n,
        'lon_n': lon_n,
    }

    lat_span = lat_max - lat_min
    lon_span = lon_max - lon_min

    # The longitude edges are the same for every latitude row: compute them once.
    lon_edges = [
        (lon_min + (j/lon_n)*lon_span, lon_min + ((j+1)/lon_n)*lon_span)
        for j in range(lon_n)
    ]

    grid_loc2_idx = {}  ## maps (x,y) grid_loc coords to an index in the partitions array
    partitions = []
    for i in range(lat_n):
        lat_start = lat_min + (i/lat_n)*lat_span
        lat_end = lat_min + ((i+1)/lat_n)*lat_span
        for j, (lon_start, lon_end) in enumerate(lon_edges):
            points = query_range_box(df, rtree, lon_start, lon_end, lat_start, lat_end)
            grid_loc2_idx[(j, i)] = len(partitions)
            partitions.append({
                'grid_loc': (j, i),
                'points': points,
            })

    return grid_info, grid_loc2_idx, partitions

In [None]:
# Determine the bounding box that contains all the points/trajectories.
lat_max = df['lat'].values.max()
lat_min = df['lat'].values.min()
lon_max = df['lon'].values.max()
lon_min = df['lon'].values.min()
print(lat_min, lat_max, lon_min, lon_max)


### Create the partitioning (grid) and its partitions (regions).

lat_n = 20 ## number of partitions along vertical axis (latitude)  ## was 12
lon_n = 20 ## number of partitions along horizontal axis (longitude) ## was 25

# NOTE: this overwrites 'regions' from the unrestricted-regions experiment. From here on,
# 'regions' holds the grid cells, so re-running earlier cells afterwards mixes the two experiments.
grid_info, grid_loc2_idx, regions = create_partitioning(df, rtree, lon_min, lon_max, lat_min, lat_max, lon_n, lat_n)

### Audit the classifier's output with our approach

In [None]:
# Scan the grid cells with the same procedure used for the unrestricted regions; no 'direction'
# is passed here, so scan_regions uses its default.
# NOTE(review): 'true_types' is not defined in any cell visible in this notebook -- confirm.
best_region, max_likeli, statistics = scan_regions(regions, true_types, N, P, verbose=True)

In [None]:
## Determine the significance threshold for the desired signif_level, this time for the grid cells.
# Same settings as in the unrestricted-regions experiment; the threshold is recomputed because
# 'regions' now holds the grid cells.
n_alt_worlds = 200
signif_level = 0.005

signif_thresh = get_signif_threshold(signif_level, n_alt_worlds, regions, N, P)
print(signif_thresh)

In [None]:
## Identify the grid cells whose statistic reaches the statistical significance threshold.

# Ascending sort, so that searchsorted can locate the threshold position.
sorted_statistics = np.sort(statistics)
# print(sorted_statistics[::-1][40:60])
# print(np.sort(statistics)[::-1][40:60])

# Number of statistics that are >= signif_thresh (np.searchsorted defaults to side='left').
top_k = len(statistics) - np.searchsorted(sorted_statistics, signif_thresh)

print(top_k, 'significant regions')


# Indices of the 'top_k' largest statistics, from the largest down.
indexes = np.argsort(statistics)[::-1][:top_k]

# The significant grid cells, in the same order as 'indexes'.
significant_regions = [ regions[i] for i in indexes ]


In [None]:
# Alternative view: only the single best cell.
# show_grid_region(df, grid_info, true_types, best_region)
# Show all significant grid cells ([:] passes a shallow copy of the list).
# NOTE(review): 'true_types' is not defined in any cell visible in this notebook -- confirm.
show_grid_regions(df, grid_info, true_types, significant_regions[:])


In [None]:
## best_region vs the_region

the_region = top_regions[0]

best_idx = grid_loc2_idx[best_region['grid_loc']]
the_idx = grid_loc2_idx[the_region['grid_loc']]

print(best_region['grid_loc'], the_region['grid_loc'])
print(best_idx, the_idx)

print(statistics[best_idx], statistics[the_idx])
print(scores[best_idx], scores[the_idx])

## Try out multiple partitionings

In theory this experiment is not needed with methods based on hypothesis testing (it was used only with MeanVar), but it can be reintegrated from the original notebook if necessary.