# Auditing for Spatial Fairness

This notebook runs the experiments. The methods are implemented in the `src/functions.py` file.

In [None]:
import pandas as pd
import numpy as np
import folium


import sys
sys.path.append('./src')
                
from functions import *

# 1. Select the dataset

You can select one of the following:
- LAR (`data/LAR.csv`) contains the modified Loan/Application Register records in the US for Bank of America for the year 2021; the dataset is created by `data/LAR/create_LAR.ipynb`.
- Crime (`data/Crime.csv`) contains predictions about crime incidents in the city of Los Angeles from 2010–2019; the predictive model is a Random Forest Classifier and the dataset is created by `data/Crime/create_Crime.ipynb`.
- Synth_fair/Synth_unfair/Semisynth (`data/Synth_fair.csv`, `data/Synth_unfair.csv`, `data/Semisynth.csv`) are synthetic datasets created by the notebooks in `data/Synth/`.

In [None]:
## Dataset selection: exactly one `df = load_data(...)` / `label = ...` pair should be
## active; the other pairs are kept commented out as ready-made alternatives.
## `label` is the name of the column that later cells read as the outcome.

## load the LAR dataset
df = load_data('./data/LAR.csv')
label = 'action_taken'
## optional: restrict the LAR points to a bounding box
## df = filterbbox(df, -87.634938, 24.523096, -80.031362, 31.000888) ## florida
## df = filterbbox(df, -80.8736, 25.13742, -80.06279, 25.979434) # miami


## load the CRIME_serious dataset
# df = load_data('./data/Crime.csv')
# label = 'pred'


# load a synthetic dataset
# df = load_data('./data/Synth_fair.csv') ## FAIR
# label = 'label'


# df = load_data('./data/Synth_unfair.csv') ## UNFAIR
# label = 'label'


# df = load_data('./data/Semisynth.csv') ## FAIR
# label = 'label'


# N and P are reused by every scan / significance cell below; the prints show
# N as the number of points and P as the number of positives.
N, P = get_stats(df, label)

print(f'N={N} points')
print(f'P={P} positives')
df.head()

In [None]:
# Build the r-tree over the points in `df`; later cells pass it to the seed,
# region and partitioning helpers.
rtree = create_rtree(df)
# rtree = create_rtree_v2(df)  # alternative builder, currently disabled

In [None]:
# lat_max = df['lat'].values.max()
# lat_min = df['lat'].values.min()
# lon_max = df['lon'].values.max()
# lon_min = df['lon'].values.min()

# mapit = folium.Map(location=[37.09, -95.71], zoom_start=5, tiles="Stamen Toner")

# for index, row in df.iterrows():
#     if row[label] == 1:
#         folium.CircleMarker( location=(row['lat'], row['lon']), color='#00FF00', fill_color='#00FF00', fill=True, opacity=0.4, fill_opacity=0.4, radius=2 ).add_to( mapit )
#     elif row[label] == 0:
#         folium.CircleMarker( location=(row['lat'], row['lon']), color='#FF0000', fill_color='#FF0000', fill=True, opacity=0.4, fill_opacity=0.4, radius=2 ).add_to( mapit )


# mapit.fit_bounds([(lat_min, lon_min), (lat_max, lon_max)])

# mapit

# 2. Run Experiments

There are three experiments:
- Unrestricted regions: runs **our approach** on unrestricted regions.
- One Partitioning: runs **our approach** against **MeanVar** on regions from a single partitioning.
- Multiple Partitionings: runs **MeanVar** on multiple partitionings.


## Unrestricted regions (corresponds to Sec. 4.3 of the paper)

In [None]:
# Given a set of points (locations), cluster them and take the clusters' centroids. Then, for each
# centroid, find the nearest location in the r-tree: that point is used as a seed.
# NOTE(review): the third argument (100) is presumably the number of clusters/seeds — confirm in create_seeds.
seeds = create_seeds(df, rtree, 100)
print(len(seeds), seeds[:10])

In [None]:
# Given a set of seed point IDs and a list of radii, create_regions builds the candidate regions.
# For each seed and radius, it queries the spatial index (using query_range) to get all points within a square
# centered on the seed and with side 2*radius. Each such square is a region, whose information is packaged into a dictionary.
# Purpose: to generate many regions whose fairness (or lack thereof) can be audited.

# Radii 0.05, 0.10, ..., 1.00; the stop value 1.01 only guarantees that 1.00 itself is included.
radii = np.arange(0.05, 1.01, 0.05)
regions = create_regions(df, rtree, seeds, radii)

print(len(regions), 'regions')

In [None]:
# Base map; the start location and zoom are fixed values (presumably chosen to show the
# contiguous US — adjust for other datasets).
mapit = folium.Map(
    location=[37.09, -95.71],
    zoom_start=5,
    prefer_canvas=True,
    tiles='cartodbpositron',
)

# Draw every seed as a small, semi-transparent blue dot.
# id2loc looks up in 'df' the location of the point with the given ID.
for seed_id in seeds:
    seed_marker = folium.CircleMarker(
        location=id2loc(df, seed_id),
        color='#0000FF',
        fill_color='#0000FF',
        fill=True,
        opacity=0.4,
        fill_opacity=0.4,
        radius=2,
    )
    seed_marker.add_to(mapit)


# NOTE: the disabled code below draws two rectangles (smallest and largest radius) in the
# ocean; it was probably used only for debugging.

#center = np.array([26, -126])
#r = radii[0]
#lower_left = center - r
#upper_right = center + r
#folium.Rectangle([lower_left, upper_right], color='#F1CF3B').add_to( mapit )

#r = radii[-1]
#lower_left = center - r
#upper_right = center + r
#folium.Rectangle([lower_left, upper_right], color='#F1CF3B').add_to( mapit )


mapit

In [None]:
# Direction of the test handed to scan_regions; the commented-out values are the alternatives.
direction = 'both'
# direction = 'less_in'
# direction = 'less_out'

# get_true_types is used to handle the LAR dataset, in which the label "3" is remapped to "0".
true_types = get_true_types(df, label)
print(true_types[:30])

# Scan all candidate regions. `statistics` is indexed per region by the cells below
# (regions[i] is paired with statistics[i]).
best_region, max_likeli, statistics = scan_regions(regions, true_types, N, P, direction=direction, verbose=True)

# statistics.sort(key=lambda x: -x)
# print(statistics)

In [None]:
## determine the significance threshold based on a desired signif_level

n_alt_worlds = 200      # number of alternate worlds passed to get_signif_threshold
signif_level = 0.005    # desired significance level

# NOTE(review): `direction` is passed to scan_regions in the previous cell but not here —
# confirm that get_signif_threshold evaluates the same direction.
signif_thresh = get_signif_threshold(signif_level, n_alt_worlds, regions, N, P)
print(signif_thresh)

In [None]:
## identify regions with test statistic above statistical significance threshold

# Sort (ascending) the test statistics computed on the real data.
sorted_statistics = np.sort(statistics)

# Use the threshold computed from the simulations to find the number of regions, 'top_k',
# for which the null hypothesis is rejected at significance level 'signif_level', i.e. whose
# statistic on the real data is >= 'signif_thresh' (np.searchsorted defaults to side='left',
# so values equal to the threshold are counted).
# In other words, we are counting the regions in which the classifier *is not spatially fair*.
top_k = len(statistics) - np.searchsorted(sorted_statistics, signif_thresh)
print(top_k, 'significant regions')

# 'np.argsort' performs an indirect sort: it returns the indices that would sort 'statistics'
# in ascending order. Reversing them and keeping the first 'top_k' yields the indices of the
# regions with the largest statistics, largest first.
indexes = np.argsort(statistics)[::-1][:top_k]

# Select the 'top_k' regions that are not spatially fair
significant_regions = [ regions[i] for i in indexes ]

In [None]:
def intersects(regionA, regionB):
    """Tell whether the axis-aligned squares of two regions overlap or touch.

    Each region is a dict with a 'center' (a point ID, resolved to a location with
    ``id2loc(df, ...)``) and a 'radius'; its square spans center - radius to
    center + radius on both coordinates. Uses the notebook-level ``df``.

    Returns:
        bool: False when the squares are separated along at least one axis, True otherwise.
    """
    def _corners(region):
        # (lower corner, upper corner) of the square around the region's center.
        centre = np.array(id2loc(df, region['center']))
        half_side = np.array([region['radius'], region['radius']])
        return centre - half_side, centre + half_side

    a_low, a_high = _corners(regionA)
    b_low, b_high = _corners(regionB)

    # Two squares are disjoint iff one lies strictly beyond the other on some axis.
    for axis in (0, 1):
        if a_high[axis] < b_low[axis] or a_low[axis] > b_high[axis]:
            return False
    return True



# Greedy filter: walk the significant regions in order and keep a region only if its
# center has not been used yet and its square does not intersect any region already kept.
non_olap_regions = []
centers = []
for candidate in significant_regions:
    seed_id = candidate['center']
    if seed_id in centers:
        continue

    # any() stops at the first intersecting region, like an explicit loop with break.
    overlaps_kept = any(intersects(candidate, kept) for kept in non_olap_regions)
    if not overlaps_kept:
        centers.append(seed_id)
        non_olap_regions.append(candidate)

print(len(non_olap_regions), 'non-overlapping regions')

# over(non_olap_regions[0], non_olap_regions[20])

In [None]:
## find smallest, largest regions
##
## region_min_radius: among the regions with the smallest radius, the one with the fewest points.
## region_max_radius: among the regions with the largest radius, the one with the most points.
## When no region survived the overlap filter, both are None (instead of the cell failing
## with a NameError) and the radius/point bounds keep their sentinel values.

def _radius_then_size(region):
    # Order regions by radius first and by number of contained points second.
    return (region['radius'], len(region['points']))


if non_olap_regions:
    # min()/max() return the first extremal element, so ties resolve to the earliest region.
    region_min_radius = min(non_olap_regions, key=_radius_then_size)
    region_max_radius = max(non_olap_regions, key=_radius_then_size)

    min_radius = region_min_radius['radius']
    max_radius = region_max_radius['radius']
    min_points = len(region_min_radius['points'])
    max_points = len(region_max_radius['points'])

    print(min_points, max_points)
else:
    region_min_radius = None
    region_max_radius = None
    min_radius, max_radius = np.inf, -np.inf
    min_points, max_points = np.inf, -np.inf

    print('no non-overlapping regions: smallest/largest region undefined')

In [None]:
# Display the first five non-overlapping regions (fewer if the list is shorter).
show_circular_regions(df, true_types, non_olap_regions[:5])

# Alternative view: only the smallest and the largest region found in the previous cell.
# show_circular_regions(df, true_types, [region_min_radius, region_max_radius])

## One Partitioning 

**NOTE**: In the code seen so far, the candidate regions were built around seeds obtained by clustering the points, so no grid was used.
Below, the authors take the more common approach seen in other works, i.e., they superimpose a uniform grid over the space, which effectively partitions the points.

In [None]:
# Bounding box of all points in `df`; it is passed to create_partitioning in the next cell.
lat_max = df['lat'].values.max()
lat_min = df['lat'].values.min()
lon_max = df['lon'].values.max()
lon_min = df['lon'].values.min()
print(lat_min, lat_max, lon_min, lon_max)

In [None]:
### create the partitioning (grid) and its partitions (regions)

# lat_n = 12 ## number of partitions along vertical axis (latitude)  ## was 12
# lon_n = 25 ## number of partitions along horizontal axis (longitude) ## was 25

lat_n = 20  ## number of partitions along vertical axis (latitude)
lon_n = 20  ## number of partitions along horizontal axis (longitude)


# NOTE: this rebinds `regions`; from here on it holds the regions returned by
# create_partitioning, not the seed-based regions of the "Unrestricted regions" section.
grid_info, grid_loc2_idx, regions = create_partitioning(df, rtree, lon_min, lon_max, lat_min, lat_max, lon_n, lat_n)

### Our Method

In [None]:
# Scan the grid regions. This rebinds best_region / max_likeli / statistics, which until now
# held the results of the unrestricted experiment.
# NOTE(review): unlike the unrestricted scan, no `direction` is passed here, so scan_regions
# falls back to its default — confirm that this is the intended test.
best_region, max_likeli, statistics = scan_regions(regions, true_types, N, P, verbose=True)


In [None]:
## determine the significance threshold based on a desired signif_level

n_alt_worlds = 1000     # number of alternate worlds (the unrestricted experiment uses 200)
signif_level = 0.005    # desired significance level

# `regions` is the grid partitioning at this point, so the threshold is specific to this grid.
signif_thresh = get_signif_threshold(signif_level, n_alt_worlds, regions, N, P)
print(signif_thresh)

In [None]:
## identify regions with statistic above statistical significance threshold

# Ascending copy of the statistics, needed by np.searchsorted below.
sorted_statistics = np.sort(statistics)
# print(sorted_statistics[::-1][40:60])
# print(np.sort(statistics)[::-1][40:60])

# Number of regions whose statistic is >= signif_thresh (searchsorted defaults to side='left').
top_k = len(statistics) - np.searchsorted(sorted_statistics, signif_thresh)

print(top_k, 'significant regions')


# Indices of the top_k largest statistics, largest first.
indexes = np.argsort(statistics)[::-1][:top_k]

significant_regions = [ regions[i] for i in indexes ]


In [None]:
# Alternative: show only the single best region.
# show_grid_region(df, grid_info, true_types, best_region)
# Show every significant grid region (a shallow copy of the list is passed).
show_grid_regions(df, grid_info, true_types, significant_regions[:])


### MeanVar Method

In [None]:
## partitioning-based scan (MeanVar) over the current `regions`

the_region, max_score, scores = scan_partitioning(regions, true_types)

n_points_in_best = len(the_region['points'])
print('max_score', max_score, 'with', n_points_in_best, 'points')


## get the top_k regions

top_k = 5

# NaN scores are masked so that they cannot be ranked among the top regions.
ma = np.ma.masked_array(scores, mask=np.isnan(scores))

# Sorting the negated scores ascending is the same as sorting the scores descending.
neg_scores = -ma

print(-np.sort(neg_scores)[:top_k])

indexes = np.argsort(neg_scores)[:top_k]

# print(indexes)

top_regions = [regions[idx] for idx in indexes]

In [None]:
# Alternative: show only the single region returned by scan_partitioning.
# show_grid_region(df, grid_info, true_types, the_region)
# Show the top-ranked MeanVar regions on the grid.
show_grid_regions(df, grid_info, true_types, top_regions)


In [None]:
## best_region vs the_region
## Side-by-side comparison of the best region from scan_regions (best_region) and the
## top-ranked MeanVar region (the_region).

# Rebinds the_region to the first entry of the ranked MeanVar list.
the_region = top_regions[0]

# grid_loc2_idx maps a region's 'grid_loc' to the index used for `statistics` and `scores`.
best_idx = grid_loc2_idx[best_region['grid_loc']]
the_idx = grid_loc2_idx[the_region['grid_loc']]

print(best_region['grid_loc'], the_region['grid_loc'])
print(best_idx, the_idx)

# First line: scan_regions statistic of each region; second line: MeanVar score of each.
print(statistics[best_idx], statistics[the_idx])
print(scores[best_idx], scores[the_idx])


## Multiple Partitionings (does not apply to the method proposed in the paper)

In [None]:
# Bounding box of all points in `df` (same computation as in the "One Partitioning" section,
# repeated so this section can be run on its own once `df` is loaded).
lat_max = df['lat'].values.max()
lat_min = df['lat'].values.min()
lon_max = df['lon'].values.max()
lon_min = df['lon'].values.min()
print(lat_min, lat_max, lon_min, lon_max)

In [None]:
# `random` is imported explicitly: the notebook otherwise only has it if the star import
# from `functions` happens to leak it.
import random

# Inclusive bounds for the number of grid partitions along each axis.
lat_n_range = (10, 40)
lon_n_range = (10, 40)

n_partitionings = 100

# Dedicated, seeded generator: the sampled grid sizes are the same on every
# Restart & Run All, and the global `random` state is left untouched.
partitioning_rng = random.Random(0)

# One (grid_info, grid_loc2_idx, regions) tuple per sampled grid.
partitionings = []

for i in range(n_partitionings):
    lat_n = partitioning_rng.randint(*lat_n_range)
    lon_n = partitioning_rng.randint(*lon_n_range)

    # NOTE: as before, this rebinds grid_info / grid_loc2_idx / regions; after the loop they
    # describe the LAST sampled grid, not the 20x20 grid of the "One Partitioning" section.
    grid_info, grid_loc2_idx, regions = create_partitioning(df, rtree, lon_min, lon_max, lat_min, lat_max, lon_n, lat_n)

    partitionings.append((grid_info, grid_loc2_idx, regions))


### MeanVar Method

In [None]:
# Run MeanVar on every sampled partitioning and record, per grid, the NaN-ignoring mean
# of the region scores and the maximum score.
mean_scores = []
max_scores = []
for _grid_info, _loc2idx, grid_regions in partitionings:
    the_region, max_score, scores = scan_partitioning(grid_regions, true_types)

    mean_scores.append(np.nanmean(scores))
    max_scores.append(max_score)

    # print(f'{mean_score=:.4f}, {max_score=:.4f}')

# Aggregate over all partitionings.
print(f'mean of means={np.mean(mean_scores):.4f}')
print(f'max of maxs={np.max(max_scores):.4f}')