# Auditing a Classifier for Fairness Based on Movement Patterns

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import pickle
from tqdm import tqdm

from pathlib import Path

### Aux functions

### Main code

Read the dataset with the objects' labels.
**TODO**: we are using dummy labels for now.

In [None]:
# Folder that holds the pre-generated candidates.
path_candidates = './data_simulator/huge_dataset/gencand/'

# NOTE: unpickling runs code embedded in the file, so only load trusted files.
with Path(path_candidates, "dict_flattened_candidates.pkl").open("rb") as f:
    data = pickle.load(f)

# The dictionary stores the candidates' object-ID lists in flattened form:
#   'flat_ids'  -> the object IDs of all the candidates, concatenated;
#   'start_pos' -> used below as the offsets delimiting each candidate inside flat_ids;
#   'lengths'   -> used below as the number of objects of each candidate.
flat_ids = data['flat_ids']
indptr = data['start_pos']
lens = data['lengths']

# Only the three arrays are needed from here on.
del data

Build a single numpy vector containing the candidates of all the grids. Then, flatten the lists in the array.

In [None]:
# Placeholder for the true labels of the objects: draw one Bernoulli(positive_rate)
# label per object. The global NumPy RNG is not seeded here, so the labels differ
# from run to run.
n_objects = 100_000
positive_rate = 0.6
labels = np.random.binomial(1, positive_rate, n_objects).astype(np.int8)

In [None]:
from scipy.special import xlogy, xlog1py

def batch_max_likelihood_ratio(labels_objects: np.ndarray,
                               flat_ids: np.ndarray, indptr: np.ndarray, lens: np.ndarray,
                               tot_sum_labels: int,
                               logL0_max: float) -> tuple[np.ndarray, np.ndarray, np.ndarray, float]:
    """Compute the binomial log-likelihood ratio of every candidate in one vectorized pass.

    For each candidate, the objects listed for it ("inside") are compared with
    all the remaining objects ("outside"):

        logL1 = p*log(p/n) + (n-p)*log(1 - p/n)
              + (P-p)*log((P-p)/(N-n)) + (N-n-(P-p))*log(1 - (P-p)/(N-n))
        logLR = logL1 - logL0_max

    where p / n are the positives / objects inside the candidate and P / N the
    positives / objects of the whole dataset.

    Args:
        labels_objects: 1-D array with one 0/1 label per object.
        flat_ids: object IDs of all the candidates, concatenated; used to index
            ``labels_objects``.
        indptr: offsets into ``flat_ids``; candidate ``i`` spans
            ``flat_ids[indptr[i]:indptr[i + 1]]``. Assumed to be CSR-style
            (non-decreasing, one more entry than there are candidates, last
            entry equal to ``flat_ids.size``) -- TODO confirm against the
            code that writes the pickle.
        lens: number of objects of each candidate (assumed consistent with
            ``indptr``).
        tot_sum_labels: total number of positive labels, P.
        logL0_max: maximized log-likelihood under the null hypothesis.

    Returns:
        ``(inside_positive_rate, outside_positive_rate, logLR, maxLogLR)``:
        the two float32 per-candidate rate arrays, the per-candidate
        log-likelihood ratios, and their maximum (NaNs ignored) as a float.
    """
    # Labels of the objects of every candidate, laid out back to back.
    flat_vals = labels_objects[flat_ids]

    # Per-candidate number of positives via a segmented sum.
    # np.add.reduceat does NOT return 0 for an empty segment: when
    # indices[i] >= indices[i+1] it returns array[indices[i]], and it raises if
    # that index is past the end. Empty candidates are therefore skipped and
    # keep a sum of 0; since the skipped segments are empty, every remaining
    # start is still followed by the true end of its segment.
    starts = indptr[:-1]
    nonempty = indptr[1:] > starts
    # float64 keeps the large, nearly equal terms subtracted below accurate.
    inside_sum = np.zeros(starts.shape, dtype=np.float64)
    if nonempty.any():
        inside_sum[nonempty] = np.add.reduceat(flat_vals, starts[nonempty])

    # Positive rate of the objects inside each candidate vs. that of all the
    # other objects. `where` leaves the pre-filled 0 wherever the denominator
    # is 0, which avoids divisions by zero.
    p, n = inside_sum, lens
    P, N = tot_sum_labels, labels_objects.size
    inside_positive_rate = np.divide(p, n, out=np.zeros(p.shape, dtype=np.float32), where=(n > 0))
    outside_positive_rate = np.divide(P - p, N - n, out=np.zeros(p.shape, dtype=np.float32), where=((N - n) > 0))

    # Log-likelihood under the alternative hypothesis. xlogy(0, 0) and
    # xlog1py(0, -1) evaluate to 0, so a degenerate rate (0 or 1) makes the
    # corresponding term vanish instead of producing nan/-inf as the plain
    # `count * np.log(rate)` form would.
    logL1 = (xlogy(p, inside_positive_rate)                        # p * log(rate_in)
             + xlog1py(n - p, -inside_positive_rate)               # (n - p) * log(1 - rate_in)
             + xlogy(P - p, outside_positive_rate)                 # (P - p) * log(rate_out)
             + xlog1py(N - n - (P - p), -outside_positive_rate))   # (N - n - (P - p)) * log(1 - rate_out)

    # Log-likelihood ratio of every candidate, and the most extreme one.
    logLR = logL1 - logL0_max
    maxLogLR = float(np.nanmax(logLR))

    return inside_positive_rate, outside_positive_rate, logLR, maxLogLR

#### Compute the empirical distribution of the considered test statistic with a certain number of Monte Carlo simulations.

Here we perform the Monte Carlo simulations needed to determine the distribution of the test statistic under the assumption that the null hypothesis is true. The test statistic used is the maximum likelihood ratio computed across the regions of all the grids, while the likelihood function is the binomial-based one.

In [None]:
# Totals of the dataset; both are constant across permutations.
# P is kept as a plain Python int so that later arithmetic on it cannot wrap
# around the way an unsigned NumPy scalar can.
P = int(labels.sum(dtype=np.int64))
N = labels.size


# Compute the L_0 log-likelihood, which models the likelihood of observing the labels in the data under the
# assumption that the null hypothesis H_0 is true, i.e., there is a single global distribution that governs the
# labels. L_0 is constant across permutations, since it depends only on the total number of positive and negative
# labels in the dataset, which does not change when shuffling the original labels.
# xlogy / xlog1py evaluate 0 * log(0) as 0, so the value stays finite (instead of nan) even when all the labels
# are equal (rho == 0 or rho == 1).
rho = P / N
logL0_max = float(xlogy(P, rho) + xlog1py(N - P, -rho))


num_simulations = 200
# float64, so that the simulated maxima are not rounded before being compared with the observed statistic.
vec_max_LR = np.empty(num_simulations, dtype=np.float64)
for i in tqdm(range(num_simulations)):
    # Shuffle the original labels assigned to the objects. This represents the null hypothesis H_0, according to
    # which there is a single global distribution that governs the labels, i.e., there is not one or more sets of
    # geographical regions in which the associated objects have an average positive rate that is significantly
    # different than that of the other objects.
    # The generator is seeded with the simulation index, so every run reproduces the same permutations.
    rng = np.random.default_rng(i)
    shuffled_labels = rng.permutation(labels)

    # For the objects associated with each subset of cells, compute their positive rate vs that of the other
    # objects; only the maximum log-likelihood ratio of the simulation is kept.
    _, _, _, vec_max_LR[i] = batch_max_likelihood_ratio(shuffled_labels,
                                                        flat_ids, indptr, lens,
                                                        P, logL0_max)


# Sort the max_LR distribution obtained via the simulations.
# This is the distribution of the likelihood ratios of the most extreme regions observed empirically
# under the assumption that the null hypothesis H_0 is true.
sorted_vec_max_LR = np.sort(vec_max_LR)

# Print the most extreme likelihood ratio observed across all the simulations.
print(sorted_vec_max_LR[-1])

#### Now compute the max likelihood ratio from the candidates when considering the original labels.

In [None]:
# Same computation as in the simulations, this time on the original (unshuffled) labels.
(inside_positive_rate,
 outside_positive_rate,
 vec_LR_dataset,
 max_LR_dataset) = batch_max_likelihood_ratio(labels_objects=labels,
                                              flat_ids=flat_ids,
                                              indptr=indptr,
                                              lens=lens,
                                              tot_sum_labels=P,
                                              logL0_max=logL0_max)

# Display the observed value of the test statistic.
max_LR_dataset

#### Finally, determine if $H_0$ must be rejected.

In [None]:
# Significance level required (probability of rejecting the null when it is actually true).
alpha = 0.05

# Compare the observed statistic in the same precision in which the simulated statistics are stored, so that a
# simulated value equal to the observed one is recognised as a tie.
observed_stat = sorted_vec_max_LR.dtype.type(max_LR_dataset)

# Number of simulated statistics strictly smaller than the observed one. side="left" places the observed value
# before any tied simulated value, so ties are counted among the "at least as extreme" simulations below.
pos = np.searchsorted(sorted_vec_max_LR, observed_stat, side="left")

# Monte Carlo p-value (right tail) with the +1 correction:
# (1 + #{simulated >= observed}) / (1 + num_simulations).
p_value = (num_simulations - pos + 1) / (num_simulations + 1)

# Based on the distribution and the real data we have, decide if we have to reject H_0.
reject_H0 = p_value <= alpha

print(f"position in sorted MC sample: {pos}/{num_simulations}")
print(f"Monte Carlo p-value: {p_value:.6f}")
print("Reject H0" if reject_H0 else "Do NOT reject H0")

In [None]:
# DEBUG: plot the distribution of the log-likelihood ratios computed over the subsets of cells (candidates)
# when considering the original labels.

import numpy as np
import matplotlib.pyplot as plt

plt.hist(vec_LR_dataset, bins=200)  # tune bins (100-500 usually fine)
plt.xlabel("Values of the candidates' log-likelihood ratios")
# vec_LR_dataset holds one value per candidate, so its size is the number of candidates.
plt.ylabel(f"Frequency (log-scale) (total: {vec_LR_dataset.size} candidates)")
plt.yscale("log")
plt.show()

In [None]:
# DEBUG: inspect the subset of cells (candidate) that attains the maximum LR on the original labels.

# Ascending ordering of the candidates by LR: the last position holds the maximum.
idx = np.argsort(vec_LR_dataset)

# Reorder every per-candidate array with that same ordering.
outside_positive_rate_sorted = np.take(outside_positive_rate, idx)
inside_positive_rate_sorted = np.take(inside_positive_rate, idx)
lens_sorted = np.take(lens, idx)
vec_LR_dataset_sorted = np.take(vec_LR_dataset, idx)

# Report the rates, size and LR of the candidate in the last (maximum) position.
print(
    f"Info most 'problematic' candidate: local ps: {inside_positive_rate_sorted[-1]}, "
    f"other ps: {outside_positive_rate_sorted[-1]}, num_objs_candidate: {lens_sorted[-1]}, "
    f"likelihood ratio: {vec_LR_dataset_sorted[-1]}"
)


### DEBUG: Simple for loop-based version of the computation of inside/outside positive rates; to be used for debugging purposes ###