# Auditing a Classifier for Fairness Based on Movement Patterns

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from tqdm import tqdm

from pathlib import Path

### Aux functions

In [None]:
def flatten_lists_ids(np_list_candidates : np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    '''
    Flatten the per-candidate object ID lists into a single 1D array plus offset arrays.

    NOTE: the flat layout is chosen so that the result can be placed in joblib's shared memory.

    Parameters
    ----------
    np_list_candidates : np.ndarray
        An array of 1D sequences (NumPy arrays or lists), where each sequence contains the
        object IDs associated with a candidate. May be empty, and individual sequences may be empty.

    Returns
    -------
    flat_ids : np.ndarray
        A 1D uint32 array containing all the object IDs associated with the candidates, concatenated together.
    indptr : np.ndarray
        A uint32 array of length ``n_candidates + 1``; the IDs of candidate ``k`` are
        ``flat_ids[indptr[k]:indptr[k + 1]]``.
    lens : np.ndarray
        An int32 array containing the length of each candidate's list of associated object IDs.

    Raises
    ------
    OverflowError
        If the total number of IDs cannot be represented in the uint32 offsets.
    '''

    num_candidates = len(np_list_candidates)

    # Compute the lengths of each candidate's list. len() (rather than .size) also accepts plain lists.
    lens = np.fromiter((len(a) for a in np_list_candidates),
                       dtype=np.int32, count=num_candidates)

    # Accumulate in int64 so an overflow of the uint32 offsets is detected instead of silently wrapping.
    offsets = np.cumsum(lens, dtype=np.int64)
    if num_candidates and offsets[-1] > np.iinfo(np.uint32).max:
        raise OverflowError(
            f"Total number of object IDs ({int(offsets[-1])}) does not fit in the uint32 offsets."
        )

    # Compute the starting/ending positions of each candidate's list in the flattened array.
    indptr = np.empty(num_candidates + 1, dtype=np.uint32)
    indptr[0] = 0
    indptr[1:] = offsets

    # Flatten the lists into a single vector. np.concatenate rejects an empty sequence,
    # so the no-candidate case is handled explicitly.
    if num_candidates == 0:
        flat_ids = np.empty(0, dtype=np.uint32)
    else:
        flat_ids = np.concatenate(np_list_candidates).astype(np.uint32, copy=False)

    return flat_ids, indptr, lens
    

In [None]:
def batch_in_out_probs(labels_objects: np.ndarray, 
                       flat_ids: np.ndarray, indptr: np.ndarray, lens: np.ndarray,
                       tot_sum_labels: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute, for every candidate, the mean label of the objects inside it and of the objects outside it.

    Parameters
    ----------
    labels_objects : np.ndarray
        1D array of per-object labels, indexed by object ID.
    flat_ids : np.ndarray
        Object IDs of all candidates concatenated together (as returned by `flatten_lists_ids`).
    indptr : np.ndarray
        Offsets into `flat_ids`; candidate ``k`` owns ``flat_ids[indptr[k]:indptr[k + 1]]``.
    lens : np.ndarray
        Number of object IDs of each candidate.
    tot_sum_labels : int
        Sum of all the entries of `labels_objects`.

    Returns
    -------
    inside_mean : np.ndarray
        float64 array (len = n_candidates) with the mean label of the objects listed by each candidate.
        NaN for candidates that list no objects.
    outside_mean : np.ndarray
        float64 array (len = n_candidates) with the mean label of the remaining objects.
        NaN for candidates that leave no object outside.
    """

    # Total number of objects.
    num_objects = labels_objects.size
    num_candidates = lens.size

    # Gather labels for all ids, then sum per candidate via segmented reduction.
    # reduceat does NOT yield 0 for an empty segment (it returns a[start], or raises when
    # start == len(a)), so only the non-empty candidates are reduced; the others keep a sum of 0.
    # Skipping the empty ones keeps the segments contiguous, because they own no entries of flat_ids.
    inside_sum = np.zeros(num_candidates, dtype=np.float64)
    nonempty = lens > 0
    if nonempty.any():
        flat_vals = labels_objects[flat_ids]
        starts = indptr[:-1][nonempty].astype(np.intp, copy=False)
        # Accumulate integer/boolean labels in int64 so narrow dtypes (e.g. int8) cannot overflow.
        acc_dtype = np.int64 if flat_vals.dtype.kind in "biu" else None
        inside_sum[nonempty] = np.add.reduceat(flat_vals, starts, dtype=acc_dtype)

    inside_count = lens.astype(np.float64)
    outside_count = num_objects - inside_count

    # Compute the positive rates inside and outside each candidate.
    # Entries whose denominator is zero are left at NaN instead of triggering a division by zero.
    inside_mean = np.full(num_candidates, np.nan, dtype=np.float64)
    np.divide(inside_sum, inside_count, out=inside_mean, where=inside_count > 0)

    outside_mean = np.full(num_candidates, np.nan, dtype=np.float64)
    np.divide(tot_sum_labels - inside_sum, outside_count, out=outside_mean, where=outside_count > 0)

    return inside_mean, outside_mean

### Main code

Read the dataset with the objects' labels.
**TODO**: we are using dummy labels for now.

In [None]:
# Generate dummy binary labels for the objects (placeholder for the dataset with the true labels).
n_objects = 100000
positive_rate = 0.6
# Use a seeded Generator (instead of the legacy global RNG) so that the dummy labels are
# identical across notebook runs.
labels_rng = np.random.default_rng(0)
labels = labels_rng.binomial(n=1, p=positive_rate, size=n_objects).astype(np.int8)
# labels

Build a single NumPy vector containing the candidates of all the grids. Then, flatten the lists stored in that vector.

In [None]:
path_candidates = './data_simulator/huge_dataset/gencand/'
# iterdir() yields entries in arbitrary order; sorting makes the candidate order (and therefore
# every index derived from it) reproducible across runs and machines.
list_candidates_paths = sorted(f for f in Path(path_candidates).iterdir() if f.is_file())
if not list_candidates_paths:
    raise FileNotFoundError(f"No candidate files found in {path_candidates}")

# Read the candidates to be tested over a set of grids.
# The per-grid arrays are collected in a list and concatenated once at the end: calling np.append
# inside the loop would copy the whole accumulated array again for every file.
per_grid_candidates = []
for candidates_file in tqdm(list_candidates_paths,
                            desc="Processing candidate files",
                            unit="file"):

    # Read the candidates that have been generated for a specific grid.
    # NOTE(security): unpickling can execute arbitrary code; only load files produced by a trusted source.
    candidates = pd.read_pickle(candidates_file)
    # print(f"Reading grid candidates from {candidates_file}")

    # Extract the list of users associated with each candidate (subset of cells) of this grid.
    cand = candidates['list_users'].to_numpy()
    per_grid_candidates.append(cand)

    # print(f"Number of candidates: {cand.size}")

np_list_candidates = np.concatenate(per_grid_candidates)
del per_grid_candidates  # the concatenated array now holds the references

print(f"Total number of candidates: {np_list_candidates.size}")


### Flatten the object ID lists associated with the candidates into a 1D array (plus aux arrays) ###
flat_ids, indptr, lens = flatten_lists_ids(np_list_candidates)
del np_list_candidates  # free memory

Here we perform the Monte Carlo simulations needed to estimate the distribution of the test statistic under the assumption that the null hypothesis is true.
 
The test statistic is the maximum likelihood ratio computed across the regions of all the grids; the likelihood function is the binomial-based one.

In [None]:
# Number of label permutations to run.
num_simulations = 200

# One slot per simulation for the maximum likelihood ratio.
# NOTE: nothing writes into this vector yet (see the placeholder steps at the end of the loop body).
max_likelihood_ratios_vec = np.empty(num_simulations, dtype=np.float32)

# A permutation only reorders the labels, so their total is the same in every simulation:
# compute it once, outside the loop.
tot_sum_labels = labels.sum(dtype=np.uint32)

for i in tqdm(range(num_simulations)):
    # Null hypothesis H_0: a single global distribution governs the labels, i.e., no set of
    # geographical regions has associated objects whose average positive rate differs
    # significantly from that of the other objects. Randomly permuting the labels over the
    # objects produces one sample under H_0.
    # The generator is seeded with the simulation index, so each simulation is reproducible.
    rng = np.random.default_rng(i)
    shuffled_labels = rng.permutation(labels)

    # Positive rate of the objects associated with each subset of cells vs. that of all the other objects.
    np_list_candidates_inrate, np_list_candidates_outrate = batch_in_out_probs(
        labels_objects=shuffled_labels,
        flat_ids=flat_ids,
        indptr=indptr,
        lens=lens,
        tot_sum_labels=tot_sum_labels,
    )

    # TODO: for each candidate, compute the likelihood ratio.

    # TODO: determine the maximum ratio found and store it (presumably in max_likelihood_ratios_vec[i]).


# DEBUG
# np_list_candidates_inrate, np_list_candidates_outrate

### DEBUG: Simple for loop-based version of the computation of inside/outside positive rates; to be used for debugging purposes ###