# Auditing a Classifier for Fairness Based on Movement Patterns

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import numba as nb

from tqdm import tqdm

### Main code

In [None]:
def batch_in_out_probs(labels: np.ndarray, list_users: np.ndarray):
    """
    Compute, for every candidate, the mean label inside and outside it.

    Parameters
    ----------
    labels : np.ndarray
        1D integer array with one label per object (0/1 in the intended use,
        so the means are positive rates).
    list_users : np.ndarray
        Object array (or any sequence) of 1D integer id sequences, e.g.
        ``candidates["list_users"].to_numpy()``. Each id is used as an index
        into ``labels``.

    Returns
    -------
    inside_mean, outside_mean : tuple of np.ndarray
        Two float32 arrays of length ``len(list_users)``: the mean label over
        the ids of each candidate, and the mean label over all remaining
        objects. Both are empty when ``list_users`` is empty.

    Raises
    ------
    ValueError
        If a candidate has no ids or has exactly ``labels.size`` ids, since one
        of the two means would then be a division by zero.
    """
    n_candidates = len(list_users)
    if n_candidates == 0:
        # np.concatenate rejects an empty sequence; there is nothing to compute.
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.float32)

    n = labels.size
    tot_sum = labels.sum(dtype=np.int64)

    # Number of ids in each candidate. Kept as int64 so that neither the sizes
    # nor their running total below can wrap around on very large inputs.
    lens = np.fromiter((len(ids) for ids in list_users), dtype=np.int64, count=n_candidates)

    # Reject candidates whose inside or outside set is empty (division by zero).
    if np.any(lens == 0) or np.any(lens == n):
        raise ValueError("Found candidate with size 0 or size n; handle these cases explicitly.")

    # "indptr" = starting offsets of each candidate inside the flattened array.
    # The total flattened length can exceed 2**32, hence 64-bit offsets.
    indptr = np.zeros(n_candidates + 1, dtype=np.int64)
    np.cumsum(lens, out=indptr[1:])

    # Flatten all selected ids into one long index vector.
    flat_ids = np.concatenate(list_users)

    # Gather labels for all ids, then sum per candidate via segmented reduction.
    # reduceat is only correct here because no segment is empty (checked above).
    flat_vals = labels[flat_ids]
    inside_sum = np.add.reduceat(flat_vals, indptr[:-1])

    # Mean label inside each candidate and over the complementary objects.
    inside_mean = inside_sum / lens
    outside_mean = (tot_sum - inside_sum) / (n - lens)

    return inside_mean.astype(np.float32), outside_mean.astype(np.float32)


In [None]:
# Synthetic labels for experimentation: one independent 0/1 draw per object,
# with probability `positive_rate` of drawing a 1. Stored as int8 to keep the
# vector small.
n_objects = 100_000
positive_rate = 0.6
labels = np.random.binomial(1, positive_rate, n_objects).astype(np.int8)
# labels

In [None]:
# Load the candidate subsets of cells to be tested over a given grid.
# NOTE: pickle files can execute arbitrary code on load; only read trusted files.
path_candidates = './data_simulator/huge_dataset/gencand/candidates_100_0.pkl'
candidates = pd.read_pickle(path_candidates)

# Record how many users fall in each candidate before the id lists are dropped.
candidates['num_users'] = candidates['list_users'].map(len).astype(np.uint32)

# The original index holds, as tuples, the cells making up each candidate.
# It is not needed at this step, and discarding it saves a lot of memory.
candidates.reset_index(drop=True, inplace=True)
# candidates

# Positive rate inside and outside each candidate, computed for all candidates at once.
candidates["in_rate"], candidates["out_rate"] = batch_in_out_probs(
    labels,
    candidates["list_users"].to_numpy(),
)

# The per-candidate id lists are no longer needed; drop them and report the footprint.
candidates.drop(columns='list_users', inplace=True)
candidates.info(memory_usage="deep")
# candidates[450:470]

### DEBUG: Simple for-loop-based version of the computation of inside/outside positive rates; to be used for debugging purposes ###