# Auditing a Classifier for Fairness Based on Movement Patterns

In [None]:
import numpy as np
from src.bernoulli_spatial_scan import BernoulliSpatialScan

### Set up an object modeling the Bernoulli-based spatial scan statistic

In [None]:
# Location of the pickled, flattened candidates (plus auxiliary info).
path_dict_candidates = './data_simulator/huge_dataset/gencand/dict_flattened_candidates.pkl'

# The loader hands back three objects, which are later passed unchanged to the
# 'BernoulliSpatialScan' constructor.
# NOTE(review): the names suggest a CSR-like layout (flat ids + index pointers + per-candidate lengths) — confirm.
(
    flat_ids,
    indptr,
    lengths,
) = BernoulliSpatialScan.load_flattened_candidates(path_dict_candidates)

### Read the dataset with the "true" labels.
**TODO**: we are using dummy labels for now.

In [None]:
# Placeholder for the true labels of the objects: until the real dataset is read,
# draw one Bernoulli(positive_rate) label per object from a generator with a fixed
# seed, so every run produces the same dummy labels.
n_objects = 100000     # Number of objects to label.
positive_rate = 0.6    # Probability that an object gets label 1.
labels = (
    np.random.default_rng(seed=42)
    .binomial(1, positive_rate, size=n_objects)
    .astype(np.int8)
)

First, run the Monte Carlo simulations needed to approximate the distribution of the test statistic under the assumption that the null hypothesis is true. The test statistic is the maximum log-likelihood ratio computed across the regions of all the grids; the likelihood functions modeling $H_0$ and $H_1$ are the Bernoulli-based ones.

Once we have this distribution, determine whether the value of the test statistic computed from the "real" labels is extreme enough to reject the null hypothesis, i.e., whether it points to unfairness somewhere.

In [None]:
# Parameters of the test.
num_simulations = 500   # Monte Carlo simulations used to approximate the distribution of the test statistic.
alpha = 0.02            # Required significance level.

# Build the scan object from the simulation settings and the flattened candidates.
spatial_scan = BernoulliSpatialScan(num_simulations, alpha, flat_ids, indptr, lengths)

# Run the simulations with the parallel implementation. A sequential variant was
# previously called here with the same argument and the same four return values:
#   spatial_scan.sequential_simulations(labels)
# NOTE(review): presumably 'reject' is the test decision, 'vec_max_LR_sims' holds one maximum
# likelihood ratio per simulation, and 'dist_LR_labels' / 'max_LR_labels' are the per-candidate
# ratios and their maximum for the given labels — confirm against the class.
(
    reject,
    vec_max_LR_sims,
    dist_LR_labels,
    max_LR_labels,
) = spatial_scan.parallel_simulations(labels)

In [None]:
# Sort the simulated maxima once (ascending) and reuse the result below.
sorted_max_LR_sims = np.sort(vec_max_LR_sims)

# Number of simulated maxima that make up the upper 'alpha' tail. Clamp it to at least 1:
# if size * alpha truncates to 0, the index -0 equals 0, so the slice would return the whole
# array and the threshold would become the *smallest* simulated value.
num_extreme = max(1, int(vec_max_LR_sims.size * alpha))

# DEBUG: print the returned 'reject' value, the upper-tail simulated maxima (largest first),
# and the returned 'max_LR_labels' value.
print(reject, np.flip(sorted_max_LR_sims[-num_extreme:]), max_LR_labels)

# Find the indexes of the entries of 'dist_LR_labels' (computed with the original labels) that
# reach the simulations' threshold, i.e., the ones that should be considered 'extreme' and for
# which the null hypothesis must be rejected.
threshold_value = sorted_max_LR_sims[-num_extreme]
print(threshold_value)
pos_extreme_candidates = np.argwhere(dist_LR_labels >= threshold_value).tolist()
print(pos_extreme_candidates)