In [3]:
import pandas as pd
import numpy as np

import pickle
import re

# DL Stack
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [4]:
%load_ext autoreload
%autoreload 2
from main.models.classification_layers import ClassifierLayer
from main.utils.loaders import FaceDataSetPrep
from main.fairness.quantile_transport import Calibrator

Will need:
* Split ids
* Data Loader
* Model Arch
* Model weights
* Calibrator

In [5]:
import pickle
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import metrics
from main.utils.loaders import FaceDataSetPrep
from main.fairness.quantile_transport import Calibrator

def calculate_metrics(model, 
                      split_location, 
                      prediction_feature,
                      sensitive_feature,
                      awareness,
                      data_location='./data/training/label_loader.csv', 
                      batch_size=256,
                      device='cpu'):
    
    # Set up device, load data
    device_ = device
    return_dict = {}

    with open(split_location, 'rb') as con_:
        splits_dict = pickle.load(con_)

    data_labels = pd.read_csv(data_location)

    data_calibration = (data_labels
                        .loc[data_labels.img_id
                             .isin(splits_dict['calib'])])
    
    data_test = (data_labels
                 .loc[data_labels.img_id
                      .isin(splits_dict['calib'])])
    
    idx_sensitive = int(np.where(data_labels.columns == sensitive_feature)[0])
    idx_prediction = int(np.where(data_labels.columns == prediction_feature)[0])


    # Prepare dataloaders
    data_set_calib = FaceDataSetPrep(data_calibration, 
                                     label_idx=idx_prediction, 
                                     sens_idx=idx_sensitive)

    data_set_test = FaceDataSetPrep(data_test, 
                                    label_idx=idx_prediction, 
                                    sens_idx=idx_sensitive)

    calib_loader = DataLoader(data_set_calib, 
                              batch_size=batch_size, 
                              shuffle=False)

    test_loader = DataLoader(data_set_test, 
                             batch_size=batch_size, 
                             shuffle=False)
    
    # Run predictions on calibration set
    calib_predictions = []
    calib_labels = []
    calib_sens = []

    for i, (inputs, labels, input_fair) in enumerate(calib_loader):
        inputs, labels, inputs_fair = inputs.to(device_), labels.to(device_), input_fair.to(device_)

        if awareness:    
            with torch.no_grad():
                outputs = model(inputs.squeeze(1), inputs_fair)
        else:
            with torch.no_grad():
                outputs = model(inputs.squeeze(1))

        if device_ != 'cpu':
            out_ = torch.sigmoid(outputs.squeeze()).cpu().detach().numpy()
            labs_ = labels.cpu().detach().numpy()
            labs_sensitive = inputs_fair.cpu().detach().numpy()
        else:
            out_ = torch.sigmoid(outputs.squeeze()).detach().numpy()
            labs_ = labels.detach().numpy()
            labs_sensitive = inputs_fair.detach().numpy()

        calib_predictions.append(out_)
        calib_labels.append(labs_)
        calib_sens.append(labs_sensitive)

    preds_calib = np.concatenate(calib_predictions)
    labs_calib = np.concatenate(calib_labels)
    sensitive_calib = np.concatenate(calib_sens)

    return_dict['calib'] = {}
    return_dict['calib']['preds'] = preds_calib
    return_dict['calib']['labs'] = labs_calib
    return_dict['calib']['sensitive'] = sensitive_calib

    fpr, tpr, _ = metrics.roc_curve(labs_calib,
                                    preds_calib,
                                    pos_label=1)

    auc_calib = metrics.auc(fpr, tpr)
    print(f'AUC Calibration: {auc_calib}')

    return_dict['metrics'] = {}
    return_dict['metrics']['auc_calibration'] = auc_calib

    # Fit Calibrator
    calibrator_ = Calibrator()
    calibrator_.fit(preds_calib,
                    sensitive_calib)
    
    ## Run Test predictions
    test_preds = []
    test_labs = []
    test_sensitive = []

    for i, (inputs, labels, input_fair) in enumerate(test_loader):
        inputs, labels, inputs_fair = inputs.to(device_), labels.to(device_), input_fair.to(device_)

        if awareness:
            with torch.no_grad():
                outputs = model(inputs.squeeze(1), inputs_fair)
        else:
            with torch.no_grad():
                outputs = model(inputs.squeeze(1))

        if device_ != 'cpu':
            out_ = torch.sigmoid(outputs.squeeze()).cpu().detach().numpy()
            labs_ = labels.cpu().detach().numpy()
            labs_sensitive = inputs_fair.cpu().detach().numpy()
        else:
            out_ = torch.sigmoid(outputs.squeeze()).detach().numpy()
            labs_ = labels.detach().numpy()
            labs_sensitive = inputs_fair.detach().numpy()

        test_preds.append(out_)
        test_labs.append(labs_)
        test_sensitive.append(labs_sensitive)

    preds_test = np.concatenate(test_preds)
    labs_test = np.concatenate(test_labs)
    sensitive_test = np.concatenate(test_sensitive)

    return_dict['test'] = {}
    return_dict['test']['preds'] = preds_test
    return_dict['test']['labs'] = labs_test
    return_dict['test']['sensitive'] = sensitive_test

    fpr, tpr, _ = metrics.roc_curve(labs_test,
                                    preds_test,
                                    pos_label=1)

    auc_test_unfair = metrics.auc(fpr, tpr)
    print(f'AUC-Test, unfair: {auc_test_unfair}')

    return_dict['metrics']['auc_test_unfair'] = auc_test_unfair

    scores_test_fair = calibrator_.transform(preds_test,
                                             sensitive_test)
    
    return_dict['test']['preds_fair'] = scores_test_fair

    fpr, tpr, _ = metrics.roc_curve(labs_test,
                                    scores_test_fair,
                                    pos_label=1)

    auc_test_fair = metrics.auc(fpr, tpr)
    print(f'AUC-Test, fair: {auc_test_fair}')
    return_dict['metrics']['auc_test_fair'] = auc_test_fair

    # Calculate unfairness
    unfairness_test = stats.ks_2samp(preds_test[sensitive_test==1],
                                     preds_test[sensitive_test==0],
                                     alternative='two-sided')[0]
    unfairness_corrected_test = stats.ks_2samp(scores_test_fair[sensitive_test==1],
                                               scores_test_fair[sensitive_test==0],
                                               alternative='two-sided')[0]
    
    return_dict['metrics']['fairness_uncorrected'] = unfairness_test
    return_dict['metrics']['fairness_corrected'] = unfairness_corrected_test

    return return_dict


In [19]:
split_string = './results_cluster/splits_seed_42.pkl'
model_pred_ = ClassifierLayer([512,256,64])
model_pred_.load_state_dict(torch.load('./data/results/models/aware/models/model_simple_42_beard.pt'))


<All keys matched successfully>

In [21]:
metrics_dict = calculate_metrics(model=model_pred_, 
                                 split_location=split_string, 
                                 prediction_feature='No_Beard', 
                                 sensitive_feature='Male', 
                                 awareness=True)

AUC Calibration: 0.9406463730030297
AUC-Test, unfair: 0.9408697138344602
AUC-Test, fair: 0.7653758492799391


In [11]:
metrics_dict['metrics']

{'auc_calibration': 0.8574212915358411,
 'auc_test_unfair': 0.8587327781750448,
 'auc_test_fair': 0.81096720518655,
 'fairness_uncorrected': 0.31564992330775826,
 'fairness_corrected': 0.005564680794877486}

# Run on all results 

In [52]:
auc_unfair_list = []
auc_fair_list = []
unfairness_unfair = []
unfairness_fair = []

want_string = 'beard'

for mod_string in os.listdir('./data/results/models/aware/models/'):
    if want_string in mod_string:
        seed_string = re.findall('\d+', mod_string)[0]

        # Load up 
        split_string = f'./data/results/split_idx/splits_seed_{seed_string}.pkl'
        model_pred_ = ClassifierLayer([512,256,64])
        model_pred_.load_state_dict(torch.load(f'./data/results/models/aware/models/model_simple_{seed_string}_{want_string}.pt'))
        model_pred_.train()

        metrics_dict = calculate_metrics(model=model_pred_, 
                                        split_location=split_string, 
                                        prediction_feature='No_Beard', 
                                        sensitive_feature='Male', 
                                        awareness=True)
        
        auc_fair_list.append(metrics_dict['metrics']['auc_test_fair'])
        auc_unfair_list.append(metrics_dict['metrics']['auc_test_unfair'])
        unfairness_unfair.append(metrics_dict['metrics']['fairness_uncorrected'])
        unfairness_fair.append(metrics_dict['metrics']['fairness_corrected'])
                        


AUC Calibration: 0.9390738251599217
AUC-Test, unfair: 0.9396422233470364
AUC-Test, fair: 0.7404997008893719
AUC Calibration: 0.9406073453648611
AUC-Test, unfair: 0.9403533816810408
AUC-Test, fair: 0.7596128883970965
AUC Calibration: 0.9412984893914752
AUC-Test, unfair: 0.9409192880342978
AUC-Test, fair: 0.7730653371270315
AUC Calibration: 0.944030299456841
AUC-Test, unfair: 0.9440343911590161
AUC-Test, fair: 0.767317047581947
AUC Calibration: 0.9411418497725421
AUC-Test, unfair: 0.941767944127714
AUC-Test, fair: 0.7732084834827461
AUC Calibration: 0.9404055173299612
AUC-Test, unfair: 0.9399710641355836
AUC-Test, fair: 0.7648107168719924
AUC Calibration: 0.9380435837430515
AUC-Test, unfair: 0.9373770997423702
AUC-Test, fair: 0.7756606011464631
AUC Calibration: 0.9451088679631046
AUC-Test, unfair: 0.9450098720743406
AUC-Test, fair: 0.8070062590630411
AUC Calibration: 0.9414216502721509
AUC-Test, unfair: 0.9420197033902054
AUC-Test, fair: 0.7541828328051344
AUC Calibration: 0.944171858199

In [54]:
fair_fair = np.round(np.array(unfairness_fair).mean(),3)
fair_fair_std = np.round(np.array(unfairness_fair).std(),3)

fair_unfair = np.round(np.array(unfairness_unfair).mean(),3)
fair_unfair_std = np.round(np.array(unfairness_unfair).std(),3)

auc_fair = np.round(np.array(auc_fair_list).mean(),3)
auc_fair_std = np.round(np.array(auc_fair_list).std(),3)

auc_unfair = np.round(np.array(auc_unfair_list).mean(),3)
auc_unfair_std = np.round(np.array(auc_unfair_list).std(),3)

print(f'& {auc_unfair} & {fair_unfair} & {auc_fair} & {fair_fair} \\\\')
print(f'& $\pm${auc_unfair_std} & $\pm$ {fair_unfair_std} & $\pm$ {auc_fair_std} & $\pm$ {fair_fair_std} \\\\')

& 0.941 & 0.896 & 0.771 & 0.531 \\
& $\pm$0.002 & $\pm$ 0.01 & $\pm$ 0.018 & $\pm$ 0.096 \\
