In [1]:
pip install aif360

Note: you may need to restart the kernel to use updated packages.


In [2]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
import numpy as np
from tqdm import tqdm
from warnings import warn

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset
from aif360.metrics import ClassificationMetric, BinaryLabelDatasetMetric
from aif360.metrics.utils import compute_boolean_conditioning_vector
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions\
        import load_preproc_data_adult, load_preproc_data_german, load_preproc_data_compas
from aif360.algorithms.postprocessing.reject_option_classification\
        import RejectOptionClassification
from common_utils_ import compute_metrics

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from ipywidgets import interactive, FloatSlider
from tabulate import tabulate
from xgboost import XGBClassifier

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'


Load Dataset and specify options

In [3]:
protected_attribute_used = 0
dataset_orig = GermanDataset()
if protected_attribute_used == 1:
    privileged_groups = [{'sex': 1}]
    unprivileged_groups = [{'sex': 0}]
    dataset_orig = load_preproc_data_german(['sex'])
else:
    privileged_groups = [{'age': 1}]
    unprivileged_groups = [{'age': 0}]
    dataset_orig = load_preproc_data_german(['age'])
    
        
# Metric used (should be one of allowed_metrics)
metric_name = "Statistical parity difference"

# Upper and lower bound on the fairness metric used
metric_ub = 0.05
metric_lb = -0.05
        
#random seed for calibrated equal odds prediction
np.random.seed(1)

# Verify metric name
allowed_metrics = ["Statistical parity difference",
                   "Average odds difference",
                   "Equal opportunity difference"]
if metric_name not in allowed_metrics:
    raise ValueError("Metric name should be one of allowed metrics")

Split into train, test and validation

In [4]:
# Get the dataset and split into train and test
dataset_orig_train, dataset_orig_vt = dataset_orig.split([0.7], shuffle=True)
dataset_orig_valid, dataset_orig_test = dataset_orig_vt.split([0.5], shuffle=True)

Clean up training data and display properties of the data

In [5]:
# print out some labels, names, etc.
display(Markdown("#### Training Dataset shape"))
print(dataset_orig_train.features.shape)
display(Markdown("#### Favorable and unfavorable labels"))
print(dataset_orig_train.favorable_label, dataset_orig_train.unfavorable_label)
display(Markdown("#### Protected attribute names"))
print(dataset_orig_train.protected_attribute_names)
display(Markdown("#### Privileged and unprivileged protected attribute values"))
print(dataset_orig_train.privileged_protected_attributes, 
      dataset_orig_train.unprivileged_protected_attributes)
display(Markdown("#### Dataset feature names"))
print(dataset_orig_train.feature_names)

#### Training Dataset shape

(700, 11)


#### Favorable and unfavorable labels

1.0 2.0


#### Protected attribute names

['age']


#### Privileged and unprivileged protected attribute values

[array([1.])] [array([0.])]


#### Dataset feature names

['age', 'sex', 'credit_history=Delay', 'credit_history=None/Paid', 'credit_history=Other', 'savings=500+', 'savings=<500', 'savings=Unknown/None', 'employment=1-4 years', 'employment=4+ years', 'employment=Unemployed']


## Metric for original training data

In [6]:
metric_orig_train = BinaryLabelDatasetMetric(dataset_orig_train, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
display(Markdown("#### Original training dataset"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())

#### Original training dataset

Difference in mean outcomes between unprivileged and privileged groups = -0.119044


In [7]:
# Select the model
model_used = "XGB"  # SVC, RF

## Train classifier on original data

In [8]:
# Logistic regression classifier and predictions
# scale_orig = StandardScaler()
X_train = dataset_orig_train.features
y_train = dataset_orig_train.labels.ravel()
for i, j in enumerate(y_train):
    if j == 2.0:
        y_train[i] = 1.0
    else:
        y_train[i] = 0.0

if model_used == "XGB":
    clf_model = XGBClassifier()
    clf_model.fit(X_train, y_train)

    

# positive class index
# ==================================================
if dataset_orig_train.favorable_label == 1.0:
    dataset_orig_train_num = 0.
else:
    dataset_orig_train_num = 1.0
# positive class index 
pos_ind = np.where(clf_model.classes_ == dataset_orig_train_num)[0][0]

dataset_orig_train_pred = dataset_orig_train.copy(deepcopy=True)
y_train_pred = dataset_orig_train_pred.labels

## Obtain scores for validation and test sets

In [9]:
if model_used == "XGB":
    dataset_orig_valid_pred = dataset_orig_valid.copy(deepcopy=True)
    # X_valid = scale_orig.transform(dataset_orig_valid_pred.features)
    X_valid = dataset_orig_valid_pred.features
    y_valid = dataset_orig_valid_pred.labels
    # predict on vaidation set
    dataset_orig_valid_pred.scores = clf_model.predict_proba(X_valid)[:,pos_ind].reshape(-1,1)
    
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    # X_test = scale_orig.transform(dataset_orig_test_pred.features)
    X_test = dataset_orig_test_pred.features
    y_test = dataset_orig_test_pred.labels
    # predict on test set
    dataset_orig_test_pred.scores = clf_model.predict_proba(X_test)[:,pos_ind].reshape(-1,1)

## Find the optimal parameters from the validation set
### Best threshold for classification only (no fairness)

In [10]:
num_thresh = 100
ba_arr = np.zeros(num_thresh)
class_thresh_arr = np.linspace(0.01, 0.99, num_thresh)
for idx, class_thresh in enumerate(class_thresh_arr):
    
    fav_inds = dataset_orig_valid_pred.scores > class_thresh
    dataset_orig_valid_pred.labels[fav_inds] = dataset_orig_valid_pred.favorable_label
    dataset_orig_valid_pred.labels[~fav_inds] = dataset_orig_valid_pred.unfavorable_label
    
    classified_metric_orig_valid = ClassificationMetric(dataset_orig_valid,
                                             dataset_orig_valid_pred, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
    
    ba_arr[idx] = 0.5*(classified_metric_orig_valid.true_positive_rate()\
                       +classified_metric_orig_valid.true_negative_rate())

best_ind = np.where(ba_arr == np.max(ba_arr))[0][0]
best_class_thresh = class_thresh_arr[best_ind]

print("Best balanced accuracy (no fairness constraints) = %.4f" % np.max(ba_arr))
print("Optimal classification threshold (no fairness constraints) = %.4f" % best_class_thresh)

Best balanced accuracy (no fairness constraints) = 0.6761
Optimal classification threshold (no fairness constraints) = 0.6732


## Estimate optimal parameters for the ROC method

In [11]:
ROC = RejectOptionClassification(unprivileged_groups=unprivileged_groups, 
                                 privileged_groups=privileged_groups, 
                                 low_class_thresh=0.01, high_class_thresh=0.99,
                                  num_class_thresh=100, num_ROC_margin=50,
                                  metric_name=metric_name,
                                  metric_ub=metric_ub, metric_lb=metric_lb)
ROC = ROC.fit(dataset_orig_valid, dataset_orig_valid_pred)

In [12]:
print("Optimal classification threshold (with fairness constraints) = %.4f" % ROC.classification_threshold)
print("Optimal ROC margin = %.4f" % ROC.ROC_margin)

Optimal classification threshold (with fairness constraints) = 0.5247
Optimal ROC margin = 0.1261


## Predictions from Validation Set

In [13]:
fav_inds = dataset_orig_valid_pred.scores > best_class_thresh
dataset_orig_valid_pred.labels[fav_inds] = dataset_orig_valid_pred.favorable_label
dataset_orig_valid_pred.labels[~fav_inds] = dataset_orig_valid_pred.unfavorable_label

display(Markdown("#### Validation set"))
display(Markdown("##### Raw predictions - No fairness constraints, only maximizing balanced accuracy"))

metric_valid_bef = compute_metrics(dataset_orig_valid, dataset_orig_valid_pred, 
                unprivileged_groups, privileged_groups)

#### Validation set

##### Raw predictions - No fairness constraints, only maximizing balanced accuracy

Balanced accuracy = 0.6761
Statistical parity difference = -0.2240
Precision = 0.8906
Recall = 0.5229
F1 = 0.6590
Disparate impact = 0.5172
Average odds difference = -0.0947
Equal opportunity difference = -0.0253
Theil index = 0.4491


In [14]:
# Transform the validation set
dataset_transf_valid_pred = ROC.predict(dataset_orig_valid_pred)

display(Markdown("#### Validation set"))
display(Markdown("##### Transformed predictions - With fairness constraints"))
metric_valid_aft = compute_metrics(dataset_orig_valid, dataset_transf_valid_pred, 
                unprivileged_groups, privileged_groups)

#### Validation set

##### Transformed predictions - With fairness constraints

Balanced accuracy = 0.6457
Statistical parity difference = 0.0480
Precision = 0.8472
Recall = 0.5596
F1 = 0.6740
Disparate impact = 1.1017
Average odds difference = 0.1836
Equal opportunity difference = 0.2646
Theil index = 0.4182


### Predictions from Test Set

In [15]:
# Metrics for the test set
fav_inds = dataset_orig_test_pred.scores > best_class_thresh
dataset_orig_test_pred.labels[fav_inds] = dataset_orig_test_pred.favorable_label
dataset_orig_test_pred.labels[~fav_inds] = dataset_orig_test_pred.unfavorable_label

display(Markdown("#### Test set"))
display(Markdown("##### Raw predictions - No fairness constraints, only maximizing balanced accuracy"))

metric_test_bef = compute_metrics(dataset_orig_test, dataset_orig_test_pred, 
                unprivileged_groups, privileged_groups)

#### Test set

##### Raw predictions - No fairness constraints, only maximizing balanced accuracy

Balanced accuracy = 0.6238
Statistical parity difference = -0.2278
Precision = 0.8026
Recall = 0.5810
F1 = 0.6740
Disparate impact = 0.5853
Average odds difference = -0.2016
Equal opportunity difference = -0.2318
Theil index = 0.3867


In [16]:
# Metrics for the transformed test set
dataset_transf_test_pred = ROC.predict(dataset_orig_test_pred)

display(Markdown("#### Test set"))
display(Markdown("##### Transformed predictions - With fairness constraints"))
metric_test_aft = compute_metrics(dataset_orig_test, dataset_transf_test_pred, 
                unprivileged_groups, privileged_groups)

#### Test set

##### Transformed predictions - With fairness constraints

Balanced accuracy = 0.6190
Statistical parity difference = 0.0937
Precision = 0.7882
Recall = 0.6381
F1 = 0.7053
Disparate impact = 1.1706
Average odds difference = 0.1151
Equal opportunity difference = 0.1015
Theil index = 0.3350


In [17]:
display(Markdown("## Summary of Optimal Parameters"))
display(Markdown("### We show the optimal parameters for all combinations of metrics optimized, dataset, and protected attributes below"))
display(Markdown("#### Fairness Metric: Statistical parity difference, Accuracy Metric: Balanced accuracy"))
performance_table = {
    'Dataset': ["German(Valid)", "German(Test)"],
    'Age(Acc-Bef)': [metric_valid_bef["Balanced accuracy"],metric_test_bef["Balanced accuracy"]],
    'Age(Acc-Aft)' : [metric_valid_aft["Balanced accuracy"],metric_test_aft["Balanced accuracy"]],
    'Age(Fair-Bef)' : [metric_valid_bef["Average odds difference"], metric_test_bef["Average odds difference"]],
    'Age(Fair-Aft)' : [metric_valid_aft["Average odds difference"],metric_test_aft["Average odds difference"]]
    
}

#print(f'\nModel used: {model_used}')
display(Markdown("#### Performance metrics"))
print(tabulate(performance_table, headers='keys', tablefmt='fancy_grid', floatfmt=".4f"))
display(Markdown("#### Fairness Metric: Average odds difference, Accuracy Metric: Balanced accuracy"))
performance_table = {
    'Dataset': ["German(Valid)", "German(Test)"],
    'Age(Acc-Bef)': [metric_valid_bef["Balanced accuracy"],metric_test_bef["Balanced accuracy"]],
    'Age(Acc-Aft)' : [metric_valid_aft["Balanced accuracy"],metric_test_aft["Balanced accuracy"]],
    'Age(Fair-Bef)' : [metric_valid_bef["Statistical parity difference"], metric_test_bef["Statistical parity difference"]],
    'Age(Fair-Aft)' : [metric_valid_aft["Statistical parity difference"],metric_test_aft["Statistical parity difference"]]
    
}
#print(f'\nModel used: {model_used}')
display(Markdown("#### Performance metrics"))
print(tabulate(performance_table, headers='keys', tablefmt='fancy_grid', floatfmt=".4f"))

display(Markdown("#### Fairness Metric: Equal opportunity difference, Accuracy Metric: Balanced accuracy"))
performance_table = {
    'Dataset': ["German(Valid)", "German(Test)"],
    'Age(Acc-Bef)': [metric_valid_bef["Balanced accuracy"],metric_test_bef["Balanced accuracy"]],
    'Age(Acc-Aft)' : [metric_valid_aft["Balanced accuracy"],metric_test_aft["Balanced accuracy"]],
    'Age(Fair-Bef)' : [metric_valid_bef["Equal opportunity difference"], metric_test_bef["Equal opportunity difference"]],
    'Age(Fair-Aft)' : [metric_valid_aft["Equal opportunity difference"],metric_test_aft["Equal opportunity difference"]]
    
}
print(f'\nModel used: {model_used}')
display(Markdown("#### Performance metrics"))
print(tabulate(performance_table, headers='keys', tablefmt='fancy_grid', floatfmt=".4f"))

optimal_table = {
    'Dataset' : ["German"],
    'Age (Class. thresh.)' : [best_class_thresh],
    'Age (Class. thresh. - fairness)': [ROC.classification_threshold],
    'Age (ROC margin - fairness)' : [ROC.ROC_margin]
}

display(Markdown("#### Optimal Parameters"))
print(tabulate(optimal_table, headers='keys', tablefmt='fancy_grid', floatfmt=".4f"))

performance_table = {
    'Reweighting': ["Before", "After"],
    'Accuracy': [metric_valid_bef["Balanced accuracy"], metric_test_aft["Balanced accuracy"]],
    'Precision' : [metric_valid_bef["Precision"], metric_test_aft["Precision"]],
    'Recall' : [metric_valid_bef["Recall"], metric_test_aft["Recall"]],
    'F1' : [metric_valid_bef["F1"], metric_test_aft["F1"]],
}

display(Markdown("#### Performance metrics"))
print(tabulate(performance_table, headers='keys', tablefmt='fancy_grid', floatfmt=".4f"))


## Summary of Optimal Parameters

### We show the optimal parameters for all combinations of metrics optimized, dataset, and protected attributes below

#### Fairness Metric: Statistical parity difference, Accuracy Metric: Balanced accuracy

#### Performance metrics

╒═══════════════╤════════════════╤════════════════╤═════════════════╤═════════════════╕
│ Dataset       │   Age(Acc-Bef) │   Age(Acc-Aft) │   Age(Fair-Bef) │   Age(Fair-Aft) │
╞═══════════════╪════════════════╪════════════════╪═════════════════╪═════════════════╡
│ German(Valid) │         0.6761 │         0.6457 │         -0.0947 │          0.1836 │
├───────────────┼────────────────┼────────────────┼─────────────────┼─────────────────┤
│ German(Test)  │         0.6238 │         0.6190 │         -0.2016 │          0.1151 │
╘═══════════════╧════════════════╧════════════════╧═════════════════╧═════════════════╛


#### Fairness Metric: Average odds difference, Accuracy Metric: Balanced accuracy

#### Performance metrics

╒═══════════════╤════════════════╤════════════════╤═════════════════╤═════════════════╕
│ Dataset       │   Age(Acc-Bef) │   Age(Acc-Aft) │   Age(Fair-Bef) │   Age(Fair-Aft) │
╞═══════════════╪════════════════╪════════════════╪═════════════════╪═════════════════╡
│ German(Valid) │         0.6761 │         0.6457 │         -0.2240 │          0.0480 │
├───────────────┼────────────────┼────────────────┼─────────────────┼─────────────────┤
│ German(Test)  │         0.6238 │         0.6190 │         -0.2278 │          0.0937 │
╘═══════════════╧════════════════╧════════════════╧═════════════════╧═════════════════╛


#### Fairness Metric: Equal opportunity difference, Accuracy Metric: Balanced accuracy


Model used: XGB


#### Performance metrics

╒═══════════════╤════════════════╤════════════════╤═════════════════╤═════════════════╕
│ Dataset       │   Age(Acc-Bef) │   Age(Acc-Aft) │   Age(Fair-Bef) │   Age(Fair-Aft) │
╞═══════════════╪════════════════╪════════════════╪═════════════════╪═════════════════╡
│ German(Valid) │         0.6761 │         0.6457 │         -0.0253 │          0.2646 │
├───────────────┼────────────────┼────────────────┼─────────────────┼─────────────────┤
│ German(Test)  │         0.6238 │         0.6190 │         -0.2318 │          0.1015 │
╘═══════════════╧════════════════╧════════════════╧═════════════════╧═════════════════╛


#### Optimal Parameters

╒═══════════╤════════════════════════╤═══════════════════════════════════╤═══════════════════════════════╕
│ Dataset   │   Age (Class. thresh.) │   Age (Class. thresh. - fairness) │   Age (ROC margin - fairness) │
╞═══════════╪════════════════════════╪═══════════════════════════════════╪═══════════════════════════════╡
│ German    │                 0.6732 │                            0.5247 │                        0.1261 │
╘═══════════╧════════════════════════╧═══════════════════════════════════╧═══════════════════════════════╛


#### Performance metrics

╒═══════════════╤════════════╤═════════════╤══════════╤════════╕
│ Reweighting   │   Accuracy │   Precision │   Recall │     F1 │
╞═══════════════╪════════════╪═════════════╪══════════╪════════╡
│ Before        │     0.6761 │      0.8906 │   0.5229 │ 0.6590 │
├───────────────┼────────────┼─────────────┼──────────┼────────┤
│ After         │     0.6190 │      0.7882 │   0.6381 │ 0.7053 │
╘═══════════════╧════════════╧═════════════╧══════════╧════════╛
