In [9]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import os
import warnings
# Silence every warning category in this kernel ...
warnings.simplefilter('ignore')
# ... and export the same setting through the environment (PYTHONWARNINGS is
# read by Python interpreters at start-up, so child processes inherit it).
os.environ.update({"PYTHONWARNINGS": "ignore"})

In [11]:
# Make sure the kernel runs from the repository root ("fairness-variance");
# presumably needed so the project-local `source` / `configs` imports and
# relative paths resolve -- confirm against the repo layout.
#
# os.path.basename is used instead of splitting on '/': on Windows the cwd is
# backslash-separated, so the split never matched the folder name and the cell
# moved four levels up even when it was already at the root.
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "fairness-variance":
    # The notebook is expected to sit four directories below the repo root.
    os.chdir("../../../..")

print('Current location: ', os.getcwd())

Current location:  /Users/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/fairness-variance


In [12]:
import copy
from virny.datasets import StudentPerformancePortugueseDataset
from virny.preprocessing.basic_preprocessing import preprocess_dataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import BinaryLabelDataset

from source.preprocessing import get_simple_preprocessor, apply_lfr
from configs.constants import EXPERIMENT_SEEDS
from IPython.display import Markdown, display

In [13]:
# --- Experiment settings ---
# exp_iter_num is 1-based, hence the "- 1" when indexing EXPERIMENT_SEEDS.
exp_iter_num = 1
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
test_set_fraction = 0.25

# --- Fairness intervention settings ---
sensitive_attr_for_intervention = 'sex_binary'
unprivileged_groups = [{sensitive_attr_for_intervention: 0}]
privileged_groups = [{sensitive_attr_for_intervention: 1}]
intervention_options = {
    'k': 10,
    'Ax': 0.1,
    'Ay': 0.9,
    'Az': 2.0,
}

# --- Data ---
data_loader = StudentPerformancePortugueseDataset()
# Preview the first rows of the raw feature frame.
data_loader.X_data.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,reason,guardian,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,18,4,4,2,2,0,4,3,4,1,...,course,mother,yes,no,no,no,yes,yes,no,no
1,17,1,1,1,2,0,5,3,3,1,...,course,father,no,yes,no,no,no,yes,yes,no
2,15,1,1,1,2,0,4,3,2,2,...,other,mother,yes,no,no,no,yes,yes,yes,no
3,15,4,2,1,3,0,3,2,2,1,...,home,mother,no,yes,no,yes,yes,yes,yes,yes
4,16,3,3,1,2,0,4,3,2,1,...,home,father,no,yes,no,no,yes,yes,no,no


## Preprocessing

In [14]:
# Snapshot the loader before it is modified; its full_df is used below as the
# "initial features" frame of the resulting dataset.
init_data_loader = copy.deepcopy(data_loader)

# Swap the categorical 'sex' column for a numeric indicator (1 for 'M', 0 otherwise)
# stored under `sensitive_attr_for_intervention`, then remove 'sex' everywhere.
data_loader.categorical_columns = list(
    filter(lambda col: col != 'sex', data_loader.categorical_columns)
)
data_loader.X_data[sensitive_attr_for_intervention] = data_loader.X_data['sex'].apply(
    lambda sex_value: int(sex_value == 'M')
)
data_loader.full_df = data_loader.full_df.drop(columns=['sex'])
data_loader.X_data = data_loader.X_data.drop(columns=['sex'])

# Preprocess the dataset using the defined preprocessor
column_transformer = get_simple_preprocessor(data_loader)
base_flow_dataset = preprocess_dataset(data_loader, column_transformer, test_set_fraction, experiment_seed)
base_flow_dataset.init_features_df = init_data_loader.full_df.drop(
    columns=init_data_loader.target, errors='ignore'
)

# Write the sensitive attribute into both splits, looked up by each split's own
# index so the values stay aligned with data_loader.X_data.
for split_features in (base_flow_dataset.X_train_val, base_flow_dataset.X_test):
    split_features[sensitive_attr_for_intervention] = \
        data_loader.X_data.loc[split_features.index, sensitive_attr_for_intervention]

In [15]:
# Fair preprocessing: run the project's `apply_lfr` intervention on the baseline
# dataset (presumably Learning Fair Representations -- confirm in source.preprocessing).
processed_base_flow_dataset = apply_lfr(
    base_flow_dataset,
    sensitive_attribute=sensitive_attr_for_intervention,
    intervention_options=intervention_options,
)

step: 0, loss: 0.7781489614434901, L_x: 1.0183954863110314,  L_y: 0.7414509572034835,  L_z: 0.004501775664625898
step: 250, loss: 0.778148961731957, L_x: 1.0183954943422897,  L_y: 0.7414509570366035,  L_z: 0.0045017754823924545
step: 500, loss: 0.7781489045226891, L_x: 1.018395514727092,  L_y: 0.7414508930165289,  L_z: 0.0045017746675519955
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          580     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  7.78149D-01    |proj g|=  1.72107D-01
step: 750, loss: 0.6340894436050039, L_x: 1.01820749865161,  L_y: 0.5816106725136384,  L_z: 0.0044095442387841605
step: 1000, loss: 0.6340894069162193, L_x: 1.0182074755950312,  L_y: 0.5816106352057855,  L_z: 0.004409543835754594

At iterate    1    f=  6.34089D-01    |proj g|=  1.08246D-01
step: 1250, loss: 0.49504240329393906, L_x: 1.0158754662745966,  L_y: 0.4283572398428771,  L_z: 0.003966670403945005
step: 1500, loss: 0

## Check if the intervention was applied correctly

In [17]:
def _to_binary_label_dataset(labeled_df, target_name):
    """Wrap a features+label frame into an aif360 BinaryLabelDataset.

    Parameters
    ----------
    labeled_df : DataFrame holding the feature columns, the sensitive attribute
        column and the label column.
    target_name : name of the label column inside ``labeled_df``.

    Returns
    -------
    BinaryLabelDataset with ``sensitive_attr_for_intervention`` as the only
    protected attribute, 1 as the favorable label and 0 as the unfavorable one.
    """
    return BinaryLabelDataset(df=labeled_df,
                              label_names=[target_name],
                              protected_attribute_names=[sensitive_attr_for_intervention],
                              favorable_label=1,
                              unfavorable_label=0)


# Put the sensitive attribute (back) on the transformed splits, using the original
# values looked up by the baseline splits' indexes.
processed_base_flow_dataset.X_train_val[sensitive_attr_for_intervention] = \
    data_loader.X_data.loc[base_flow_dataset.X_train_val.index, sensitive_attr_for_intervention]
processed_base_flow_dataset.X_test[sensitive_attr_for_intervention] = \
    data_loader.X_data.loc[base_flow_dataset.X_test.index, sensitive_attr_for_intervention]

# Original
# Work on copies: assigning the target column to the dataset's own frame would
# leak the label into base_flow_dataset.X_train_val / X_test.
original_train_df = base_flow_dataset.X_train_val.copy()
original_train_df[base_flow_dataset.target] = base_flow_dataset.y_train_val
original_test_df = base_flow_dataset.X_test.copy()
original_test_df[base_flow_dataset.target] = base_flow_dataset.y_test

dataset_orig_train = _to_binary_label_dataset(original_train_df, base_flow_dataset.target)
dataset_orig_test = _to_binary_label_dataset(original_test_df, base_flow_dataset.target)

# Transformed
# Same reasoning: keep the label out of the transformed feature frames.
transf_train_df = processed_base_flow_dataset.X_train_val.copy()
transf_train_df[processed_base_flow_dataset.target] = processed_base_flow_dataset.y_train_val
transf_test_df = processed_base_flow_dataset.X_test.copy()
transf_test_df[processed_base_flow_dataset.target] = processed_base_flow_dataset.y_test

dataset_transf_train = _to_binary_label_dataset(transf_train_df, processed_base_flow_dataset.target)
dataset_transf_test = _to_binary_label_dataset(transf_test_df, processed_base_flow_dataset.target)

In [18]:
def _group_metric(dataset):
    """Build a BinaryLabelDatasetMetric for `dataset` with the notebook's group definitions."""
    return BinaryLabelDatasetMetric(
        dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )


# Training split: mean outcome difference after vs. before the intervention.
metric_orig_train = _group_metric(dataset_orig_train)
metric_transf_train = _group_metric(dataset_transf_train)
display(Markdown("#### Training dataset"))
print("Transformed: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_transf_train.mean_difference())
print("Original: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())

# Test split: same comparison.
metric_orig_test = _group_metric(dataset_orig_test)
metric_transf_test = _group_metric(dataset_transf_test)
display(Markdown("#### Test dataset"))
print("Transformed: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_transf_test.mean_difference())
print("Original: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_test.mean_difference())

#### Training dataset

Transformed: Difference in mean outcomes between unprivileged and privileged groups = 0.013156
Original: Difference in mean outcomes between unprivileged and privileged groups = 0.042717


#### Test dataset

Transformed: Difference in mean outcomes between unprivileged and privileged groups = 0.043478
Original: Difference in mean outcomes between unprivileged and privileged groups = 0.100370


In [19]:
display(Markdown("#### Individual fairness metrics"))

# (message template, metric object) pairs, printed in this order.
consistency_report = [
    ("Consistency of labels in transformed training dataset= %f", metric_transf_train),
    ("Consistency of labels in original training dataset= %f", metric_orig_train),
    ("Consistency of labels in transformed test dataset= %f", metric_transf_test),
    ("Consistency of labels in original test dataset= %f", metric_orig_test),
]
for message_template, dataset_metric in consistency_report:
    print(message_template % dataset_metric.consistency())

#### Individual fairness metrics

Consistency of labels in transformed training dataset= 0.987654
Consistency of labels in original training dataset= 0.888889
Consistency of labels in transformed test dataset= 0.991411
Consistency of labels in original test dataset= 0.857669


In [20]:
def check_algorithm_success():
    """Verify that the intervention improved label consistency on the test split.

    Reads the notebook-level `metric_transf_test` and `metric_orig_test` objects,
    raises AssertionError when the transformed consistency is not strictly
    greater than the original one, and prints 'Success!' otherwise.
    """
    transformed_consistency = metric_transf_test.consistency()
    original_consistency = metric_orig_test.consistency()
    assert transformed_consistency > original_consistency, \
        "Transformed dataset consistency should be greater than original dataset."

    print('Success!')


check_algorithm_success()

Success!
