In [2]:
# !pip install cvxpy

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
# Silence every warning category for the rest of this kernel session so the
# notebook output stays readable. NOTE(review): this also hides deprecation
# and convergence warnings that may matter when debugging.
warnings.filterwarnings('ignore')
# PYTHONWARNINGS is read by a Python interpreter at start-up, so this setting
# targets interpreters launched later from this process (which inherit the
# environment), not the already-running kernel.
os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
# Run the notebook from the repository root. The notebook is assumed to live
# two directory levels below the root (hence "../.."); presumably this is what
# lets `source.*` imports and the relative data paths used below resolve.
# os.path.basename handles the platform's path separator, unlike splitting on
# '/', which never matches on Windows and would chdir on every execution.
# The name check also keeps the cell idempotent: re-running it after the first
# chdir does not climb further up the tree.
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "fairness-variance":
    os.chdir("../..")

print('Current location: ', os.getcwd())

Current location:  /home/dh3553/projects/fairness-variance


In [4]:
import numpy as np
import pandas as pd
import folktables
from folktables import ACSDataSource

from aif360.datasets import StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric

from source.preprocessing import get_distortion_acs_income

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'


In [None]:
class ACSIncomeFolktablesDataset(StandardDataset):
    """ACS income task from folktables, exposed as an AIF360 ``StandardDataset``.

    The label is ``PINCP > 50000`` (``True`` is the favorable class). Exactly one
    of ``SEX`` / ``RAC1P`` is registered as the protected attribute, with the
    value ``1.`` as its privileged class. Both columns are binarised before the
    frame is handed to AIF360:

    * ``SEX``:   1 -> 1., 2 -> 0. (any other value is left untouched)
    * ``RAC1P``: 1 -> 1., every other value -> 0.

    NOTE(review): in the ACS PUMS coding SEX=1 is presumably "male" and RAC1P=1
    "White alone" -- confirm against the PUMS data dictionary.

    ``COW``, ``MAR`` and ``SCHL`` are passed to AIF360 as categorical features.

    Args:
        protected_attr_name: ``'SEX'`` or ``'RAC1P'``; the column used as the
            protected attribute (and as the folktables ``group``).
        root_dir: Directory handed to ``ACSDataSource`` for the ACS files.
        survey_year: ACS survey year handed to ``ACSDataSource``.
        states: State codes handed to ``ACSDataSource.get_data``. The default
            list is never mutated by this class.
        subsample_size: Number of raw ACS rows to draw (without replacement)
            before the problem is formulated. Clamped to the number of rows
            available. ``None`` keeps every row.
        subsample_seed: ``random_state`` for the subsample; ``None`` makes the
            draw non-deterministic.

    Raises:
        AssertionError: If ``protected_attr_name`` is not ``'SEX'`` or ``'RAC1P'``.

    Note:
        The subsample is drawn from the raw rows, before
        ``formulate_problem().df_to_pandas`` runs; if the problem's ``preprocess``
        filter drops rows, the resulting dataset holds fewer than
        ``subsample_size`` instances.
    """

    def __init__(self, protected_attr_name='SEX', root_dir="data/folktables/",
                 survey_year='2018', states=["PA"], subsample_size=500, subsample_seed=42):
        assert protected_attr_name in ['SEX', 'RAC1P'], \
            "protected_attr_name must be 'SEX' or 'RAC1P'"
        # Stored before anything else: formulate_problem() reads it to choose
        # the folktables group column.
        self.protected_attr_name = protected_attr_name

        data_source = ACSDataSource(survey_year=survey_year, horizon='1-Year', survey='person', root_dir=root_dir)
        acs_data = data_source.get_data(states=states, download=True)

        # Subsample. DataFrame.sample raises when asked for more rows than the
        # frame holds (replace=False), so the request is clamped.
        if subsample_size is not None:
            n_rows = min(subsample_size, len(acs_data))
            acs_data = acs_data.sample(n_rows, random_state=subsample_seed)
        acs_data = acs_data.reset_index(drop=True)

        features, labels, _ = self.formulate_problem().df_to_pandas(acs_data)

        df = pd.concat([features, labels], axis=1)
        # Release the intermediate frames before AIF360 builds its own copies.
        del features, labels

        # Binarise both candidate protected attributes so that 1. always marks
        # the privileged class passed to StandardDataset below.
        df['SEX'] = df['SEX'].replace({1.: 1., 2.: 0.})
        df['RAC1P'] = (df['RAC1P'] == 1.).astype(float)

        super().__init__(df=df, label_name='PINCP', favorable_classes=[True],
                         protected_attribute_names=[protected_attr_name],
                         privileged_classes=[[1.]],
                         categorical_features=['COW', 'MAR', 'SCHL'])

    def formulate_problem(self):
        """Build the folktables ``BasicProblem`` for the income task.

        Uses ``self.protected_attr_name`` as the problem's ``group``, so that
        attribute must be set before this method is called.

        Returns:
            folktables.BasicProblem: Problem with the feature list below, target
            ``PINCP`` thresholded at 50000, ``folktables.adult_filter`` as
            preprocessing, and NaN feature values replaced by -1.
        """
        problem = folktables.BasicProblem(
            features=[
                'AGEP',
                'COW',    # class of worker
                'SCHL',   # education level
                'MAR',    # marital status
                # 'OCCP', # occupation
                # 'POBP', # place of birth
                # 'RELP', # relationship
                'WKHP',
                'SEX',
                'RAC1P',
            ],
            target='PINCP',
            target_transform=lambda x: x > 50000,
            group=self.protected_attr_name,
            preprocess=folktables.adult_filter,
            # `nan` must be given by keyword: the second positional parameter of
            # np.nan_to_num is `copy`, so `np.nan_to_num(x, -1)` filled NaN with
            # 0.0 instead of the intended -1.
            postprocess=lambda x: np.nan_to_num(x, nan=-1),
        )
        return problem

In [None]:
privilege_mode = 'SEX'
dd = ACSIncomeFolktablesDataset(protected_attr_name=privilege_mode)

# Seed both shuffled splits: without a seed the train/val/test partition, and
# therefore every metric printed below, changes on each run even though the
# subsample itself is drawn with a fixed seed.
SPLIT_SEED = 42
train_dataset, val_dataset = dd.split([0.65], shuffle=True, seed=SPLIT_SEED)
# Second split carves the remaining 35% into validation and test parts.
val_dataset, test_dataset = val_dataset.split([0.43], shuffle=True, seed=SPLIT_SEED) # 0.43 * 0.35 = 0.15

# Group definitions in the format expected by the AIF360 metric classes:
# value 1 marks the privileged class, value 0 the unprivileged one.
privileged_groups = [{privilege_mode: 1}]
unprivileged_groups = [{privilege_mode: 0}]

# Column position of the protected attribute in the feature matrix.
index = train_dataset.feature_names.index(privilege_mode)

# Metric for the original dataset #################################################################
print("\n\nStats of Initial Dataset: ")
metric_orig_train = BinaryLabelDatasetMetric(train_dataset,
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
print("Train set: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())
print("Train set: Initial disparate impact in source dataset = %f" % metric_orig_train.disparate_impact())
metric_orig_test = BinaryLabelDatasetMetric(test_dataset,
                                            unprivileged_groups=unprivileged_groups,
                                            privileged_groups=privileged_groups)
print("Test set: Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_test.mean_difference())
print("Test set: Initial disparate impact in source dataset = %f" % metric_orig_test.disparate_impact())

In [None]:
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
from aif360.algorithms.preprocessing import OptimPreproc

# Settings forwarded to the OptimPreproc optimizer: the project's distortion
# function plus the epsilon / clist / dlist values.
# NOTE(review): epsilon presumably bounds the allowed outcome dependence on the
# protected attribute, and clist/dlist are paired distortion thresholds and
# their probability bounds -- confirm against the AIF360 OptimPreproc docs.
optim_options = {
    "distortion_fun": get_distortion_acs_income,
    "epsilon": 0.05,
    "clist": [0.99, 1.99, 2.99],
    "dlist": [.1, 0.05, 0]
}

# OptimPreproc.transform applies a randomized mapping; passing a seed makes
# the repaired datasets (and the metric printed below) repeatable across runs.
TRANSFORM_SEED = 42
debias_model = OptimPreproc(OptTools, optim_options, seed=TRANSFORM_SEED)
debias_model = debias_model.fit(train_dataset)

# The mapping is learned on the training split only and then applied to every
# split; transform_Y=True lets it rewrite the labels as well as the features.
train_repd = debias_model.transform(train_dataset, transform_Y=True)
val_repd = debias_model.transform(val_dataset, transform_Y=True)
test_repd = debias_model.transform(test_dataset, transform_Y=True)

metrics_train_dataset = BinaryLabelDatasetMetric(train_repd,
                                                 unprivileged_groups=unprivileged_groups,
                                                 privileged_groups=privileged_groups)
print("\nTRANSFORMED Train set: Difference in mean outcomes "
      "between unprivileged and privileged groups = %f" % metrics_train_dataset.mean_difference())