In [2]:
import aif360
from aif360.datasets import AdultDataset
from aif360.metrics import ClassificationMetric
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

import numpy as np
import pandas as pd
START_BOLD = '\033[1m'
END_BOLD = '\033[0m'

In [3]:
column_names = ['age', 'workclass', 'fnlwgt', 'education',
    'education-num', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country', 'income-per-year']


dataset_path = Path(aif360.__file__).parent / 'data' / 'raw' / 'adult' / 'adult.data'
original_df = pd.read_csv(dataset_path, names=column_names)

In [4]:
original_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-per-year
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


# TASK 1

## Preprocess

In [7]:
# Using as base the aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions.load_preproc_data_adult

def load_preproc_data_adult(protected_attributes=None):
    min_privileged_age = 35
    max_privileged_age = 55
    def custom_preprocessing(df):

        def is_in_privileged_age(x):
            if x > min_privileged_age and x < max_privileged_age:
                return 1.0
            else:
                return 0.0

        def group_edu(x):
            if x <= 5:
                return '<6'
            elif x >= 13:
                return '>12'
            else:
                return x

        def group_race(x):
            if x == "White":
                return 1.0
            else:
                return 0.0

        # Limit education range
        df['education years'] = df['education-num'].apply(lambda x: group_edu(x))
        df['education years'] = df['education years'].astype('category')

        # Rename income variable
        df['income binary'] = df['income-per-year']
        df['income binary'] = df['income binary'].replace(to_replace='>50K.', value='>50K', regex=True)
        df['income binary'] = df['income binary'].replace(to_replace='<=50K.', value='<=50K', regex=True)

        # Recode sex and race
        df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})
        df['race'] = df['race'].apply(lambda x: group_race(x))
        df['age'] = df['age'].apply(lambda x: is_in_privileged_age(x))

        return df


    XD_features = ['age', 'education years', 'sex', 'race']
    D_features = ['age', 'race', 'sex'] if protected_attributes is None else protected_attributes
    Y_features = ['income binary']
    X_features = list(set(XD_features)-set(D_features))
    print(X_features + Y_features + D_features)
    categorical_features = ['education years']

    # privileged classes
    all_privileged_classes = {"age": [1.0],
                              "race": [1.0],
                              "sex": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {"age": {1.0: f'Between {min_privileged_age} and {max_privileged_age}', 0.0: f'Not in between {min_privileged_age} and {max_privileged_age}'}, 
                                    "race": {1.0: 'White', 0.0: 'Non-white'}, 
                                    "sex": {1.0: 'Male', 0.0: 'Female'}}

    return AdultDataset(
        label_name=Y_features[0],
        favorable_classes=['>50K', '>50K.'],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features+Y_features+D_features,
        na_values=['?'],
        metadata={'label_maps': [{1.0: '>50K', 0.0: '<=50K'}],
                  'protected_attribute_maps': [all_protected_attribute_maps[x]
                                for x in D_features]},
        custom_preprocessing=custom_preprocessing)

In [8]:
dataset = load_preproc_data_adult([])

['education years', 'sex', 'race', 'age', 'income binary']


  df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})


AttributeError: 'NoneType' object has no attribute 'columns'

In [39]:
print(f"Protected attributes: {dataset.protected_attribute_names}")

Protected attributes: []


In [49]:
df = dataset.convert_to_dataframe()[0]

In [48]:
train_test_valid_split = 0.7
test_valid_split = 0.5

train_ds, valid_test_ds = df.split([train_test_valid_split], shuffle=True)
validation_ds, test_ds = valid_test_ds.split([test_valid_split], shuffle=True)

AttributeError: 'DataFrame' object has no attribute 'split'

In [25]:
scale = StandardScaler()
X_train = scale.fit_transform(train_ds.features)
y_train = train_ds.labels.ravel()
w_train = train_ds.instance_weights

dtmod = DecisionTreeClassifier()
dtmod.fit(X_train, y_train)
pos_ind = np.where(dtmod.classes_ == train_ds.favorable_label)[0][0]

In [26]:
validation_ds_pred = validation_ds.copy(deepcopy=True)
X_valid = scale.transform(validation_ds_pred.features)
validation_ds_pred.scores = dtmod.predict_proba(X_valid)[:,pos_ind].reshape(-1,1)

test_ds_pred = test_ds.copy(deepcopy=True)
X_test = scale.transform(test_ds_pred.features)
test_ds_pred.scores = dtmod.predict_proba(X_test)[:,pos_ind].reshape(-1,1)

In [29]:
num_thresh = 99 
ba_arr = np.zeros(num_thresh)
class_thresh_arr = np.linspace(0.01, 0.99, num_thresh)
for idx, class_thresh in enumerate(class_thresh_arr):
    
    fav_inds = validation_ds_pred.scores > class_thresh
    validation_ds_pred.labels[fav_inds] = validation_ds_pred.favorable_label
    validation_ds_pred.labels[~fav_inds] = validation_ds_pred.unfavorable_label
    
    classified_metric_orig_valid = ClassificationMetric(validation_ds,
                                             validation_ds_pred)
    
    ba_arr[idx] = (classified_metric_orig_valid.true_positive_rate()
                   +classified_metric_orig_valid.true_negative_rate()) /2

best_ind = np.where(ba_arr == np.max(ba_arr))[0][0]
best_class_thresh = class_thresh_arr[best_ind]

In [30]:
fav_inds = test_ds_pred.scores > best_class_thresh
test_ds_pred.labels[fav_inds] = test_ds_pred.favorable_label
test_ds_pred.labels[~fav_inds] = test_ds_pred.unfavorable_label

metric_test = ClassificationMetric(test_ds, test_ds_pred)

balanced_accuracy = (metric_test.true_negative_rate() + metric_test.true_positive_rate()) / 2
print(f"Balanced accuracy for {START_BOLD}classifier{END_BOLD}: {round(balanced_accuracy, 4)}")

Balanced accuracy for [1mclassifier[0m: 0.7448


# TASK 2

In [46]:
protected_attributes = ['sex']
protected_dataset = load_preproc_data_adult(protected_attributes)

  df['sex'] = df['sex'].replace({'Female': 0.0, 'Male': 1.0})


In [40]:
print(f"Protected attributes: {protected_dataset.protected_attribute_names}")

Protected attributes: ['race', 'sex']


In [41]:
protected_dataset.convert_to_dataframe()[0]

Unnamed: 0,race,sex,Age (decade)=10,Age (decade)=20,Age (decade)=30,Age (decade)=40,Age (decade)=50,Age (decade)=60,Age (decade)=>=70,Education Years=6,Education Years=7,Education Years=8,Education Years=9,Education Years=10,Education Years=11,Education Years=12,Education Years=<6,Education Years=>12,Income Binary
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
48838,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
48839,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
48840,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
