## Selection bias and missing values with COMPAS data
This notebook demonstrates the effect of selection bias and missing values on fairness using COMPAS data. <br>
We first import the packages used throughout the notebook.

In [1]:
import sys
sys.path.append("models")
sys.path.append("AIF360/")
import numpy as np
from compas_model import get_distortion_compas, CompasDataset, reweight_df, get_evaluation, CompasDataset_test, CompasDataset_train, load_preproc_data_compas_test_comb
from aif360.algorithms.preprocessing.optim_preproc import OptimPreproc
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import pandas as pd

The function below creates a dataset with both missing values and selection bias. This processing is a combination of demo_missing_compas.ipynb and demo_sel_compas.ipynb

In [2]:
def load_preproc_data_compas_train(protected_attributes=None, seed=None):
    """Build the COMPAS training set with selection bias and MAR missing values.

    The nested ``custom_preprocessing`` function is passed to
    ``CompasDataset_train``. It filters and recodes the raw frame, keeps a
    fixed number of rows per (outcome, race) cell to create selection bias and
    then replaces part of the ``priors_count`` values by ``'missing'``.

    Args:
        protected_attributes: list of protected attribute names; ``['race']``
            is used when ``None``.
        seed: optional integer seed for the missing-value draw. With the
            default ``None`` the global NumPy random state is used, so the
            injected missing values differ from run to run.

    Returns:
        The object returned by ``CompasDataset_train``.
    """
    def custom_preprocessing(df):
        """The custom pre-processing function is adapted from
            https://github.com/fair-preprocessing/nips2017/blob/master/compas/code/Generate_Compas_Data.ipynb
        """

        df = df[['age',
                 'c_charge_degree',
                 'race',
                 'age_cat',
                 'score_text',
                 'sex',
                 'priors_count',
                 'days_b_screening_arrest',
                 'decile_score',
                 'is_recid',
                 'two_year_recid',
                 'length_of_stay']]

        # Indices of data samples to keep
        ix = df['days_b_screening_arrest'] <= 30
        ix = (df['days_b_screening_arrest'] >= -30) & ix
        ix = (df['is_recid'] != -1) & ix
        ix = (df['c_charge_degree'] != "O") & ix
        ix = (df['score_text'] != 'N/A') & ix
        df = df.loc[ix, :]

        # Restrict races to African-American and Caucasian
        dfcut = df.loc[~df['race'].isin(
            ['Native American', 'Hispanic', 'Asian', 'Other']), :]

        # Restrict the features to use
        dfcutQ = dfcut[['sex',
                        'race',
                        'age_cat',
                        'c_charge_degree',
                        'score_text',
                        'priors_count',
                        'is_recid',
                        'two_year_recid',
                        'length_of_stay']].copy()

        # Map the coded priors count (0, 1, 2) to its text category;
        # every other value is labelled 'missing'
        def quantizePrior(x):
            if x == 0:
                return '0'
            elif x == 1:
                return '1 to 3'
            elif x == 2:
                return 'More than 3'
            else:
                return 'missing'

        # Map the coded length of stay to its text category
        def quantizeLOS(x):
            if x == 0:
                return '<week'
            if x == 1:
                return '<3months'
            else:
                return '>3 months'

        # Map the coded age category to its text category
        # (codes other than 0, 1, 2 yield None)
        def adjustAge(x):
            if x == 1:
                return '25 to 45'
            elif x == 2:
                return 'Greater than 45'
            elif x == 0:
                return 'Less than 25'

        # Quantize score_text to MediumHigh / Low
        def quantizeScore(x):
            if x == 1:
                return 'MediumHigh'
            else:
                return 'Low'

        # Caucasian -> 1.0 (privileged), everything else -> 0.0
        def group_race(x):
            if x == "Caucasian":
                return 1.0
            else:
                return 0.0

        dfcutQ['priors_count'] = dfcutQ['priors_count'].apply(quantizePrior)
        dfcutQ['length_of_stay'] = dfcutQ['length_of_stay'].apply(quantizeLOS)
        dfcutQ['score_text'] = dfcutQ['score_text'].apply(quantizeScore)
        dfcutQ['age_cat'] = dfcutQ['age_cat'].apply(adjustAge)
        # Recode sex and race
        dfcutQ['sex'] = dfcutQ['sex'].replace({'Female': 1.0, 'Male': 0.0})
        dfcutQ['race'] = dfcutQ['race'].apply(group_race)

        features = ['two_year_recid', 'race',
                    'age_cat', 'priors_count', 'c_charge_degree', 'score_text']
        # The code below creates a dataset with selection bias, same approach
        # as demo_sel_compas.ipynb
        df = dfcutQ[features]

        # Negative outcome (two_year_recid == 1), split by race group
        df_neg = df.loc[df['two_year_recid'] == 1, :]
        df_neg_priv = df_neg.loc[df_neg['race'] == 1, :]
        df_neg_unpriv = df_neg.loc[df_neg['race'] == 0, :]
        # Keep a fixed number of randomly chosen rows from each group to
        # create selection bias among the negative outcomes
        _, df_neg_priv_test = train_test_split(
            df_neg_priv, test_size=500, random_state=10)
        _, df_neg_unpriv_test = train_test_split(
            df_neg_unpriv, test_size=950, random_state=10)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        df_neg_test = pd.concat([df_neg_priv_test, df_neg_unpriv_test])
        print('negative outcome, unpriv before resampling')
        print(len(df_neg_unpriv_test.index))

        print('negative outcome, priv before resampling')
        print(len(df_neg_priv_test.index))

        # Positive outcome (two_year_recid == 0), split by race group
        df_pos = df.loc[df['two_year_recid'] == 0, :]
        df_pos_priv = df_pos.loc[df_pos['race'] == 1, :]
        df_pos_unpriv = df_pos.loc[df_pos['race'] == 0, :]
        # Keep a fixed number of randomly chosen rows from each group to
        # create selection bias among the positive outcomes
        _, df_pos_priv_test = train_test_split(
            df_pos_priv, test_size=650, random_state=10)
        _, df_pos_unpriv_test = train_test_split(
            df_pos_unpriv, test_size=900, random_state=10)
        df_pos_test = pd.concat([df_pos_priv_test, df_pos_unpriv_test])
        print('positive outcome, unpriv before resampling')
        print(len(df_pos_unpriv_test.index))

        print('positive outcome, priv before resampling')
        print(len(df_pos_priv_test.index))
        df = pd.concat([df_neg_test, df_pos_test])

        # The code below creates MAR missing values, following
        # demo_missing_compas.ipynb. 'race' was recoded to 1.0/0.0 above, so
        # the African-American group is race == 0.0; comparing against the
        # string 'African-American' would never match and the 0.05 branch
        # would be unreachable.
        unprivileged = df['race'] == 0.0
        no_recid = df['two_year_recid'] == 0
        df['mis_prob'] = np.where(
            unprivileged, 0.05, np.where(no_recid, 0.3, 0.1))

        # One Bernoulli draw per row decides whether priors_count is hidden
        rng = np.random if seed is None else np.random.RandomState(seed)
        is_missing = rng.binomial(1, df['mis_prob'].values) == 1
        df.loc[is_missing, 'priors_count'] = 'missing'
        print('Total number of missing values')
        print(len(df.loc[df['priors_count'] == 'missing', :].index))
        print('Total number of observations')
        print(len(df.index))
        return df

    XD_features = [
        'age_cat',
        'c_charge_degree',
        'priors_count',
        'race',
        'score_text']
    D_features = [
        'race'] if protected_attributes is None else protected_attributes
    Y_features = ['two_year_recid']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = [
        'age_cat',
        'priors_count',
        'c_charge_degree',
        'score_text']

    # privileged classes
    all_privileged_classes = {"sex": [1.0],
                              "race": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {
        "sex": {
            0.0: 'Male', 1.0: 'Female'}, "race": {
            1.0: 'Caucasian', 0.0: 'Not Caucasian'}}

    return CompasDataset_train(
        label_name=Y_features[0],
        favorable_classes=[0],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=[],
        metadata={'label_maps': [{1.0: 'Did recid.', 0.0: 'No recid.'}],
                  'protected_attribute_maps': [all_protected_attribute_maps[x]
                                               for x in D_features]},
        custom_preprocessing=custom_preprocessing)


The code below is to load the data and run the fairness fixing algorithm proposed by Calmon et al. \[1\]. 

In [None]:
privileged_groups = [{'race': 1}]
unprivileged_groups = [{'race': 0}]
# Held-out test set; it must not be overwritten so that every experiment in
# this notebook is evaluated on the same data.
dataset_orig_vt = load_preproc_data_compas_test_comb(['race'])
# Biased training set (selection bias + missing values), no resampling.
dataset_orig = load_preproc_data_compas_train(['race'])
# Train on the full biased set. Previously an unseeded 70/30 split of
# dataset_orig replaced the test set loaded above, so the baseline was scored
# on different (and non-reproducible) data than the resampling experiments.
dataset_orig_train = dataset_orig

optim_options = {
    "distortion_fun": get_distortion_compas,
    "epsilon": 0.05,
    "clist": [0.99, 1.99, 2.99],
    "dlist": [.1, 0.05, 0]
}

OP = OptimPreproc(OptTools, optim_options,
                  unprivileged_groups=unprivileged_groups,
                  privileged_groups=privileged_groups)

# Fit the pre-processing transformation on the training data only
OP = OP.fit(dataset_orig_train)

# Apply the fitted transformation to the test data
dataset_transf_cat_test = OP.transform(dataset_orig_vt, transform_Y=True)
dataset_transf_cat_test = dataset_orig_vt.align_datasets(
    dataset_transf_cat_test)

# Apply the fitted transformation to the training data
dataset_transf_cat_train = OP.transform(
    dataset_orig_train, transform_Y=True)
dataset_transf_cat_train = dataset_orig_train.align_datasets(
    dataset_transf_cat_train)

We then use the processed data to train a logistic regression classifier and validate the classifier on the test set.

In [None]:
# Standardise the transformed training features and fit the scaler on them
scale_transf = StandardScaler()
X_train = scale_transf.fit_transform(dataset_transf_cat_train.features)
y_train = dataset_transf_cat_train.labels.ravel()

# Scale the test features with the statistics learned on the training set;
# re-fitting the scaler on the test set would put train and test features on
# different scales.
X_test = scale_transf.transform(dataset_transf_cat_test.features)

lmod = LogisticRegression()
lmod.fit(X_train, y_train)
y_pred = lmod.predict(X_test)
print('Without resampling')
get_evaluation(dataset_orig_vt,y_pred,privileged_groups,unprivileged_groups,0,1,1)

In this part, we do uniform resampling. This code is very similar to the code presented in demo_sel_compas.ipynb

In [None]:
def load_preproc_data_compas_train(protected_attributes=None, seed=None):
    """Build the biased COMPAS training set and apply uniform resampling.

    The nested ``custom_preprocessing`` function is passed to
    ``CompasDataset_train``. It filters and recodes the raw frame, keeps a
    fixed number of rows per (outcome, race) cell to create selection bias,
    replaces part of the ``priors_count`` values by ``'missing'`` and finally
    resamples every (outcome, race) cell to the size expected if outcome and
    race were statistically independent.

    Note: this definition replaces any earlier function of the same name in
    the running kernel.

    Args:
        protected_attributes: list of protected attribute names; ``['race']``
            is used when ``None``.
        seed: optional integer seed for the missing-value draw. With the
            default ``None`` the global NumPy random state is used, so the
            injected missing values differ from run to run.

    Returns:
        The object returned by ``CompasDataset_train``.
    """
    def custom_preprocessing(df):
        """The custom pre-processing function is adapted from
            https://github.com/fair-preprocessing/nips2017/blob/master/compas/code/Generate_Compas_Data.ipynb
        """

        df = df[['age',
                 'c_charge_degree',
                 'race',
                 'age_cat',
                 'score_text',
                 'sex',
                 'priors_count',
                 'days_b_screening_arrest',
                 'decile_score',
                 'is_recid',
                 'two_year_recid',
                 'length_of_stay']]

        # Indices of data samples to keep
        ix = df['days_b_screening_arrest'] <= 30
        ix = (df['days_b_screening_arrest'] >= -30) & ix
        ix = (df['is_recid'] != -1) & ix
        ix = (df['c_charge_degree'] != "O") & ix
        ix = (df['score_text'] != 'N/A') & ix
        df = df.loc[ix, :]

        # Restrict races to African-American and Caucasian
        dfcut = df.loc[~df['race'].isin(
            ['Native American', 'Hispanic', 'Asian', 'Other']), :]

        # Restrict the features to use
        dfcutQ = dfcut[['sex',
                        'race',
                        'age_cat',
                        'c_charge_degree',
                        'score_text',
                        'priors_count',
                        'is_recid',
                        'two_year_recid',
                        'length_of_stay']].copy()

        # Map the coded priors count (0, 1, 2) to its text category;
        # every other value is labelled 'missing'
        def quantizePrior(x):
            if x == 0:
                return '0'
            elif x == 1:
                return '1 to 3'
            elif x == 2:
                return 'More than 3'
            else:
                return 'missing'

        # Map the coded length of stay to its text category
        def quantizeLOS(x):
            if x == 0:
                return '<week'
            if x == 1:
                return '<3months'
            else:
                return '>3 months'

        # Map the coded age category to its text category
        # (codes other than 0, 1, 2 yield None)
        def adjustAge(x):
            if x == 1:
                return '25 to 45'
            elif x == 2:
                return 'Greater than 45'
            elif x == 0:
                return 'Less than 25'

        # Quantize score_text to MediumHigh / Low
        def quantizeScore(x):
            if x == 1:
                return 'MediumHigh'
            else:
                return 'Low'

        # Caucasian -> 1.0 (privileged), everything else -> 0.0
        def group_race(x):
            if x == "Caucasian":
                return 1.0
            else:
                return 0.0

        dfcutQ['priors_count'] = dfcutQ['priors_count'].apply(quantizePrior)
        dfcutQ['length_of_stay'] = dfcutQ['length_of_stay'].apply(quantizeLOS)
        dfcutQ['score_text'] = dfcutQ['score_text'].apply(quantizeScore)
        dfcutQ['age_cat'] = dfcutQ['age_cat'].apply(adjustAge)
        # Recode sex and race
        dfcutQ['sex'] = dfcutQ['sex'].replace({'Female': 1.0, 'Male': 0.0})
        dfcutQ['race'] = dfcutQ['race'].apply(group_race)

        features = ['two_year_recid', 'race',
                    'age_cat', 'priors_count', 'c_charge_degree', 'score_text']

        df = dfcutQ[features]

        # Negative outcome (two_year_recid == 1), split by race group
        df_neg = df.loc[df['two_year_recid'] == 1, :]
        df_neg_priv = df_neg.loc[df_neg['race'] == 1, :]
        df_neg_unpriv = df_neg.loc[df_neg['race'] == 0, :]
        # Keep a fixed number of randomly chosen rows from each group to
        # create selection bias among the negative outcomes
        _, df_neg_priv_test = train_test_split(
            df_neg_priv, test_size=500, random_state=10)
        _, df_neg_unpriv_test = train_test_split(
            df_neg_unpriv, test_size=950, random_state=10)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        df_neg_test = pd.concat([df_neg_priv_test, df_neg_unpriv_test])
        print('negative outcome, unpriv before resampling')
        print(len(df_neg_unpriv_test.index))

        print('negative outcome, priv before resampling')
        print(len(df_neg_priv_test.index))

        # Positive outcome (two_year_recid == 0), split by race group
        df_pos = df.loc[df['two_year_recid'] == 0, :]
        df_pos_priv = df_pos.loc[df_pos['race'] == 1, :]
        df_pos_unpriv = df_pos.loc[df_pos['race'] == 0, :]
        # Keep a fixed number of randomly chosen rows from each group to
        # create selection bias among the positive outcomes
        _, df_pos_priv_test = train_test_split(
            df_pos_priv, test_size=650, random_state=10)
        _, df_pos_unpriv_test = train_test_split(
            df_pos_unpriv, test_size=900, random_state=10)
        df_pos_test = pd.concat([df_pos_priv_test, df_pos_unpriv_test])
        print('positive outcome, unpriv before resampling')
        print(len(df_pos_unpriv_test.index))

        print('positive outcome, priv before resampling')
        print(len(df_pos_priv_test.index))
        df = pd.concat([df_neg_test, df_pos_test])

        # MAR missing values. 'race' was recoded to 1.0/0.0 above, so the
        # African-American group is race == 0.0; comparing against the string
        # 'African-American' would never match and the 0.05 branch would be
        # unreachable. The middle probability is 0.1 (it was written as "0."
        # here, which disabled missing values for that group).
        unprivileged = df['race'] == 0.0
        no_recid = df['two_year_recid'] == 0
        df['mis_prob'] = np.where(
            unprivileged, 0.05, np.where(no_recid, 0.3, 0.1))

        # One Bernoulli draw per row decides whether priors_count is hidden
        rng = np.random if seed is None else np.random.RandomState(seed)
        is_missing = rng.binomial(1, df['mis_prob'].values) == 1
        df.loc[is_missing, 'priors_count'] = 'missing'
        print('Total number of missing values')
        print(len(df.loc[df['priors_count'] == 'missing', :].index))
        print('Total number of observations')
        print(len(df.index))

        # Uniform resampling: bring every (outcome, race) cell to the size
        # expected under statistical independence of outcome and race.
        N = len(df)
        resampled_parts = []
        for i in df['two_year_recid'].unique():
            for j in df['race'].unique():
                orig_df = df.loc[(df['two_year_recid'] == i)
                                 & (df['race'] == j), :]
                # real_count is the number of observations in the cell
                real_count = len(orig_df.index)
                # exp_count is the expected number of observations given
                # statistical independence
                exp_count = int((len(df.loc[(df['two_year_recid'] == i), :].index) / len(
                    df.index)) * (len(df.loc[(df['race'] == j), :].index) / len(df.index)) * N)
                if real_count > exp_count:
                    # too many observations: randomly keep exp_count of them
                    _, df_toapp = train_test_split(
                        orig_df, test_size=exp_count, random_state=10)
                elif real_count == exp_count:
                    # already the expected size; train_test_split rejects a
                    # test_size equal to the number of samples
                    df_toapp = orig_df
                else:
                    # too few observations: bootstrap the shortfall from the
                    # cell and add it to the original rows
                    df_extra = resample(
                        orig_df,
                        replace=True,
                        n_samples=exp_count - real_count,
                        random_state=10)
                    df_toapp = pd.concat([df_extra, orig_df])
                resampled_parts.append(df_toapp)
        # Concatenate once instead of growing a frame inside the loop
        df = pd.concat(resampled_parts) if resampled_parts else pd.DataFrame()

        return df

    XD_features = [
        'age_cat',
        'c_charge_degree',
        'priors_count',
        'race',
        'score_text']
    D_features = [
        'race'] if protected_attributes is None else protected_attributes
    Y_features = ['two_year_recid']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = [
        'age_cat',
        'priors_count',
        'c_charge_degree',
        'score_text']

    # privileged classes
    all_privileged_classes = {"sex": [1.0],
                              "race": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {
        "sex": {
            0.0: 'Male', 1.0: 'Female'}, "race": {
            1.0: 'Caucasian', 0.0: 'Not Caucasian'}}

    return CompasDataset_train(
        label_name=Y_features[0],
        favorable_classes=[0],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=[],
        metadata={'label_maps': [{1.0: 'Did recid.', 0.0: 'No recid.'}],
                  'protected_attribute_maps': [all_protected_attribute_maps[x]
                                               for x in D_features]},
        custom_preprocessing=custom_preprocessing)


As in the previous case, we run the preprocessing function, apply the fairness-fixing algorithm proposed by Calmon et al. \[1\], and use the processed data to train a new logistic regression classifier, which we validate on the same test set.

In [None]:
privileged_groups = [{'race': 1}]
unprivileged_groups = [{'race': 0}]
# Held-out test set and the uniformly resampled training set
dataset_orig_vt = load_preproc_data_compas_test_comb(['race'])
dataset_orig_train = load_preproc_data_compas_train(['race'])

optim_options = {
    "distortion_fun": get_distortion_compas,
    "epsilon": 0.05,
    "clist": [0.99, 1.99, 2.99],
    "dlist": [.1, 0.05, 0]
}

OP = OptimPreproc(OptTools, optim_options,
                  unprivileged_groups=unprivileged_groups,
                  privileged_groups=privileged_groups)

# Fit the pre-processing transformation on the training data only
OP = OP.fit(dataset_orig_train)

dataset_transf_cat_test = OP.transform(dataset_orig_vt, transform_Y=True)
dataset_transf_cat_test = dataset_orig_vt.align_datasets(
    dataset_transf_cat_test)

dataset_transf_cat_train = OP.transform(
    dataset_orig_train, transform_Y=True)
dataset_transf_cat_train = dataset_orig_train.align_datasets(
    dataset_transf_cat_train)

# Instance weights computed from the (untransformed) training set
dataset_orig_train.instance_weights = reweight_df(dataset_orig_train)

# Standardise the transformed training features and fit the scaler on them
scale_transf = StandardScaler()
X_train = scale_transf.fit_transform(dataset_transf_cat_train.features)
y_train = dataset_transf_cat_train.labels.ravel()
# Scale the test features with the statistics learned on the training set;
# re-fitting the scaler on the test set would put train and test features on
# different scales.
X_test = scale_transf.transform(dataset_transf_cat_test.features)

lmod = LogisticRegression()
lmod.fit(
    X_train,
    y_train,
    sample_weight=dataset_orig_train.instance_weights)
y_pred = lmod.predict(X_test)
print('With uniform resampling')
get_evaluation(dataset_orig_vt,y_pred,privileged_groups,unprivileged_groups,0,1,0)

In this part, we perform stratified resampling: the procedure is the same as uniform resampling, except that additional samples are drawn only from observations without missing values. The code is very similar to the previous part; only the resampling step differs.

In [None]:
def load_preproc_data_compas_train(protected_attributes=None, seed=None):
    """Build the biased COMPAS training set and apply stratified resampling.

    The nested ``custom_preprocessing`` function is passed to
    ``CompasDataset_train``. It filters and recodes the raw frame, keeps a
    fixed number of rows per (outcome, race) cell to create selection bias,
    replaces part of the ``priors_count`` values by ``'missing'`` and finally
    resamples every (outcome, race) cell to the size expected if outcome and
    race were statistically independent. When a cell has to grow, the extra
    rows are drawn only from rows whose ``priors_count`` is not ``'missing'``.

    Note: this definition replaces any earlier function of the same name in
    the running kernel.

    Args:
        protected_attributes: list of protected attribute names; ``['race']``
            is used when ``None``.
        seed: optional integer seed for the missing-value draw. With the
            default ``None`` the global NumPy random state is used, so the
            injected missing values differ from run to run.

    Returns:
        The object returned by ``CompasDataset_train``.
    """
    def custom_preprocessing(df):
        """The custom pre-processing function is adapted from
            https://github.com/fair-preprocessing/nips2017/blob/master/compas/code/Generate_Compas_Data.ipynb
        """

        df = df[['age',
                 'c_charge_degree',
                 'race',
                 'age_cat',
                 'score_text',
                 'sex',
                 'priors_count',
                 'days_b_screening_arrest',
                 'decile_score',
                 'is_recid',
                 'two_year_recid',
                 'length_of_stay']]

        # Indices of data samples to keep
        ix = df['days_b_screening_arrest'] <= 30
        ix = (df['days_b_screening_arrest'] >= -30) & ix
        ix = (df['is_recid'] != -1) & ix
        ix = (df['c_charge_degree'] != "O") & ix
        ix = (df['score_text'] != 'N/A') & ix
        df = df.loc[ix, :]

        # Restrict races to African-American and Caucasian
        dfcut = df.loc[~df['race'].isin(
            ['Native American', 'Hispanic', 'Asian', 'Other']), :]

        # Restrict the features to use
        dfcutQ = dfcut[['sex',
                        'race',
                        'age_cat',
                        'c_charge_degree',
                        'score_text',
                        'priors_count',
                        'is_recid',
                        'two_year_recid',
                        'length_of_stay']].copy()

        # Map the coded priors count (0, 1, 2) to its text category;
        # every other value is labelled 'missing'
        def quantizePrior(x):
            if x == 0:
                return '0'
            elif x == 1:
                return '1 to 3'
            elif x == 2:
                return 'More than 3'
            else:
                return 'missing'

        # Map the coded length of stay to its text category
        def quantizeLOS(x):
            if x == 0:
                return '<week'
            if x == 1:
                return '<3months'
            else:
                return '>3 months'

        # Map the coded age category to its text category
        # (codes other than 0, 1, 2 yield None)
        def adjustAge(x):
            if x == 1:
                return '25 to 45'
            elif x == 2:
                return 'Greater than 45'
            elif x == 0:
                return 'Less than 25'

        # Quantize score_text to MediumHigh / Low
        def quantizeScore(x):
            if x == 1:
                return 'MediumHigh'
            else:
                return 'Low'

        # Caucasian -> 1.0 (privileged), everything else -> 0.0
        def group_race(x):
            if x == "Caucasian":
                return 1.0
            else:
                return 0.0

        dfcutQ['priors_count'] = dfcutQ['priors_count'].apply(quantizePrior)
        dfcutQ['length_of_stay'] = dfcutQ['length_of_stay'].apply(quantizeLOS)
        dfcutQ['score_text'] = dfcutQ['score_text'].apply(quantizeScore)
        dfcutQ['age_cat'] = dfcutQ['age_cat'].apply(adjustAge)
        # Recode sex and race
        dfcutQ['sex'] = dfcutQ['sex'].replace({'Female': 1.0, 'Male': 0.0})
        dfcutQ['race'] = dfcutQ['race'].apply(group_race)

        features = ['two_year_recid', 'race',
                    'age_cat', 'priors_count', 'c_charge_degree', 'score_text']

        df = dfcutQ[features]

        # Negative outcome (two_year_recid == 1), split by race group
        df_neg = df.loc[df['two_year_recid'] == 1, :]
        df_neg_priv = df_neg.loc[df_neg['race'] == 1, :]
        df_neg_unpriv = df_neg.loc[df_neg['race'] == 0, :]
        # Keep a fixed number of randomly chosen rows from each group to
        # create selection bias among the negative outcomes
        _, df_neg_priv_test = train_test_split(
            df_neg_priv, test_size=500, random_state=10)
        _, df_neg_unpriv_test = train_test_split(
            df_neg_unpriv, test_size=950, random_state=10)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        df_neg_test = pd.concat([df_neg_priv_test, df_neg_unpriv_test])
        print('negative outcome, unpriv before resampling')
        print(len(df_neg_unpriv_test.index))

        print('negative outcome, priv before resampling')
        print(len(df_neg_priv_test.index))

        # Positive outcome (two_year_recid == 0), split by race group
        df_pos = df.loc[df['two_year_recid'] == 0, :]
        df_pos_priv = df_pos.loc[df_pos['race'] == 1, :]
        df_pos_unpriv = df_pos.loc[df_pos['race'] == 0, :]
        # Keep a fixed number of randomly chosen rows from each group to
        # create selection bias among the positive outcomes
        _, df_pos_priv_test = train_test_split(
            df_pos_priv, test_size=650, random_state=10)
        _, df_pos_unpriv_test = train_test_split(
            df_pos_unpriv, test_size=900, random_state=10)
        df_pos_test = pd.concat([df_pos_priv_test, df_pos_unpriv_test])
        print('positive outcome, unpriv before resampling')
        print(len(df_pos_unpriv_test.index))

        print('positive outcome, priv before resampling')
        print(len(df_pos_priv_test.index))
        df = pd.concat([df_neg_test, df_pos_test])

        # MAR missing values. 'race' was recoded to 1.0/0.0 above, so the
        # African-American group is race == 0.0; comparing against the string
        # 'African-American' would never match and the 0.05 branch would be
        # unreachable.
        unprivileged = df['race'] == 0.0
        no_recid = df['two_year_recid'] == 0
        df['mis_prob'] = np.where(
            unprivileged, 0.05, np.where(no_recid, 0.3, 0.1))

        # One Bernoulli draw per row decides whether priors_count is hidden
        rng = np.random if seed is None else np.random.RandomState(seed)
        is_missing = rng.binomial(1, df['mis_prob'].values) == 1
        df.loc[is_missing, 'priors_count'] = 'missing'
        print('Total number of missing values')
        print(len(df.loc[df['priors_count'] == 'missing', :].index))
        print('Total number of observations')
        print(len(df.index))

        # Stratified resampling: bring every (outcome, race) cell to the size
        # expected under statistical independence, drawing additional rows
        # only from observations without missing values.
        N = len(df)
        resampled_parts = []
        for i in df['two_year_recid'].unique():
            for j in df['race'].unique():
                orig_df = df.loc[(df['two_year_recid'] == i)
                                 & (df['race'] == j), :]
                # rows of this cell whose priors_count is observed
                orig_df_nomiss = df.loc[(df['two_year_recid'] == i) & (
                    df['race'] == j) & (df['priors_count'] != 'missing'), :]
                # real_count is the number of observations in the cell
                real_count = len(orig_df.index)
                # exp_count is the expected number of observations given
                # statistical independence
                exp_count = int((len(df.loc[(df['two_year_recid'] == i), :].index) / len(
                    df.index)) * (len(df.loc[(df['race'] == j), :].index) / len(df.index)) * N)
                if real_count > exp_count:
                    # too many observations: randomly keep exp_count of them
                    _, df_toapp = train_test_split(
                        orig_df, test_size=exp_count, random_state=10)
                elif real_count == exp_count:
                    # already the expected size; train_test_split rejects a
                    # test_size equal to the number of samples
                    df_toapp = orig_df
                else:
                    # too few observations: unlike uniform resampling, the
                    # shortfall is bootstrapped only from rows without
                    # missing values, then added to the original rows
                    df_extra = resample(
                        orig_df_nomiss,
                        replace=True,
                        n_samples=exp_count - real_count,
                        random_state=10)
                    df_toapp = pd.concat([df_extra, orig_df])
                resampled_parts.append(df_toapp)
        # Concatenate once instead of growing a frame inside the loop
        df = pd.concat(resampled_parts) if resampled_parts else pd.DataFrame()

        return df

    XD_features = [
        'age_cat',
        'c_charge_degree',
        'priors_count',
        'race',
        'score_text']
    D_features = [
        'race'] if protected_attributes is None else protected_attributes
    Y_features = ['two_year_recid']
    X_features = list(set(XD_features) - set(D_features))
    categorical_features = [
        'age_cat',
        'priors_count',
        'c_charge_degree',
        'score_text']

    # privileged classes
    all_privileged_classes = {"sex": [1.0],
                              "race": [1.0]}

    # protected attribute maps
    all_protected_attribute_maps = {
        "sex": {
            0.0: 'Male', 1.0: 'Female'}, "race": {
            1.0: 'Caucasian', 0.0: 'Not Caucasian'}}

    return CompasDataset_train(
        label_name=Y_features[0],
        favorable_classes=[0],
        protected_attribute_names=D_features,
        privileged_classes=[all_privileged_classes[x] for x in D_features],
        instance_weights_name=None,
        categorical_features=categorical_features,
        features_to_keep=X_features + Y_features + D_features,
        na_values=[],
        metadata={'label_maps': [{1.0: 'Did recid.', 0.0: 'No recid.'}],
                  'protected_attribute_maps': [all_protected_attribute_maps[x]
                                               for x in D_features]},
        custom_preprocessing=custom_preprocessing)


As in the previous case, we run the preprocessing function to obtain training data via stratified resampling, apply the fairness-fixing algorithm proposed by Calmon et al. \[1\], and use the processed data to train a new logistic regression classifier, which we validate on the same test set.

In [None]:
privileged_groups = [{'race': 1}]
unprivileged_groups = [{'race': 0}]
# Held-out test set and the training set built with stratified resampling
dataset_orig_vt = load_preproc_data_compas_test_comb(['race'])
dataset_orig_train = load_preproc_data_compas_train(['race'])
optim_options = {
    "distortion_fun": get_distortion_compas,
    "epsilon": 0.05,
    "clist": [0.99, 1.99, 2.99],
    "dlist": [.1, 0.05, 0]
}

OP = OptimPreproc(OptTools, optim_options,
                  unprivileged_groups=unprivileged_groups,
                  privileged_groups=privileged_groups)

# Fit the pre-processing transformation on the training data only
OP = OP.fit(dataset_orig_train)

dataset_transf_cat_test = OP.transform(dataset_orig_vt, transform_Y=True)
dataset_transf_cat_test = dataset_orig_vt.align_datasets(
    dataset_transf_cat_test)

dataset_transf_cat_train = OP.transform(
    dataset_orig_train, transform_Y=True)
dataset_transf_cat_train = dataset_orig_train.align_datasets(
    dataset_transf_cat_train)

# Instance weights computed from the (untransformed) training set
dataset_orig_train.instance_weights = reweight_df(dataset_orig_train)

# Standardise the transformed training features and fit the scaler on them
scale_transf = StandardScaler()
X_train = scale_transf.fit_transform(dataset_transf_cat_train.features)
y_train = dataset_transf_cat_train.labels.ravel()
# Scale the test features with the statistics learned on the training set;
# re-fitting the scaler on the test set would put train and test features on
# different scales.
X_test = scale_transf.transform(dataset_transf_cat_test.features)

lmod = LogisticRegression()
lmod.fit(
    X_train,
    y_train,
    sample_weight=dataset_orig_train.instance_weights)
y_pred = lmod.predict(X_test)
# This cell evaluates stratified resampling; the label previously repeated
# the uniform-resampling message.
print('With stratified resampling')
get_evaluation(dataset_orig_vt,y_pred,privileged_groups,unprivileged_groups,0,1,0)

# Reference
[1] Optimized Pre-Processing for Discrimination Prevention <br>
Flavio Calmon, Dennis Wei, Bhanukiran Vinzamuri, Karthikeyan Natesan Ramamurthy and Kush R. Varshney.
31st Advances in Neural Information Processing Systems (NIPS), Long Beach, CA, December 2017.