In [1]:
from bias_framework import Bias_Framework
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


If not done already, download data from https://archive.ics.uci.edu/dataset/2/adult 

In [2]:
# Data appears not to include column names, but these can be found in adult.names. fnlwgt isn't a very good name, but it isn't mine
df_census_income_train = pd.read_csv("./adult/adult.data", names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"])

df_census_income_validation = pd.read_csv("./adult/adult.test", skiprows=1, names=["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"])



In [3]:
df_census_income_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df_census_income_validation.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [5]:
df_census_income_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:


df_census_income_train.describe()



Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
for column in df_census_income_train.columns:
    print(column + ",", df_census_income_train[column].nunique(), "unique values,")

age, 73 unique values,
workclass, 9 unique values,
fnlwgt, 21648 unique values,
education, 16 unique values,
education-num, 16 unique values,
marital-status, 7 unique values,
occupation, 15 unique values,
relationship, 6 unique values,
race, 5 unique values,
sex, 2 unique values,
capital-gain, 119 unique values,
capital-loss, 92 unique values,
hours-per-week, 94 unique values,
native-country, 42 unique values,
income, 2 unique values,


We will partly adapt https://www.kaggle.com/code/yashhvyass/adult-census-income-logistic-reg-explained-86-2 in order to save dev time. This will be useful to return to for further modelling, but right now I am just going to do the bare minimum to test the bias framework

In [8]:
df_census_income_train["sex"] = df_census_income_train["sex"].apply(lambda x: 1 if x.strip().lower() == "male" else 0)
# According to the graphs in the link provided (and verified below more simply) the dataset is overwhelming white, so group all other races together
# This is also useful since it makes it simpler to compare fairness
df_census_income_train["race"] = df_census_income_train["race"].apply(lambda x: 1 if x.strip().lower() == "white" else 0)
# Similar to race
df_census_income_train["native-country"] = df_census_income_train["native-country"].apply(lambda x: 1 if x.strip().lower() == "united-states" else 0)
df_census_income_train["income"] = df_census_income_train["income"].apply(lambda x: 1 if x.strip().lower() == ">50k" else 0)

# Everything we do to the training data we need to do to the validation data
df_census_income_validation["sex"] = df_census_income_validation["sex"].apply(lambda x: 1 if x.strip().lower() == "male" else 0)
df_census_income_validation["race"] = df_census_income_validation["race"].apply(lambda x: 1 if x.strip().lower() == "white" else 0)
df_census_income_validation["native-country"] = df_census_income_validation["native-country"].apply(lambda x: 1 if x.strip().lower() == "united-states" else 0)
# Turns out this is ever so slightly different from the training set
df_census_income_validation["income"] = df_census_income_validation["income"].apply(lambda x: 1 if x.strip().lower() == ">50k." else 0)


In [9]:
df_census_income_train["race"].value_counts()

race
1    27816
0     4745
Name: count, dtype: int64

In [10]:
df_census_income_train["native-country"].value_counts()

native-country
1    29170
0     3391
Name: count, dtype: int64

In [11]:
df_census_income_train["income"].value_counts()

income
0    24720
1     7841
Name: count, dtype: int64

In [12]:
print("Training data percent belonging to class 0:")
print(len(df_census_income_train[df_census_income_train["income"] == 0]) / len(df_census_income_train))
print("Validation data percent belonging to class 0:")
print(len(df_census_income_validation[df_census_income_validation["income"] == 0]) / len(df_census_income_validation))

Training data percent belonging to class 0:
0.7591904425539756
Validation data percent belonging to class 0:
0.7637737239727289


In [13]:
print("Training data size", len(df_census_income_train))
print("Validation data size", len(df_census_income_validation))

Training data size 32561
Validation data size 16281


In [14]:
# Getting rid of fnlwgt because the discription in the kaggle link does not make it sound useful
# Getting rid of education-num since it duplicates education, which we can 1 hot encode
df_census_income_train.drop(columns=["fnlwgt", "education-num"], inplace=True)
df_census_income_validation.drop(columns=["fnlwgt", "education-num"], inplace=True)


In [15]:
df_census_income_train = pd.get_dummies(df_census_income_train, columns=df_census_income_train.select_dtypes(include=['object']).columns.tolist())
df_census_income_validation = pd.get_dummies(df_census_income_validation, columns=df_census_income_validation.select_dtypes(include=['object']).columns.tolist())

In [16]:
df_census_income_train.head()

Unnamed: 0,age,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,workclass_ ?,workclass_ Federal-gov,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife
0,39,1,1,2174,0,40,1,0,False,False,...,False,False,False,False,False,True,False,False,False,False
1,50,1,1,0,0,13,1,0,False,False,...,False,False,False,False,True,False,False,False,False,False
2,38,1,1,0,0,40,1,0,False,False,...,False,False,False,False,False,True,False,False,False,False
3,53,0,1,0,0,40,1,0,False,False,...,False,False,False,False,True,False,False,False,False,False
4,28,0,0,0,0,40,0,0,False,False,...,False,False,False,False,False,False,False,False,False,True


In [17]:
df_census_income_train_x = df_census_income_train.drop(columns=["income"])
df_census_income_train_y = df_census_income_train["income"]

df_census_income_validation_x = df_census_income_validation.drop(columns=["income"])
df_census_income_validation_y = df_census_income_validation["income"]



In [18]:
df_census_income_train.head()

Unnamed: 0,age,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,workclass_ ?,workclass_ Federal-gov,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife
0,39,1,1,2174,0,40,1,0,False,False,...,False,False,False,False,False,True,False,False,False,False
1,50,1,1,0,0,13,1,0,False,False,...,False,False,False,False,True,False,False,False,False,False
2,38,1,1,0,0,40,1,0,False,False,...,False,False,False,False,False,True,False,False,False,False
3,53,0,1,0,0,40,1,0,False,False,...,False,False,False,False,True,False,False,False,False,False
4,28,0,0,0,0,40,0,0,False,False,...,False,False,False,False,False,False,False,False,False,True


In [19]:
random_forest_classifier = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=0)


random_forest_bias = Bias_Framework(random_forest_classifier, df_census_income_train_x, df_census_income_validation_x, df_census_income_train_y, df_census_income_validation_y)

In [20]:
random_forest_bias.set_privilege_function(lambda x: x["sex"] == 1 or x["race"] == 1)

In [21]:
random_forest_bias.run_framework()



9.776099920272827 seconds to run with no debiasing
23.40844202041626 seconds to get fairea baseline
81.2869770526886 seconds to run learning fair representation
10.74599289894104 seconds to run reweighting
127.24047613143921 seconds to run reject option classification
30.03303599357605 seconds to run calibrated equal odds
10.067971229553223 seconds to run equal odds


In [30]:
bias_graphs = random_forest_bias.get_DebiasingGraphsObject()
bias_graphs.show_single_graph("accuracy", "statistical parity difference", include_fairea_labels=True)

In [23]:
random_forest_bias.show_all_subplots()

In [24]:

random_forest_bias.show_single_graph("accuracy", "equal opportunity difference")


In [25]:
graph_1 = random_forest_bias.get_DebiasingGraphsObject()

In [26]:
graph_2 = random_forest_bias.get_DebiasingGraphsObject()

In [27]:
graph_3 = graph_1 + graph_2

In [28]:
graph_3.show_all_subplots()