In [None]:
from bias_framework import Bias_Framework
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


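Download the Adult dataset from https://archive.ics.uci.edu/dataset/2/adult and unzip it into an `adult/` folder in this directory, so that `./adult/adult.data` and `./adult/adult.test` exist.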

In [None]:
# The data files do not include a header row; the column names below can be
# found in adult.names ("fnlwgt" is the dataset's own name for that field).
ADULT_COLUMN_NAMES = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income",
]

df_census_income_train = pd.read_csv("./adult/adult.data", names=ADULT_COLUMN_NAMES)

# skiprows=1 makes pandas skip the first line of adult.test before parsing
df_census_income_validation = pd.read_csv(
    "./adult/adult.test", skiprows=1, names=ADULT_COLUMN_NAMES
)



In [None]:
# Preview the first rows of the training frame to check the column names lined up
df_census_income_train.head()

In [None]:
# Same preview for the validation frame (loaded with skiprows=1)
df_census_income_validation.head()

In [None]:
# Structural summary of the training frame (DataFrame.info)
df_census_income_train.info()

In [None]:


# Summary statistics of the training frame (DataFrame.describe)
df_census_income_train.describe()



In [None]:
# Print how many distinct values each training column holds, one line per column
for column_name, distinct_count in df_census_income_train.nunique().items():
    print(column_name + ",", distinct_count, "unique values,")

We will partly adapt https://www.kaggle.com/code/yashhvyass/adult-census-income-logistic-reg-explained-86-2 to save development time. That notebook will be worth returning to for further modelling, but for now I am only doing the bare minimum needed to test the bias framework.

In [None]:
def _encode_binary(values, positive_label):
    """Turn a column of string labels into int64 1/0 flags.

    Labels are compared after trimming surrounding whitespace, lower-casing and
    dropping any trailing "." -- the validation file writes its income labels
    as ">50K." where the training file writes ">50K", so one label now serves
    both. Missing values become 0 instead of raising. A column that is already
    numeric is returned unchanged, so this cell can be re-run safely.

    Parameters
    ----------
    values : pd.Series
        The raw column to encode.
    positive_label : str
        Lower-case label (without trailing ".") that maps to 1; everything
        else maps to 0.

    Returns
    -------
    pd.Series
        int64 series of 1/0 flags with the same index as ``values``.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    cleaned = values.str.strip().str.lower().str.rstrip(".")
    return (cleaned == positive_label).astype("int64")


# Column -> label that is encoded as 1 (every other value becomes 0).
# race: according to the graphs in the linked Kaggle notebook (and verified more
#   simply in the cells below) the dataset is overwhelmingly white, so all other
#   races are grouped together; this also makes fairness simpler to compare.
# native-country: grouped for the same reason as race.
BINARY_POSITIVE_LABELS = {
    "sex": "male",
    "race": "white",
    "native-country": "united-states",
    "income": ">50k",
}

# Everything we do to the training data we need to do to the validation data
for frame in (df_census_income_train, df_census_income_validation):
    for column, positive_label in BINARY_POSITIVE_LABELS.items():
        frame[column] = _encode_binary(frame[column], positive_label)


In [None]:
# Check the split of the binarised race column (1 = white, 0 = every other group)
df_census_income_train["race"].value_counts()

In [None]:
# Check the split of the binarised native-country column (1 = United-States, 0 = elsewhere)
df_census_income_train["native-country"].value_counts()

In [None]:
# Check the split of the binarised target (1 = ">50K", 0 = everything else)
df_census_income_train["income"].value_counts()

In [None]:
# Share of rows in class 0 (income encoded as 0) for each split. The label
# says "percent", so the value is formatted as a percentage rather than
# printed as a raw 0-1 fraction.
for split_name, frame in (
    ("Training", df_census_income_train),
    ("Validation", df_census_income_validation),
):
    class_zero_share = (frame["income"] == 0).mean()
    print(f"{split_name} data percent belonging to class 0: {class_zero_share:.2%}")

In [None]:
# fnlwgt is dropped because the description in the Kaggle link does not make it
# sound useful; education-num is dropped because it duplicates education, which
# can be one-hot encoded instead.
df_census_income_train, df_census_income_validation = (
    frame.drop(columns=["fnlwgt", "education-num"])
    for frame in (df_census_income_train, df_census_income_validation)
)


In [None]:
# One-hot encode every remaining object-dtype (string) column in each split.
df_census_income_train = pd.get_dummies(
    df_census_income_train,
    columns=df_census_income_train.select_dtypes(include=["object"]).columns.tolist(),
)
df_census_income_validation = pd.get_dummies(
    df_census_income_validation,
    columns=df_census_income_validation.select_dtypes(include=["object"]).columns.tolist(),
)

# get_dummies only creates columns for categories present in the frame it is
# given, so the two splits can end up with different columns. Force the
# validation frame onto the training layout: categories it never saw become
# all-zero columns, categories unknown to the training data are dropped, and
# the column order matches exactly.
df_census_income_validation = df_census_income_validation.reindex(
    columns=df_census_income_train.columns, fill_value=0
)

In [None]:
# Preview the training frame after one-hot encoding
df_census_income_train.head()

In [None]:
# Bias_Framework assumes the target variable is the last column of the
# dataframe, which is not the case here. This cell moves "income" to the end.
df_census_income_train = df_census_income_train[
    [name for name in df_census_income_train.columns if name != "income"] + ["income"]
]
df_census_income_validation = df_census_income_validation[
    [name for name in df_census_income_validation.columns if name != "income"] + ["income"]
]

In [None]:
# Confirm that "income" is now the last column
df_census_income_train.head()

In [None]:
# Small, depth-capped random forest with a fixed random_state
random_forest_classifier = RandomForestClassifier(
    n_estimators=10,
    max_depth=20,
    random_state=0,
)

# Hand the untrained estimator plus the prepared training and validation
# frames (target in the last column) to the bias framework.
random_forest_bias = Bias_Framework(
    random_forest_classifier,
    df_census_income_train,
    df_census_income_validation,
)

In [None]:
# Define the privileged group: the predicate is true when sex == 1 or race == 1.
# Those columns were encoded earlier as 1 = male and 1 = white, so a record
# counts as privileged if it is male OR white.
# NOTE(review): Python's `or` only works on scalar comparisons, so this
# presumably relies on Bias_Framework calling the function one row at a time --
# confirm against the framework.
# NOTE(review): confirm that "male or white" (rather than "male and white") is
# the intended privileged group.
random_forest_bias.set_privilege_function(lambda x: x["sex"] == 1 or x["race"] == 1)

In [None]:
# Run the bias framework on the configured model and data
# (presumably this is the expensive step of the notebook -- TODO confirm)
random_forest_bias.run_framework()



In [None]:
# Keep the framework's raw results for ad-hoc inspection. Judging by the
# commented-out cells below, this is presumably a nested dict keyed by debiasing
# method, then "fairness"/"error", then metric name -- TODO confirm.
test = random_forest_bias.get_raw_data()

In [None]:
# test["reweighing"]["error"]["accuracy"]

In [None]:
# for key in test["no debiasing"]["fairness"].keys():
#     print(key, test["no debiasing"]["fairness"][key]["value"] - test["reweighing"]["fairness"][key]["value"])
# for key in test["no debiasing"]["error"].keys():
#     print(key, test["no debiasing"]["error"][key]["value"] - test["reweighing"]["error"][key]["value"])

In [None]:
# Plot one Fairea graph for the "accuracy" / "statistical parity difference" pair
# (presumably error metric first, fairness metric second -- confirm the argument
# order against Bias_Framework)
random_forest_bias.show_fairea_graph("accuracy", "statistical parity difference")

In [None]:
# Plot every Fairea graph the framework provides
random_forest_bias.show_all_fairea_graphs()