<a href="https://colab.research.google.com/github/FauxGrit/Titanic-Fairness/blob/main/Fairness_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tutorial: A simple approach to audit machine learning models for bias

*Fortunately for serious minds, a bias recognized is a bias sterilized.*

**Benjamin Haydon was clearly not a data scientist.** 

This is a tutorial to demonstrate how to audit machine learning models. It is an extension of the first tutorial in Kaggle aimed at building a model to predict survival on the Titanic. 

### Importing Necessary Libraries

In [2]:
!pip install aequitas==0.42.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import warnings, os, xlrd, pickle
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import openpyxl

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn import metrics as sklearn_metrics
from sklearn import linear_model, preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder
from sklearn import discriminant_analysis
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
# from catboost import CatBoostClassifier

from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
import aequitas.plot as ap


# from pycaret.classification import *   ## <--- uncomment to run!!!
from geopy.distance import geodesic
from math import cos, sqrt

import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')
import plotly.graph_objs as go
import plotly
from plotly import tools

# Enable plotly rendering inside the notebook and widen the pandas column display limit.
init_notebook_mode(connected=True)
pd.set_option('display.max_columns', 100)

# Suppress warnings for cleaner presentation.
# Turn this off during development!!!
%matplotlib inline
warnings.filterwarnings('ignore')

# Gather Data
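Titanic passenger data from Kaggle's "Titanic - Machine Learning from Disaster" competition (https://www.kaggle.com/c/titanic). The training file (`train.csv`) is read from this project's GitHub repository.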

**Preprocessing prior to data load:**
Significant data cleaning done using OpenRefine.


### Load Data into Notebook
* Load the CSV file from the project's GitHub repository via its raw URL (or point `data_url` at a local copy of the file).
* Print first five rows to confirm data load.
* Print shape to see how many rows and columns.

#### Merged and Cleaned Data

In [4]:
# Read the training CSV directly from the project's GitHub repository.
data_url = 'https://raw.githubusercontent.com/FauxGrit/Titanic-Fairness/main/train.csv'
df0 = pd.read_csv(filepath_or_buffer=data_url)

# Preview the first five rows to confirm the load worked.
df0.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Print shape of dataframe to determine number of rows and columns.
# The label names the dataset that is actually loaded (the Titanic training file).
n_rows, n_cols = df0.shape
print("Titanic Training Data\nNumber of rows: ", n_rows, "\nNumber of columns: ", n_cols)

Cleaned Airbnb Data
Number of rows:  891 
Number of columns:  12


# Basic Data Exploration

### Numerical Data Basic Stats

#### Titanic Training Data

In [6]:
# Flag passengers younger than 19 as children (1); everyone else gets 0.
# A missing Age compares as False, so those passengers are flagged 0 as well.
df0['Children'] = (df0['Age'] < 19).astype('int64')

df0.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Children
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [7]:
# Fill NaN values: numeric columns get their own column mean,
# the port of embarkation gets an explicit 'Empty' category.
for numeric_col in ('Age', 'Fare'):
    df0[numeric_col] = df0[numeric_col].fillna(df0[numeric_col].mean())
df0['Embarked'] = df0['Embarked'].fillna('Empty')

df0.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Children
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [8]:
# One-hot encode the categorical columns 'Sex' and 'Embarked'.
# dtype=int pins the indicator columns to integer 0/1. Without it, pandas >= 2.0
# creates boolean columns, which show as True/False and are left out of the
# default numeric summary produced by df1.describe().
df1 = pd.get_dummies(data=df0, columns=['Sex', 'Embarked'], dtype=int)
df1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Children,Sex_female,Sex_male,Embarked_C,Embarked_Empty,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,1,0,1,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,1,0,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,0,1,0,0,0,1


In [9]:
# Drop 'PassengerId', 'Name', 'Ticket', 'Cabin' since not useful as features,
# and 'Sex_female', which is the exact complement of the retained 'Sex_male'.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex_female']
df1 = df1.drop(columns=unused_columns)
df1.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Children,Sex_male,Embarked_C,Embarked_Empty,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,0,1,0,0,0
2,1,3,26.0,0,0,7.925,0,0,0,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,0,1


In [10]:
# List the column names that remain in df1 (the target plus the model features).
df1.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Children',
       'Sex_male', 'Embarked_C', 'Embarked_Empty', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [11]:
# Summary statistics, transposed so that each column of df1 becomes one row.
summary_stats = df1.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,891.0,29.699118,13.002015,0.42,22.0,29.699118,35.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
Children,891.0,0.156004,0.363063,0.0,0.0,0.0,0.0,1.0
Sex_male,891.0,0.647587,0.47799,0.0,0.0,1.0,1.0,1.0
Embarked_C,891.0,0.188552,0.391372,0.0,0.0,0.0,0.0,1.0
Embarked_Empty,891.0,0.002245,0.047351,0.0,0.0,0.0,0.0,1.0


In [12]:
# Separate input & response variables.
# Both are selected by column name, so the target is still picked correctly
# even if the column order of df1 changes.
X = df1.drop(columns='Survived').values
y = df1['Survived'].values

In [13]:
# Split into training / test: 25% of the rows are held out for testing,
# and the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [14]:
# Standardize the features: the scaler is fitted on the training split only,
# then that same fitted scaler is applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [15]:
# --- Fairness audit configuration ---

# Threshold handed to the Aequitas plots as `fairness_threshold`.
disparity_tolerance = 1.2

# False Positive Rate --> "It's gonna sink, but we predict you will survive"
metrics = ['fpr']

# Audited attribute mapped to the reference group the other groups are compared against.
attributes_and_reference_groups = {'Age_Level': 'Adults'}
attributes_to_audit = [attribute for attribute in attributes_and_reference_groups]

In [16]:
# Rebuild a labelled DataFrame from the standardized test features so the
# predictions can be grouped by an audited attribute.
# The column names are derived from df1 (every column except the target) instead
# of being retyped by hand, so they cannot drift out of step with the columns of X.
feature_names = df1.columns.drop('Survived')
df_fair = pd.DataFrame(X_test, columns=feature_names)

# Ground-truth outcome for each test row.
df_fair['label_value'] = y_test

# 'Children' was standardized together with the other features, so it no longer
# holds 0/1. Standardizing subtracts the training mean, which lies strictly between
# 0 and 1 when both values occur in the training split; an original 0 therefore
# becomes negative and an original 1 becomes positive, which is what `> 0` tests.
df_fair['Age_Level'] = np.where(df_fair['Children'] > 0, 'Children', 'Adults')

# Initialize Aequitas
g = Group()
b = Bias()

# Random Forest Classifier

In [17]:
# Train a shallow random forest (depth capped at 3, fixed seed) on the training split.
rf = RandomForestClassifier(random_state=0, max_depth=3).fit(X_train, y_train)
rf

RandomForestClassifier(max_depth=3, random_state=0)

In [18]:
# Confusion matrix of the random forest on the test data
rf_test_pred = rf.predict(X_test)
sklearn_metrics.confusion_matrix(y_true=y_test, y_pred=rf_test_pred)

array([[129,  10],
       [ 29,  55]])

In [19]:
# Accuracy of the random forest on the test data
rf.score(X=X_test, y=y_test)

0.8251121076233184

In [20]:
# Store the random forest's test-set predictions as the score column to audit.
df_fair['score'] = rf.predict(X_test)

# get_crosstabs returns a dataframe of the group counts and group value bias metrics
# (its second return value is not used here).
xtab, _ = g.get_crosstabs(df_fair, attr_cols=attributes_to_audit)

# Express each group's metrics relative to the configured reference group.
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df_fair,
    ref_groups_dict=attributes_and_reference_groups,
)

ap.disparity(bdf, metrics, 'Age_Level', fairness_threshold=disparity_tolerance)

get_disparity_predefined_group()


In [21]:
# Companion chart for the random forest audit: same metrics, attribute and threshold, via ap.absolute.
ap.absolute(bdf, metrics, 'Age_Level', fairness_threshold = disparity_tolerance)

# XGBoost Classifier

In [22]:
# Train an XGBoost classifier with the library's default hyperparameters.
xg = xgb.XGBClassifier().fit(X_train, y_train)
xg

XGBClassifier()

In [23]:
# Confusion matrix of the XGBoost model on the test data
xg_test_pred = xg.predict(X_test)
sklearn_metrics.confusion_matrix(y_true=y_test, y_pred=xg_test_pred)

array([[127,  12],
       [ 21,  63]])

In [24]:
# Accuracy of the XGBoost model on the test data
xg.score(X=X_test, y=y_test)

0.852017937219731

In [25]:
# Replace the score column with the XGBoost model's test-set predictions.
df_fair['score'] = xg.predict(X_test)

# get_crosstabs returns a dataframe of the group counts and group value bias metrics
# (its second return value is not used here).
xtab, _ = g.get_crosstabs(df_fair, attr_cols=attributes_to_audit)

# Express each group's metrics relative to the configured reference group.
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df_fair,
    ref_groups_dict=attributes_and_reference_groups,
)

ap.disparity(bdf, metrics, 'Age_Level', fairness_threshold=disparity_tolerance)

get_disparity_predefined_group()


In [26]:
# Companion chart for the XGBoost audit: same metrics, attribute and threshold, via ap.absolute.
ap.absolute(bdf, metrics, 'Age_Level', fairness_threshold = disparity_tolerance)

# Gradient Boosting Classifier

In [27]:
# Train a gradient boosting classifier with scikit-learn's default hyperparameters.
# (The stray bare `GradientBoostingClassifier` expression that preceded this was a no-op.)
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier()

In [28]:
# Confusion matrix of the gradient boosting model on the test data
gb_test_pred = gb.predict(X_test)
sklearn_metrics.confusion_matrix(y_true=y_test, y_pred=gb_test_pred)

array([[127,  12],
       [ 23,  61]])

In [29]:
# Accuracy of the gradient boosting model on the test data
gb.score(X=X_test, y=y_test)

0.8430493273542601

In [30]:
# Replace the score column with the gradient boosting model's test-set predictions.
df_fair['score'] = gb.predict(X_test)

# get_crosstabs returns a dataframe of the group counts and group value bias metrics
# (its second return value is not used here).
xtab, _ = g.get_crosstabs(df_fair, attr_cols=attributes_to_audit)

# Express each group's metrics relative to the configured reference group.
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df_fair,
    ref_groups_dict=attributes_and_reference_groups,
)

ap.disparity(bdf, metrics, 'Age_Level', fairness_threshold=disparity_tolerance)

get_disparity_predefined_group()


In [31]:
# Companion chart for the gradient boosting audit: same metrics, attribute and threshold, via ap.absolute.
ap.absolute(bdf, metrics, 'Age_Level', fairness_threshold = disparity_tolerance)

# KNN Classifier

In [32]:
# Train a k-nearest-neighbours classifier with scikit-learn's default hyperparameters.
# (The stray bare `KNeighborsClassifier` expression that preceded this was a no-op.)
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)

KNeighborsClassifier()

In [33]:
# Confusion matrix of the KNN model on the test data
kn_test_pred = kn.predict(X_test)
sklearn_metrics.confusion_matrix(y_true=y_test, y_pred=kn_test_pred)

array([[116,  23],
       [ 25,  59]])

In [34]:
# Accuracy of the KNN model on the test data
kn.score(X=X_test, y=y_test)

0.7847533632286996

In [35]:
# Replace the score column with the KNN model's test-set predictions.
df_fair['score'] = kn.predict(X_test)

# get_crosstabs returns a dataframe of the group counts and group value bias metrics
# (its second return value is not used here).
xtab, _ = g.get_crosstabs(df_fair, attr_cols=attributes_to_audit)

# Express each group's metrics relative to the configured reference group.
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=df_fair,
    ref_groups_dict=attributes_and_reference_groups,
)

ap.disparity(bdf, metrics, 'Age_Level', fairness_threshold=disparity_tolerance)

get_disparity_predefined_group()


In [36]:
# Companion chart for the KNN audit: same metrics, attribute and threshold, via ap.absolute.
ap.absolute(bdf, metrics, 'Age_Level', fairness_threshold = disparity_tolerance)

In [37]:
# Disparity columns for each group. bdf was last assigned in the KNN audit cell,
# so this table describes the KNN model.
group_id_cols = ['attribute_name', 'attribute_value']
disparity_cols = b.list_disparities(bdf)
bdf[group_id_cols + disparity_cols]

Unnamed: 0,attribute_name,attribute_value,ppr_disparity,pprev_disparity,precision_disparity,fdr_disparity,for_disparity,fpr_disparity,fnr_disparity,tpr_disparity,tnr_disparity,npv_disparity
0,Age_Level,Adults,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Age_Level,Children,0.261538,1.358547,1.080563,0.804954,1.605263,1.415205,0.916667,1.036232,0.922658,0.881321


In [38]:
# Absolute metric columns for each group, taken from the most recent crosstab.
absolute_metrics = g.list_absolute_metrics(xtab)
group_id_cols = ['attribute_name', 'attribute_value']
xtab[group_id_cols + absolute_metrics]

Unnamed: 0,attribute_name,attribute_value,tpr,tnr,for,fdr,fpr,fnr,npv,precision,ppr,pprev,prev
0,Age_Level,Adults,0.69697,0.842975,0.163934,0.292308,0.157025,0.30303,0.836066,0.707692,0.792683,0.347594,0.352941
1,Age_Level,Children,0.722222,0.777778,0.263158,0.235294,0.222222,0.277778,0.736842,0.764706,0.207317,0.472222,0.5


In [39]:
# Every crosstab column that is not an absolute metric (the per-group counts and identifiers).
is_not_metric = ~xtab.columns.isin(absolute_metrics)
xtab[xtab.columns[is_not_metric].tolist()]

Unnamed: 0,model_id,score_threshold,k,attribute_name,attribute_value,pp,pn,fp,fn,tn,tp,group_label_pos,group_label_neg,group_size,total_entities
0,0,binary 0/1,82,Age_Level,Adults,65,122,19,20,102,46,66,121,187,223
1,0,binary 0/1,82,Age_Level,Children,17,19,4,5,14,13,18,18,36,223
