In [1]:
%matplotlib inline

# Built-in
import csv
import os
import re

# Scientific/ML libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


# IPython display
from IPython.display import Markdown, display

# Scipy
from scipy.stats import chi2_contingency

# Scikit-learn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# AIF360 fairness library
from aif360.datasets import BinaryLabelDataset
from aif360.explainers import MetricTextExplainer
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing import Reweighing, DisparateImpactRemover
from aif360.algorithms.inprocessing import AdversarialDebiasing
from aif360.algorithms.postprocessing import EqOddsPostprocessing, RejectOptionClassification
from aif360.algorithms.postprocessing.calibrated_eq_odds_postprocessing import CalibratedEqOddsPostprocessing

# TensorFlow

# Seed for reproducibility
np.random.seed(1)


pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


## 1. Loading and Initial Data Exploration


In [2]:
df_original = pd.read_csv('dataset/diabetic_data.csv')
df_original.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
df = df_original.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [4]:
df['race'].describe()
df['gender'].describe()
df['age'].describe()
df.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


# 2. Data Cleaning

### Removing NaN

In [5]:
df.columns = df.columns.str.strip()

In [6]:
print(df['gender'].unique())

['Female' 'Male' 'Unknown/Invalid']


In [7]:
print(df['gender'][df['gender'] == 'Unknown/Invalid'].count())

3


In [8]:
df = df[df['gender'] != 'Unknown/Invalid']

In [9]:
print(df['age'].unique())

['[0-10)' '[10-20)' '[20-30)' '[30-40)' '[40-50)' '[50-60)' '[60-70)'
 '[70-80)' '[80-90)' '[90-100)']


In [10]:
print(df['race'].unique())

['Caucasian' 'AfricanAmerican' '?' 'Other' 'Asian' 'Hispanic']


In [11]:
print(df['race'][df['race'] == '?'].count())

2271


In [12]:
# Small value, can remove
df = df[df['race'] != '?']

In [13]:
# Target column
print(df['readmitted'].unique())
readmit_counts = df['readmitted'].value_counts()

# Prints the counts
print(readmit_counts)

['NO' '>30' '<30']
readmitted
NO     53316
>30    35007
<30    11169
Name: count, dtype: int64


In [14]:
for col in df.columns:
    if df[col].dtype == 'object':
        print(col, df[col][df[col] == '?'].count())

race 0
gender 0
age 0
weight 96433
payer_code 39711
medical_specialty 48766
diag_1 19
diag_2 336
diag_3 1349
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0


In [None]:
# removing features with high numbers of missing values
# does not get rid of A1C result since it is regarded as an important characteristics
df = df.drop(['encounter_id', 'patient_nbr', 'weight','payer_code'], axis='columns')
df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,Pediatrics-Endocrinology,41,0,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,?,59,0,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,?,11,5,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,?,44,1,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,?,51,0,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [16]:
# Drop NaNs
df = df.dropna(subset=['diag_1', 'diag_2', 'diag_3'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99492 entries, 0 to 101765
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      99492 non-null  object
 1   gender                    99492 non-null  object
 2   age                       99492 non-null  object
 3   admission_type_id         99492 non-null  int64 
 4   discharge_disposition_id  99492 non-null  int64 
 5   admission_source_id       99492 non-null  int64 
 6   time_in_hospital          99492 non-null  int64 
 7   medical_specialty         99492 non-null  object
 8   num_lab_procedures        99492 non-null  int64 
 9   num_procedures            99492 non-null  int64 
 10  num_medications           99492 non-null  int64 
 11  number_outpatient         99492 non-null  int64 
 12  number_emergency          99492 non-null  int64 
 13  number_inpatient          99492 non-null  int64 
 14  diag_1                    

In [17]:
# Replace dashes with underscores
df.columns = df.columns.str.replace('-', '_')

# 3. Encoding Dataset

In [18]:
df_encoded = df.copy()
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99492 entries, 0 to 101765
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      99492 non-null  object
 1   gender                    99492 non-null  object
 2   age                       99492 non-null  object
 3   admission_type_id         99492 non-null  int64 
 4   discharge_disposition_id  99492 non-null  int64 
 5   admission_source_id       99492 non-null  int64 
 6   time_in_hospital          99492 non-null  int64 
 7   medical_specialty         99492 non-null  object
 8   num_lab_procedures        99492 non-null  int64 
 9   num_procedures            99492 non-null  int64 
 10  num_medications           99492 non-null  int64 
 11  number_outpatient         99492 non-null  int64 
 12  number_emergency          99492 non-null  int64 
 13  number_inpatient          99492 non-null  int64 
 14  diag_1                    

In [19]:

# Gender
df_encoded['gender'] = df_encoded['gender'].map({'Female': 0, 'Male': 1})

# Race
df_encoded = pd.get_dummies(df_encoded, columns=['race'], prefix='race', drop_first=False)
df_encoded.insert(0, 'race', df['race'])
df_encoded['race'] = df_encoded['race'].map({'Other':0, 'AfricanAmerican': 1,'Asian': 2, 'Caucasian': 3, 'Hispanic': 4})

# Set new columns as 0/1 values for race
df_encoded = df_encoded.astype({'race_Caucasian': 'int', 'race_AfricanAmerican': 'int', 'race_Asian': 'int', 'race_Hispanic': 'int', 'race_Other': 'int'})


print(df_encoded[['gender', 'race', 'race_Caucasian', 'race_AfricanAmerican', 'race_Asian', 'race_Hispanic', 'race_Other']].head())



   gender  race  race_Caucasian  race_AfricanAmerican  race_Asian  \
0       0     3               1                     0           0   
1       0     3               1                     0           0   
2       0     1               0                     1           0   
3       1     3               1                     0           0   
4       1     3               1                     0           0   

   race_Hispanic  race_Other  
0              0           0  
1              0           0  
2              0           0  
3              0           0  
4              0           0  


In [20]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99492 entries, 0 to 101765
Data columns (total 51 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      99492 non-null  int64 
 1   gender                    99492 non-null  int64 
 2   age                       99492 non-null  object
 3   admission_type_id         99492 non-null  int64 
 4   discharge_disposition_id  99492 non-null  int64 
 5   admission_source_id       99492 non-null  int64 
 6   time_in_hospital          99492 non-null  int64 
 7   medical_specialty         99492 non-null  object
 8   num_lab_procedures        99492 non-null  int64 
 9   num_procedures            99492 non-null  int64 
 10  num_medications           99492 non-null  int64 
 11  number_outpatient         99492 non-null  int64 
 12  number_emergency          99492 non-null  int64 
 13  number_inpatient          99492 non-null  int64 
 14  diag_1                    

In [21]:
age_mapping = {
    '[0-10)': 5,
    '[10-20)': 15,
    '[20-30)': 25,
    '[30-40)': 35,
    '[40-50)': 45,
    '[50-60)': 55,
    '[60-70)': 65,
    '[70-80)': 75,
    '[80-90)': 85,
    '[90-100)': 95
}

df_encoded['age'] = df_encoded['age'].map(age_mapping)

print(df_encoded[['age']].head())

   age
0    5
1   15
2   25
3   35
4   45


In [22]:
for col in df_encoded.columns:
    print(f"Column: {col}")
    print(df_encoded[col].unique())
    print("-" * 50)

Column: race
[3 1 0 2 4]
--------------------------------------------------
Column: gender
[0 1]
--------------------------------------------------
Column: age
[ 5 15 25 35 45 55 65 75 85 95]
--------------------------------------------------
Column: admission_type_id
[6 1 2 3 4 5 8 7]
--------------------------------------------------
Column: discharge_disposition_id
[25  1  3  6  2  5 11  7 10  4 14 18  8 13 12 16 17 22 23  9 20 15 24 28
 19 27]
--------------------------------------------------
Column: admission_source_id
[ 1  7  2  4  5  6 20  3 17  8  9 14 10 22 11 25 13]
--------------------------------------------------
Column: time_in_hospital
[ 1  3  2  4  5 13 12  9  7 10 11  6  8 14]
--------------------------------------------------
Column: medical_specialty
['Pediatrics-Endocrinology' '?' 'InternalMedicine'
 'Family/GeneralPractice' 'Cardiology' 'Surgery-General' 'Orthopedics'
 'Gastroenterology' 'Surgery-Cardiovascular/Thoracic' 'Nephrology'
 'Orthopedics-Reconstructive' 

### Deleting columns with only 1 unique value

In [23]:
df_encoded = df_encoded.drop(columns=['examide', 'citoglipton'])
df_encoded = df_encoded.drop(columns=['metformin_rosiglitazone'])

In [24]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99492 entries, 0 to 101765
Data columns (total 48 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      99492 non-null  int64 
 1   gender                    99492 non-null  int64 
 2   age                       99492 non-null  int64 
 3   admission_type_id         99492 non-null  int64 
 4   discharge_disposition_id  99492 non-null  int64 
 5   admission_source_id       99492 non-null  int64 
 6   time_in_hospital          99492 non-null  int64 
 7   medical_specialty         99492 non-null  object
 8   num_lab_procedures        99492 non-null  int64 
 9   num_procedures            99492 non-null  int64 
 10  num_medications           99492 non-null  int64 
 11  number_outpatient         99492 non-null  int64 
 12  number_emergency          99492 non-null  int64 
 13  number_inpatient          99492 non-null  int64 
 14  diag_1                    

In [25]:
print(df[['medical_specialty', 'diag_1', 'diag_2', 'diag_3','max_glu_serum', 'A1Cresult', 'change', 'diabetesMed', 'readmitted']].nunique())

medical_specialty     73
diag_1               715
diag_2               746
diag_3               787
max_glu_serum          3
A1Cresult              3
change                 2
diabetesMed            2
readmitted             3
dtype: int64


In [26]:
print(df['max_glu_serum'].unique())
print(df['A1Cresult'].unique())
print(df['change'].unique())
print(df['diabetesMed'].unique())
print(df['readmitted'].unique())

[nan '>300' 'Norm' '>200']
[nan '>7' '>8' 'Norm']
['No' 'Ch']
['No' 'Yes']
['NO' '>30' '<30']


In [27]:
df_encoded[['medical_specialty', 'diag_1', 'diag_2', 'diag_3']] = df[['medical_specialty', 'diag_1', 'diag_2', 'diag_3']].replace('?', np.nan)

In [28]:
le = LabelEncoder()
df_encoded['medical_specialty'] = le.fit_transform(df_encoded['medical_specialty'])

label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)

{'AllergyandImmunology': np.int64(0), 'Anesthesiology': np.int64(1), 'Anesthesiology-Pediatric': np.int64(2), 'Cardiology': np.int64(3), 'Cardiology-Pediatric': np.int64(4), 'DCPTEAM': np.int64(5), 'Dentistry': np.int64(6), 'Dermatology': np.int64(7), 'Emergency/Trauma': np.int64(8), 'Endocrinology': np.int64(9), 'Endocrinology-Metabolism': np.int64(10), 'Family/GeneralPractice': np.int64(11), 'Gastroenterology': np.int64(12), 'Gynecology': np.int64(13), 'Hematology': np.int64(14), 'Hematology/Oncology': np.int64(15), 'Hospitalist': np.int64(16), 'InfectiousDiseases': np.int64(17), 'InternalMedicine': np.int64(18), 'Nephrology': np.int64(19), 'Neurology': np.int64(20), 'Neurophysiology': np.int64(21), 'Obsterics&Gynecology-GynecologicOnco': np.int64(22), 'Obstetrics': np.int64(23), 'ObstetricsandGynecology': np.int64(24), 'Oncology': np.int64(25), 'Ophthalmology': np.int64(26), 'Orthopedics': np.int64(27), 'Orthopedics-Reconstructive': np.int64(28), 'Osteopath': np.int64(29), 'Otolaryn

In [29]:
def transform_diagnosis_code(code):
    if pd.isna(code):
        return code
    code = str(code)  # Convert to string
    if code.startswith('V'):
        return float(code[1:]) + 1000  # Adjust 'V' codes
    elif code.startswith('E'):
        return float(code[1:]) + 300  # Adjust 'E' codes
    return float(code)

df_encoded['diag_1'] = df_encoded['diag_1'].apply(transform_diagnosis_code)
df_encoded['diag_2'] = df_encoded['diag_2'].apply(transform_diagnosis_code)
df_encoded['diag_3'] = df_encoded['diag_3'].apply(transform_diagnosis_code)

print(df_encoded['diag_1'].unique())

[ 250.83  276.    648.      8.    197.    414.    428.    398.    434.
  250.7   157.    518.    999.    410.    682.    402.   1057.    189.
  786.    427.    996.    277.    584.    462.    473.    411.    174.
  486.    998.    511.    432.    626.    295.    196.    250.6   182.
  845.    423.    808.    250.4   722.    403.    250.11  784.    707.
  440.    151.    715.    997.    198.    564.     38.    590.    578.
  250.32  433.    569.    185.    536.    255.    250.13  599.    558.
  574.    491.    560.    244.    250.03  577.    730.    188.    824.
  250.8   332.    562.    291.    296.    510.    401.    263.    438.
   70.    250.02  493.    642.    571.    738.    593.    618.    250.42
  807.    456.    446.    572.    575.    250.41  820.    515.    780.
  250.22  995.    235.    250.82  721.    787.    556.    162.    724.
  282.    514.   1055.    281.    250.33  530.    466.    435.    250.12
 1053.    789.    566.    822.    191.    557.    733.    455.    711.
  

In [30]:
def map_diagnosis(data, cols):

    for col in cols:
        data["temp_diag"] = np.nan
        data.loc[(data[col]>=390) & (data[col]<=459) | (data[col]==785), "temp_diag"] = "Circulatory"
        data.loc[(data[col]>=460) & (data[col]<=519) | (data[col]==786), "temp_diag"] = "Respiratory"
        data.loc[(data[col]>=520) & (data[col]<=579) | (data[col]==787), "temp_diag"] = "Digestive"
        data.loc[(data[col]>=250) & (data[col]<251), "temp_diag"] = "Diabetes"
        data.loc[(data[col]>=800) & (data[col]<=999), "temp_diag"] = "Injury"
        data.loc[(data[col]>=710) & (data[col]<=739), "temp_diag"] = "Muscoloskeletal"
        data.loc[(data[col]>=580) & (data[col]<=629) | (data[col] == 788), "temp_diag"] = "Genitourinary"
        data.loc[(data[col]>=140) & (data[col]<=239), "temp_diag"] = "Neoplasms"

        data["temp_diag"] = data["temp_diag"].fillna("Other")
        data[col] = data["temp_diag"]
        data = data.drop("temp_diag", axis=1)

    return data

df_encoded[['diag_1', 'diag_2', 'diag_3']] = df_encoded[['diag_1', 'diag_2', 'diag_3']].replace('?', np.nan)
df_encoded = map_diagnosis(df_encoded, ['diag_1', 'diag_2', 'diag_3'])

pd.set_option('display.max_columns', None)
display(df_encoded)

  data.loc[(data[col]>=390) & (data[col]<=459) | (data[col]==785), "temp_diag"] = "Circulatory"
  data.loc[(data[col]>=390) & (data[col]<=459) | (data[col]==785), "temp_diag"] = "Circulatory"
  data.loc[(data[col]>=390) & (data[col]<=459) | (data[col]==785), "temp_diag"] = "Circulatory"


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_pioglitazone,change,diabetesMed,readmitted,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other
0,3,0,5,6,25,1,1,37,41,0,1,0,0,0,Diabetes,Other,Other,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,0,0,1,0,0
1,3,0,15,1,1,7,3,72,59,0,18,0,0,0,Other,Diabetes,Other,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,Ch,Yes,>30,0,0,1,0,0
2,1,0,25,1,1,7,2,72,11,5,13,2,0,1,Other,Diabetes,Other,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,1,0,0,0,0
3,3,1,35,1,1,7,2,72,44,1,16,0,0,0,Other,Diabetes,Circulatory,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,Ch,Yes,NO,0,0,1,0,0
4,3,1,45,1,1,7,1,72,51,0,8,0,0,0,Neoplasms,Neoplasms,Diabetes,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Ch,Yes,NO,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,1,75,1,3,7,3,72,51,0,16,0,0,0,Diabetes,Other,Circulatory,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,Ch,Yes,>30,1,0,0,0,0
101762,1,0,85,1,4,5,5,72,33,3,18,0,0,1,Digestive,Other,Digestive,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Yes,NO,1,0,0,0,0
101763,3,1,75,1,1,7,1,72,53,0,9,1,0,0,Other,Genitourinary,Other,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,Ch,Yes,NO,0,0,1,0,0
101764,3,0,85,2,3,7,10,62,45,2,21,0,0,1,Injury,Other,Injury,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,Up,No,No,No,No,Ch,Yes,NO,0,0,1,0,0


In [31]:
df_encoded = pd.get_dummies(df_encoded, columns=['diag_1'], prefix='diag_1')

df_encoded = pd.get_dummies(df_encoded, columns=['diag_2'], prefix='diag_2')

df_encoded = pd.get_dummies(df_encoded, columns=['diag_3'], prefix='diag_3')

# Identify the columns related to diag_1, diag_2, and diag_3
diag_columns = [col for col in df_encoded.columns if col.startswith(('diag_1_', 'diag_2_', 'diag_3_'))]
df_encoded[diag_columns] = df_encoded[diag_columns].astype(int)

pd.set_option('display.max_columns', None)
display(df_encoded)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_pioglitazone,change,diabetesMed,readmitted,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Genitourinary,diag_1_Injury,diag_1_Muscoloskeletal,diag_1_Neoplasms,diag_1_Other,diag_1_Respiratory,diag_2_Circulatory,diag_2_Diabetes,diag_2_Digestive,diag_2_Genitourinary,diag_2_Injury,diag_2_Muscoloskeletal,diag_2_Neoplasms,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Genitourinary,diag_3_Injury,diag_3_Muscoloskeletal,diag_3_Neoplasms,diag_3_Other,diag_3_Respiratory
0,3,0,5,6,25,1,1,37,41,0,1,0,0,0,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,3,0,15,1,1,7,3,72,59,0,18,0,0,0,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,Ch,Yes,>30,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,0,25,1,1,7,2,72,11,5,13,2,0,1,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,3,1,35,1,1,7,2,72,44,1,16,0,0,0,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,Ch,Yes,NO,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,3,1,45,1,1,7,1,72,51,0,8,0,0,0,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Ch,Yes,NO,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,1,75,1,3,7,3,72,51,0,16,0,0,0,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,Ch,Yes,>30,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
101762,1,0,85,1,4,5,5,72,33,3,18,0,0,1,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Yes,NO,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
101763,3,1,75,1,1,7,1,72,53,0,9,1,0,0,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,Ch,Yes,NO,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
101764,3,0,85,2,3,7,10,62,45,2,21,0,0,1,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,Up,No,No,No,No,Ch,Yes,NO,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [32]:
# 'max_glu_serum'
df_encoded['max_glu_serum'] = df_encoded['max_glu_serum'].map({
    '>300': 2,
    '>200': 1,
    'Norm': 0
}).fillna(3)

# 'A1Cresult'
df_encoded['A1Cresult'] = df_encoded['A1Cresult'].map({
    '>8': 2,
    '>7': 1,
    'Norm': 0
}).fillna(3)

# 'change'
df_encoded['change'] = df_encoded['change'].map({
    'No': 0,
    'Ch': 1,
})

# 'diabetesMed'
df_encoded['diabetesMed'] = df_encoded['diabetesMed'].map({
    'Yes': 1,
    'No': 0
})

df_encoded['readmitted'] = df_encoded['readmitted'].map({
    '<30': 1,
    '>30': 0,
    'NO': 0
})

for col in ['max_glu_serum', 'A1Cresult','change', 'diabetesMed', 'readmitted']:
    print(f"Column: {col}")
    print(df_encoded[col].unique())
    print("-" * 50)

Column: max_glu_serum
[3. 2. 0. 1.]
--------------------------------------------------
Column: A1Cresult
[3. 1. 2. 0.]
--------------------------------------------------
Column: change
[0 1]
--------------------------------------------------
Column: diabetesMed
[0 1]
--------------------------------------------------
Column: readmitted
[0 1]
--------------------------------------------------


In [33]:
df_encoded['max_glu_serum'] = df_encoded['max_glu_serum'].astype(int)
df_encoded['A1Cresult'] = df_encoded['A1Cresult'].astype(int)

In [34]:
print(df_encoded.columns[df_encoded.columns.get_loc('A1Cresult') + 1:].tolist())

['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide_metformin', 'glipizide_metformin', 'glimepiride_pioglitazone', 'metformin_pioglitazone', 'change', 'diabetesMed', 'readmitted', 'race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other', 'diag_1_Circulatory', 'diag_1_Diabetes', 'diag_1_Digestive', 'diag_1_Genitourinary', 'diag_1_Injury', 'diag_1_Muscoloskeletal', 'diag_1_Neoplasms', 'diag_1_Other', 'diag_1_Respiratory', 'diag_2_Circulatory', 'diag_2_Diabetes', 'diag_2_Digestive', 'diag_2_Genitourinary', 'diag_2_Injury', 'diag_2_Muscoloskeletal', 'diag_2_Neoplasms', 'diag_2_Other', 'diag_2_Respiratory', 'diag_3_Circulatory', 'diag_3_Diabetes', 'diag_3_Digestive', 'diag_3_Genitourinary', 'diag_3_Injury', 'diag_3_Muscoloskeletal', 'diag_3_Neoplasms', 'diag_3_Other', 'diag

In [35]:
medicine_cols = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
                 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide_metformin', 'glipizide_metformin', 
                 'glimepiride_pioglitazone', 'metformin_pioglitazone']

med_mapping = {"Up": 2, "Down": 1, "Steady": 0, "No": 3}

df_encoded[medicine_cols] = df_encoded[medicine_cols].replace(med_mapping)

  df_encoded[medicine_cols] = df_encoded[medicine_cols].replace(med_mapping)


In [36]:
modified_column_names = []
for col in df_encoded.columns:
    if any(medicine_name in col for medicine_name in medicine_cols):
        modified_column_names.append(f"{col}_medicine")
    else:
        modified_column_names.append(col)

df_encoded.columns = modified_column_names

df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99492 entries, 0 to 101765
Data columns (total 72 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   race                               99492 non-null  int64
 1   gender                             99492 non-null  int64
 2   age                                99492 non-null  int64
 3   admission_type_id                  99492 non-null  int64
 4   discharge_disposition_id           99492 non-null  int64
 5   admission_source_id                99492 non-null  int64
 6   time_in_hospital                   99492 non-null  int64
 7   medical_specialty                  99492 non-null  int64
 8   num_lab_procedures                 99492 non-null  int64
 9   num_procedures                     99492 non-null  int64
 10  num_medications                    99492 non-null  int64
 11  number_outpatient                  99492 non-null  int64
 12  number_emergency      

In [37]:
# Save base encoded data train/test split for variable processing later

# 1) Split entire data into trainval/test
df_trainval, df_test = train_test_split(df_encoded, test_size=0.15, random_state=42)

# 2) (Option A) Further split trainval into train/val:
df_train, df_val = train_test_split(df_trainval, test_size=0.12, random_state=42)
# Save to CSV

df_train.to_csv('dataset/train_test_extracted_base/train.csv', index=False)
df_test.to_csv('dataset/train_test_extracted_base/test.csv', index=False)
df_val.to_csv('dataset/train_test_extracted_base/val.csv', index=False)

print("Train and test data saved successfully.")

Train and test data saved successfully.


In [None]:
import pandas as pd


overall_counts = df_original['readmitted'].value_counts(dropna=False)
overall_percentages = (overall_counts / len(df_original)) * 100

print("Overall readmitted Distribution:")
overall_dist_df = pd.DataFrame({
    'Count': overall_counts,
    'Percentage': overall_percentages
})
display(overall_dist_df)

def analyze_readmitted(df, group_col):
    pivot_table = df.groupby([group_col, 'readmitted']).size().unstack(fill_value=0)
    
    pivot_table['Total'] = pivot_table.sum(axis=1)
    
    # Percentage of the entire dataset for each group
    pivot_table['% of Dataset'] = pivot_table['Total'] / len(df) * 100
    
    # For each readmitted category, add "Count" and "Rate in Group"
    for cat in ['NO', '<30', '>30']:
        pivot_table[f'{cat} Count'] = pivot_table[cat]
        pivot_table[f'{cat} Rate in Group (%)'] = (pivot_table[cat] / pivot_table['Total']) * 100
    
    pivot_table['<30 vs NO'] = pivot_table.apply(
        lambda row: row['<30'] / row['NO'] if row['NO'] > 0 else float('nan'), axis=1
    )
    pivot_table['<30 vs >30'] = pivot_table.apply(
        lambda row: row['<30'] / row['>30'] if row['>30'] > 0 else float('nan'), axis=1
    )
    pivot_table['<30 vs All Others'] = pivot_table.apply(
        lambda row: row['<30'] / (row['NO'] + row['>30']) if (row['NO'] + row['>30']) > 0 else float('nan'), axis=1
    )
    
    # Reorder columns for clarity
    columns_order = [
        'Total',
        '% of Dataset',
        'NO Count', 'NO Rate in Group (%)',
        '<30 Count', '<30 Rate in Group (%)',
        '>30 Count', '>30 Rate in Group (%)',
        '<30 vs NO', '<30 vs >30', '<30 vs All Others'
    ]
    pivot_table = pivot_table[columns_order]
    
    print(f"\nAnalysis by '{group_col}':")
    display(pivot_table)
    return pivot_table

# 2) Analyze readmitted for the specified columns
for col in ['race', 'age', 'gender']:
    analyze_readmitted(df_original, col)


Overall readmitted Distribution:


Unnamed: 0_level_0,Count,Percentage
readmitted,Unnamed: 1_level_1,Unnamed: 2_level_1
NO,54864,53.911916
>30,35545,34.928169
<30,11357,11.159916



Analysis by 'race':


readmitted,Total,% of Dataset,NO Count,NO Rate in Group (%),<30 Count,<30 Rate in Group (%),>30 Count,>30 Rate in Group (%),<30 vs NO,<30 vs >30,<30 vs All Others
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
?,2273,2.233555,1547,68.059833,188,8.271007,538,23.66916,0.121526,0.349442,0.090168
AfricanAmerican,19210,18.876639,10421,54.247788,2155,11.218116,6634,34.534097,0.206794,0.324842,0.126356
Asian,641,0.629876,415,64.74259,65,10.140406,161,25.117005,0.156627,0.403727,0.112847
Caucasian,76099,74.778413,40383,53.0664,8592,11.290556,27124,35.643044,0.212763,0.316767,0.127276
Hispanic,2037,2.001651,1183,58.075601,212,10.407462,642,31.516937,0.179205,0.330218,0.116164
Other,1506,1.479866,915,60.756972,145,9.628154,446,29.614874,0.15847,0.325112,0.106539



Analysis by 'age':


readmitted,Total,% of Dataset,NO Count,NO Rate in Group (%),<30 Count,<30 Rate in Group (%),>30 Count,>30 Rate in Group (%),<30 vs NO,<30 vs >30,<30 vs All Others
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
[0-10),161,0.158206,132,81.987578,3,1.863354,26,16.149068,0.022727,0.115385,0.018987
[10-20),691,0.679009,427,61.794501,40,5.788712,224,32.416787,0.093677,0.178571,0.061444
[20-30),1657,1.628245,911,54.978877,236,14.242607,510,30.778515,0.259056,0.462745,0.16608
[30-40),3775,3.70949,2164,57.324503,424,11.231788,1187,31.443709,0.195933,0.357203,0.126529
[40-50),9685,9.516931,5380,55.549819,1027,10.604027,3278,33.846154,0.190892,0.313301,0.118619
[50-60),17256,16.956547,9671,56.044274,1668,9.666203,5917,34.289522,0.172474,0.2819,0.107005
[60-70),22483,22.09284,12084,53.747276,2502,11.128408,7897,35.124316,0.207051,0.316829,0.125219
[70-80),26068,25.615628,13524,51.879699,3069,11.773055,9475,36.347246,0.22693,0.323905,0.133441
[80-90),17197,16.898571,8896,51.729953,2078,12.083503,6223,36.186544,0.233588,0.333923,0.137443
[90-100),2793,2.744532,1675,59.971357,310,11.099177,808,28.929467,0.185075,0.383663,0.124849



Analysis by 'gender':


readmitted,Total,% of Dataset,NO Count,NO Rate in Group (%),<30 Count,<30 Rate in Group (%),>30 Count,>30 Rate in Group (%),<30 vs NO,<30 vs >30,<30 vs All Others
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Female,54708,53.758623,29038,53.07816,6152,11.245156,19518,35.676683,0.21186,0.315196,0.126699
Male,47055,46.238429,25823,54.878334,5205,11.061524,16027,34.060142,0.201564,0.324764,0.124373
Unknown/Invalid,3,0.002948,3,100.0,0,0.0,0,0.0,0.0,,0.0


# Bias Analysis

In [39]:
df_encoded.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,metformin_medicine,repaglinide_medicine,nateglinide_medicine,chlorpropamide_medicine,glimepiride_medicine,acetohexamide_medicine,glipizide_medicine,glyburide_medicine,tolbutamide_medicine,pioglitazone_medicine,rosiglitazone_medicine,acarbose_medicine,miglitol_medicine,troglitazone_medicine,tolazamide_medicine,insulin_medicine,glyburide_metformin_medicine,glipizide_metformin_medicine,glimepiride_pioglitazone_medicine,metformin_pioglitazone_medicine,change,diabetesMed,readmitted,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Genitourinary,diag_1_Injury,diag_1_Muscoloskeletal,diag_1_Neoplasms,diag_1_Other,diag_1_Respiratory,diag_2_Circulatory,diag_2_Diabetes,diag_2_Digestive,diag_2_Genitourinary,diag_2_Injury,diag_2_Muscoloskeletal,diag_2_Neoplasms,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Genitourinary,diag_3_Injury,diag_3_Muscoloskeletal,diag_3_Neoplasms,diag_3_Other,diag_3_Respiratory
0,3,0,5,6,25,1,1,37,41,0,1,0,0,0,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,3,0,15,1,1,7,3,72,59,0,18,0,0,0,9,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,0,25,1,1,7,2,72,11,5,13,2,0,1,6,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,3,1,35,1,1,7,2,72,44,1,16,0,0,0,7,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,3,1,45,1,1,7,1,72,51,0,8,0,0,0,5,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,0,3,3,3,3,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0


In [40]:
label_name = ['readmitted']
sensitive_attribute = ['gender']

df_bld_gender = BinaryLabelDataset(df=df_encoded, label_names=label_name, protected_attribute_names=sensitive_attribute, favorable_label=1, unfavorable_label=0)

privileged_group = [{'gender':1}]
unprivileged_group = [{'gender':0}]

train_metric = BinaryLabelDatasetMetric(df_bld_gender, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
explainer_org_train = MetricTextExplainer(train_metric)
print("Train set:", explainer_org_train.statistical_parity_difference())
print("Train set:", explainer_org_train.disparate_impact())

Train set: Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): 0.0018869230976938378
Train set: Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 1.0169619905788583


In [41]:
# Closer look at bias towards asian females
df_asian_women = df_encoded.copy()

# {'Other':0, 'AfricanAmerican': 1,'Asian': 2, 'Caucasian': 3, 'Hispanic': 4}
def combined_gender_race(row):
  if row['gender'] == 1 and row['race'] == 2:
    return 0
  else:
    return 1

df_asian_women['gender_race'] = df_asian_women.apply(combined_gender_race, axis=1)

In [42]:
gender_race_counts = df_asian_women['gender_race'].value_counts()

print("Number of 0s:", gender_race_counts.get(0, 0))
print("Number of 1s:", gender_race_counts.get(1, 0))

Number of 0s: 323
Number of 1s: 99169


In [43]:


df_asian_women = df_asian_women.drop(columns=['gender', 'race', 'race_AfricanAmerican', 'race_Asian', 'race_Hispanic', 'race_Other', 'race_Caucasian'])
new_order = ['gender_race'] + [col for col in df_asian_women.columns if col != 'gender_race']
df_asian_women = df_asian_women[new_order]

pd.set_option('display.max_columns', None)
df_asian_women


Unnamed: 0,gender_race,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,metformin_medicine,repaglinide_medicine,nateglinide_medicine,chlorpropamide_medicine,glimepiride_medicine,acetohexamide_medicine,glipizide_medicine,glyburide_medicine,tolbutamide_medicine,pioglitazone_medicine,rosiglitazone_medicine,acarbose_medicine,miglitol_medicine,troglitazone_medicine,tolazamide_medicine,insulin_medicine,glyburide_metformin_medicine,glipizide_metformin_medicine,glimepiride_pioglitazone_medicine,metformin_pioglitazone_medicine,change,diabetesMed,readmitted,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Genitourinary,diag_1_Injury,diag_1_Muscoloskeletal,diag_1_Neoplasms,diag_1_Other,diag_1_Respiratory,diag_2_Circulatory,diag_2_Diabetes,diag_2_Digestive,diag_2_Genitourinary,diag_2_Injury,diag_2_Muscoloskeletal,diag_2_Neoplasms,diag_2_Other,diag_2_Respiratory,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_Genitourinary,diag_3_Injury,diag_3_Muscoloskeletal,diag_3_Neoplasms,diag_3_Other,diag_3_Respiratory
0,1,5,6,25,1,1,37,41,0,1,0,0,0,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,1,15,1,1,7,3,72,59,0,18,0,0,0,9,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,25,1,1,7,2,72,11,5,13,2,0,1,6,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,1,35,1,1,7,2,72,44,1,16,0,0,0,7,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,1,45,1,1,7,1,72,51,0,8,0,0,0,5,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,0,3,3,3,3,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,75,1,3,7,3,72,51,0,16,0,0,0,9,3,2,0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
101762,1,85,1,4,5,5,72,33,3,18,0,0,1,9,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
101763,1,75,1,1,7,1,72,53,0,9,1,0,0,13,3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
101764,1,85,2,3,7,10,62,45,2,21,0,0,1,9,3,3,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,2,3,3,3,3,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [44]:
df_asian_women_train, df_asian_women_test = train_test_split(df_asian_women, test_size=0.3, stratify=df_asian_women[['gender_race', 'readmitted']], random_state=42)
df_asian_women_val, df_asian_women_test = train_test_split(df_asian_women_test, test_size=0.5, stratify=df_asian_women_test[['gender_race', 'readmitted']], random_state=42)

In [45]:

label_name = ['readmitted']
sensitive_attribute = ['gender_race']

train_bld = BinaryLabelDataset(df=df_asian_women_train, label_names=label_name, protected_attribute_names=sensitive_attribute, favorable_label=1, unfavorable_label=0)
val_bld = BinaryLabelDataset(df=df_asian_women_val, label_names=label_name, protected_attribute_names=sensitive_attribute, favorable_label=1, unfavorable_label=0)
test_bld = BinaryLabelDataset(df=df_asian_women_test, label_names=label_name, protected_attribute_names=sensitive_attribute, favorable_label=1, unfavorable_label=0)


privileged_group = [{'gender_race':0}]
unprivileged_group = [{'gender_race':1}]

train_metric = BinaryLabelDatasetMetric(train_bld, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
explainer_org_train = MetricTextExplainer(train_metric)
print("Train set:", explainer_org_train.statistical_parity_difference())
print("Train set:", explainer_org_train.disparate_impact())

val_metric = BinaryLabelDatasetMetric(val_bld, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
explainer_org_val = MetricTextExplainer(val_metric)
print("Validation set:", explainer_org_val.statistical_parity_difference())
print("Validation set:", explainer_org_val.disparate_impact())

test_metric = BinaryLabelDatasetMetric(test_bld, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
explainer_org_test = MetricTextExplainer(test_metric)
print("Test set:", explainer_org_test.statistical_parity_difference())
print("Test set:", explainer_org_test.disparate_impact())

Train set: Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.016099851177310623
Train set: Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.8745321942733724
Validation set: Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.01280586179080398
Validation set: Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.8975531056735682
Test set: Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.010247298919567827
Test set: Disparate impact (probability of favorable outcome for unprivileged instances / prob

In [46]:
# Scaling the data
scaler = StandardScaler()
scaler.fit(train_bld.features)

x_train = scaler.transform(train_bld.features)
y_train = train_bld.labels.ravel()
w_train = train_bld.instance_weights.ravel()

x_val = scaler.transform(val_bld.features)
y_val = val_bld.labels.ravel()

x_test = scaler.transform(test_bld.features)
y_test = test_bld.labels.ravel()

print("Train data:", "x_train", x_train.shape, "y_train", y_train.shape)
print("Validation data:", "x_val", x_val.shape, "y_val", y_val.shape)
print("Test data:", "x_test", x_test.shape, "y_test", y_test.shape)

Train data: x_train (69644, 65) y_train (69644,)
Validation data: x_val (14924, 65) y_val (14924,)
Test data: x_test (14924, 65) y_test (14924,)


In [47]:
# Training Logistic Regression model
model = LogisticRegression(solver='liblinear', random_state=1)

LR_model = model.fit(x_train, y_train, sample_weight=train_bld.instance_weights)

y_train_pred = LR_model.predict(x_train)

# positive class index
pos_idx = np.where(LR_model.classes_ == train_bld.favorable_label)[0][0]

train_bld_pred = train_bld.copy()
train_bld_pred = y_train_pred

In [48]:
valid_bld_pred = val_bld.copy(deepcopy=True)
valid_bld_pred.scores = LR_model.predict_proba(x_val)[:, pos_idx].reshape(-1, 1)

test_bld_pred = test_bld.copy(deepcopy=True)
test_bld_pred.scores = LR_model.predict_proba(x_test)[:, pos_idx].reshape(-1, 1)

In [49]:
num_thresh = 100
balanced_acc = np.zeros(num_thresh)
class_threshold = np.linspace(0.01, 0.99, num_thresh)

for idx, class_thresh in enumerate(class_threshold):

    fav_idx = valid_bld_pred.scores > class_thresh
    valid_bld_pred.labels[fav_idx] = valid_bld_pred.favorable_label
    valid_bld_pred.labels[~fav_idx] = valid_bld_pred.unfavorable_label

    # computing metrics based on two BinaryLabelDatasets: a dataset containing groud-truth labels and a dataset containing predictions
    classified_metric_orig_valid = ClassificationMetric(val_bld,
                                                        valid_bld_pred,
                                                        unprivileged_groups=unprivileged_group,
                                                        privileged_groups=privileged_group)

    balanced_acc[idx] = 0.5 * (classified_metric_orig_valid.true_positive_rate() + classified_metric_orig_valid.true_negative_rate())

best_idx = np.where(balanced_acc == np.max(balanced_acc))[0][0]
best_class_thresh = class_threshold[best_idx]

print("Best balanced acuuracy (no fairness constraints) = %.4f" % np.max(balanced_acc))
print("Optimal classification threshold (no fairness constraints) = %.4f" % best_class_thresh)

Best balanced acuuracy (no fairness constraints) = 0.5847
Optimal classification threshold (no fairness constraints) = 0.0991


In [50]:
test_metrics = {
    'balanced accuracy': [],
    'equal opportunity difference': [],
    'average odds difference': [],
    'statistical parity difference': [],
    'Disparate Impact': []
}

print("Classification threshold = %.4f" % best_class_thresh)

for thresh in tqdm(class_threshold):

    fav_idx = test_bld_pred.scores > thresh
    test_bld_pred.labels[fav_idx] = test_bld_pred.favorable_label
    test_bld_pred.labels[~fav_idx] = test_bld_pred.unfavorable_label

    classification_metric_orig_test = ClassificationMetric(test_bld,
                                                           test_bld_pred,
                                                           unprivileged_groups=unprivileged_group,
                                                           privileged_groups=privileged_group)

    balanced_acc = 0.5 * (classification_metric_orig_test.true_positive_rate() + classification_metric_orig_test.true_negative_rate())

    acc = classification_metric_orig_test.accuracy()

    eq_opp_diff = classification_metric_orig_test.equal_opportunity_difference()

    avg_odd_diff = classification_metric_orig_test.average_odds_difference()

    spd = classification_metric_orig_test.statistical_parity_difference()

    disparate_impact = classification_metric_orig_test.disparate_impact()

    theil_idx = classification_metric_orig_test.theil_index()

    if thresh == best_class_thresh:
        display(Markdown(" ##### Metrics using the optimal classification threshold on test set"))
        print("Accuracy:", acc)
        print("Balanced Accuracy:", balanced_acc)
        print("Equal opportunity difference", eq_opp_diff)
        print("Average odds difference", avg_odd_diff)
        print("Statistical parity difference", spd)
        print("Disparate Impact", disparate_impact)
        print("Theil index", theil_idx)

    test_metrics['balanced accuracy'].append(balanced_acc)
    test_metrics['equal opportunity difference'].append(classification_metric_orig_test.equal_opportunity_difference())
    test_metrics['average odds difference'].append(classification_metric_orig_test.average_odds_difference())
    test_metrics['statistical parity difference'].append(classification_metric_orig_test.statistical_parity_difference())
    test_metrics['Disparate Impact'].append(classification_metric_orig_test.disparate_impact())

Classification threshold = 0.0991


  0%|          | 0/100 [00:00<?, ?it/s]

 ##### Metrics using the optimal classification threshold on test set

 23%|██▎       | 23/100 [00:00<00:00, 221.68it/s]

Accuracy: 0.552130796033235
Balanced Accuracy: 0.5944225134422987
Equal opportunity difference -0.35230677052127024
Average odds difference -0.16699880788389657
Statistical parity difference -0.028993997599039645
Disparate Impact 0.9431717647058823
Theil index 0.09947693209588412


  return metric_fun(privileged=False) / metric_fun(privileged=True)
  return metric_fun(privileged=False) / metric_fun(privileged=True)
100%|██████████| 100/100 [00:00<00:00, 232.95it/s]


In [51]:
# Closer look at bias towards asian females
df_other_women = df_encoded.copy()

# {'Other':0, 'AfricanAmerican': 1,'Asian': 2, 'Caucasian': 3, 'Hispanic': 4}
def combined_gender_race(row):
  if row['gender'] == 1 and row['race'] == 0:
    return 0
  else:
    return 1

df_other_women['gender_race'] = df_other_women.apply(combined_gender_race, axis=1)
gender_race_counts = df_other_women['gender_race'].value_counts()

print("Number of 0s:", gender_race_counts.get(0, 0))
print("Number of 1s:", gender_race_counts.get(1, 0))


df_other_women = df_other_women.drop(columns=['gender', 'race', 'race_AfricanAmerican', 'race_Asian', 'race_Hispanic', 'race_Other', 'race_Caucasian'])
new_order = ['gender_race'] + [col for col in df_other_women.columns if col != 'gender_race']
df_other_women = df_other_women[new_order]

pd.set_option('display.max_columns', None)
df_other_women

df_other_women_train, df_other_women_test = train_test_split(df_other_women, test_size=0.3, stratify=df_other_women[['gender_race', 'readmitted']], random_state=42)
df_other_women_val, df_other_women_test = train_test_split(df_other_women_test, test_size=0.5, stratify=df_other_women_test[['gender_race', 'readmitted']], random_state=42)

label_name = ['readmitted']
sensitive_attribute = ['gender_race']

train_bld = BinaryLabelDataset(df=df_other_women_train, label_names=label_name, protected_attribute_names=sensitive_attribute, favorable_label=1, unfavorable_label=0)
val_bld = BinaryLabelDataset(df=df_other_women_val, label_names=label_name, protected_attribute_names=sensitive_attribute, favorable_label=1, unfavorable_label=0)
test_bld = BinaryLabelDataset(df=df_other_women_test, label_names=label_name, protected_attribute_names=sensitive_attribute, favorable_label=1, unfavorable_label=0)


privileged_group = [{'gender_race':1}]
unprivileged_group = [{'gender_race':0}]

train_metric = BinaryLabelDatasetMetric(train_bld, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
explainer_org_train = MetricTextExplainer(train_metric)
print("Train set:", explainer_org_train.statistical_parity_difference())
print("Train set:", explainer_org_train.disparate_impact())

val_metric = BinaryLabelDatasetMetric(val_bld, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
explainer_org_val = MetricTextExplainer(val_metric)
print("Validation set:", explainer_org_val.statistical_parity_difference())
print("Validation set:", explainer_org_val.disparate_impact())

test_metric = BinaryLabelDatasetMetric(test_bld, unprivileged_groups=unprivileged_group, privileged_groups=privileged_group)
explainer_org_test = MetricTextExplainer(test_metric)
print("Test set:", explainer_org_test.statistical_parity_difference())
print("Test set:", explainer_org_test.disparate_impact())

Number of 0s: 757
Number of 1s: 98735
Train set: Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.021871384494089888
Train set: Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.805479491966989
Validation set: Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.02392087201392412
Validation set: Disparate impact (probability of favorable outcome for unprivileged instances / probability of favorable outcome for privileged instances): 0.7872119907518137
Test set: Statistical parity difference (probability of favorable outcome for unprivileged instances - probability of favorable outcome for privileged instances): -0.02470473956667496
Test set: Disparate impact (probability of favorable outco

In [52]:
# Scaling the data
scaler = StandardScaler()
scaler.fit(train_bld.features)

x_train = scaler.transform(train_bld.features)
y_train = train_bld.labels.ravel()
w_train = train_bld.instance_weights.ravel()

x_val = scaler.transform(val_bld.features)
y_val = val_bld.labels.ravel()

x_test = scaler.transform(test_bld.features)
y_test = test_bld.labels.ravel()

print("Train data:", "x_train", x_train.shape, "y_train", y_train.shape)
print("Validation data:", "x_val", x_val.shape, "y_val", y_val.shape)
print("Test data:", "x_test", x_test.shape, "y_test", y_test.shape)
# Training Logistic Regression model
model = LogisticRegression(solver='liblinear', random_state=1)

LR_model = model.fit(x_train, y_train, sample_weight=train_bld.instance_weights)

y_train_pred = LR_model.predict(x_train)

# positive class index
pos_idx = np.where(LR_model.classes_ == train_bld.favorable_label)[0][0]

train_bld_pred = train_bld.copy()
train_bld_pred = y_train_pred
valid_bld_pred = val_bld.copy(deepcopy=True)
valid_bld_pred.scores = LR_model.predict_proba(x_val)[:, pos_idx].reshape(-1, 1)

test_bld_pred = test_bld.copy(deepcopy=True)
test_bld_pred.scores = LR_model.predict_proba(x_test)[:, pos_idx].reshape(-1, 1)
num_thresh = 100
balanced_acc = np.zeros(num_thresh)
class_threshold = np.linspace(0.01, 0.99, num_thresh)

for idx, class_thresh in enumerate(class_threshold):

    fav_idx = valid_bld_pred.scores > class_thresh
    valid_bld_pred.labels[fav_idx] = valid_bld_pred.favorable_label
    valid_bld_pred.labels[~fav_idx] = valid_bld_pred.unfavorable_label

    # computing metrics based on two BinaryLabelDatasets: a dataset containing groud-truth labels and a dataset containing predictions
    classified_metric_orig_valid = ClassificationMetric(val_bld,
                                                        valid_bld_pred,
                                                        unprivileged_groups=unprivileged_group,
                                                        privileged_groups=privileged_group)

    balanced_acc[idx] = 0.5 * (classified_metric_orig_valid.true_positive_rate() + classified_metric_orig_valid.true_negative_rate())

best_idx = np.where(balanced_acc == np.max(balanced_acc))[0][0]
best_class_thresh = class_threshold[best_idx]

print("Best balanced acuuracy (no fairness constraints) = %.4f" % np.max(balanced_acc))
print("Optimal classification threshold (no fairness constraints) = %.4f" % best_class_thresh)
test_metrics = {
    'balanced accuracy': [],
    'equal opportunity difference': [],
    'average odds difference': [],
    'statistical parity difference': [],
    'Disparate Impact': []
}

print("Classification threshold = %.4f" % best_class_thresh)


Train data: x_train (69644, 65) y_train (69644,)
Validation data: x_val (14924, 65) y_val (14924,)
Test data: x_test (14924, 65) y_test (14924,)
Best balanced acuuracy (no fairness constraints) = 0.5942
Optimal classification threshold (no fairness constraints) = 0.1090
Classification threshold = 0.1090


In [53]:

for thresh in tqdm(class_threshold):

    fav_idx = test_bld_pred.scores > thresh
    test_bld_pred.labels[fav_idx] = test_bld_pred.favorable_label
    test_bld_pred.labels[~fav_idx] = test_bld_pred.unfavorable_label

    classification_metric_orig_test = ClassificationMetric(test_bld,
                                                           test_bld_pred,
                                                           unprivileged_groups=unprivileged_group,
                                                           privileged_groups=privileged_group)

    balanced_acc = 0.5 * (classification_metric_orig_test.true_positive_rate() + classification_metric_orig_test.true_negative_rate())

    acc = classification_metric_orig_test.accuracy()

    eq_opp_diff = classification_metric_orig_test.equal_opportunity_difference()

    avg_odd_diff = classification_metric_orig_test.average_odds_difference()

    spd = classification_metric_orig_test.statistical_parity_difference()

    disparate_impact = classification_metric_orig_test.disparate_impact()

    theil_idx = classification_metric_orig_test.theil_index()

    if thresh == best_class_thresh:
        display(Markdown(" ##### Metrics using the optimal classification threshold on test set"))
        print("Accuracy:", acc)
        print("Balanced Accuracy:", balanced_acc)
        print("Equal opportunity difference", eq_opp_diff)
        print("Average odds difference", avg_odd_diff)
        print("Statistical parity difference", spd)
        print("Disparate Impact", disparate_impact)
        print("Theil index", theil_idx)

    test_metrics['balanced accuracy'].append(balanced_acc)
    test_metrics['equal opportunity difference'].append(classification_metric_orig_test.equal_opportunity_difference())
    test_metrics['average odds difference'].append(classification_metric_orig_test.average_odds_difference())
    test_metrics['statistical parity difference'].append(classification_metric_orig_test.statistical_parity_difference())
    test_metrics['Disparate Impact'].append(classification_metric_orig_test.disparate_impact())

  0%|          | 0/100 [00:00<?, ?it/s]

 ##### Metrics using the optimal classification threshold on test set

 18%|█▊        | 18/100 [00:00<00:00, 169.77it/s]

Accuracy: 0.6402439024390244
Balanced Accuracy: 0.5984143663898036
Equal opportunity difference -0.4471471471471472
Average odds difference -0.3018586775908952
Statistical parity difference -0.18695760332634423
Disparate Impact 0.49629941690682955
Theil index 0.11110999416536257


  return metric_fun(privileged=False) / metric_fun(privileged=True)
100%|██████████| 100/100 [00:00<00:00, 154.04it/s]
