In [67]:
from typing import Callable, Dict, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import scipy
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics, tree
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
np.seterr(all="ignore")
import warnings
warnings.filterwarnings('ignore')

In [68]:
# Load data
# df = pd.read_csv("data/heart_2020_cleaned.csv")
# train, val = train_test_split(df, test_size=0.3, random_state=42, stratify=df["HeartDisease"])
# val, test = train_test_split(val, test_size=0.5, random_state=42, stratify=val["HeartDisease"])
# train.to_csv("data/heart_train.csv", index=False)
# val.to_csv("data/heart_val.csv", index=False)
# test.to_csv("data/heart_test.csv", index=False)

In [69]:
train = pd.read_csv('data/heart_train.csv')
val = pd.read_csv('data/heart_val.csv')
test = pd.read_csv('data/heart_test.csv')

In [70]:
# Preprocessing data

categorical_features = [
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "DiffWalking",
    "Sex",
    "AgeCategory",
    "Race",
    "Diabetic",
    "PhysicalActivity",
    "GenHealth",
    "Asthma",
    "KidneyDisease",
    "SkinCancer"
]

continuos_features = [
    "BMI",
    "PhysicalHealth",
    "MentalHealth"
]

target_variable = "HeartDisease"

def data_preprocessing(data: pd.DataFrame, categorical_features: List[str], continuous_features: List[str], target_variable: str):

    df = data.copy()

    # protected variables
    sex = df["Sex"].values
    age = df["AgeCategory"].values
    race = df["Race"].values

    # target
    target = df[target_variable].values

    df_processed = df[categorical_features + continuous_features]
    df_processed = pd.get_dummies(df_processed, prefix=None, prefix_sep='_', dummy_na=False, columns=categorical_features, drop_first=True)

    return df_processed, df, target, sex, age, race

#df_processed, df_original, target, sex, age, race = data_preprocessing(df, categorical_features=categorical_features, continuous_features=continuos_features, target_variable=target_variable)
train_processed, train_original, train_target, train_sex, train_age, train_race = data_preprocessing(train, categorical_features=categorical_features, continuous_features=continuos_features, target_variable=target_variable)
val_processed, val_original, val_target, val_sex, val_age, val_race = data_preprocessing(val, categorical_features=categorical_features, continuous_features=continuos_features, target_variable=target_variable)

In [71]:
from typing import List


def reproject_features(data: pd.DataFrame, protected_cols: List[str], nonprotected_cols: List[str]) -> np.ndarray:
    """
    generate a fair representation of nonprotected columns which are independent from any columns in protected_cols
    
    data : pd.DataFrame
        dataframe with columns to be projected
    protected_cols : List[str]
        list of protected columns
    nonprotected_cols : List[str]
        list of non-protected columns   
    """
    # make a copy of data
    df: pd.DataFrame = data.copy()
    ## df is our data
    # extract data about protected columns
    nonprotect: np.ndarray = df[nonprotected_cols].values
    protect: np.ndarray = df[protected_cols].values
    # extract data about nonprotected columns
    debiased_nonprotect: np.ndarray = df[nonprotected_cols].values
    # crease an orthonormal basis
    base_protect: np.ndarray = scipy.linalg.orth(protect)

    # go through all protected attributes and calculate their contribution to the reprojection to the hyperplane
    for j in range(debiased_nonprotect.shape[1]):
        debiased_nonprotect[:,j] -= base_protect @ base_protect.T @ nonprotect[:,j]
    return debiased_nonprotect

def reproject_features_w_regul(data: pd.DataFrame, protected_cols: List[str], nonprotected_cols: List[str], lambda_: float) -> np.ndarray:
    """
    generate a fair representation of nonprotected columns which are independent from any columns in protected_cols
    dat_: a data frame
    protected_cols: list of strings, the protected columns
    nonprotected_col: string, all other data columns 
    lambda_: float number between 0 and 1, 0 means totally fair; 1 means same as raw data
    """
    
    # run the normal reproject_features function
    r: np.ndarray = reproject_features(data, protected_cols, nonprotected_cols)
    
    # extract data about nonprotected variables
    nonprotect: np.ndarray = data[nonprotected_cols].values
    # standardize columns

    return r + lambda_*(nonprotect - r)

In [72]:
# just renaming stuff
# X_train, X_val = train_processed.values, val_processed.values
y_train, y_val = train_target, val_target

In [84]:
# Fairness metric function
import itertools

def equailized_odds(preds: np.ndarray, groups: np.ndarray, test: np.ndarray, sum_of_differences: bool = True, verbose: bool = False) -> Union[float, Dict]:
    """
    Calculates the equailized odds of a binary classification problem.
    preds: predictions of the model
    groups: group labels of the test data
    test: test data
    sum_of_differences: if True, the sum of the differences is returned, else the mean of the differences is returned
    verbose: if True, prints the results
    """

    df = pd.DataFrame(list(zip(preds, groups, test)), columns=['preds', 'groups', 'test'])
    
    # save all results
    all_results = {}

    total_class_difference = 0
    for target in df['test'].unique():
        results = {}
        for group in df['groups'].unique():
            
            # get the group and amount of corrects in the group
            selection = df.loc[(df['test'] == target) & (df['groups'] == group)]
            corrects = selection.loc[selection['preds'] == 'Yes']
    
            # if there are no corrects in the group, skip
            if len(corrects) == 0:
                if target == 'Yes':
                    results[group] = 0
                continue

            # get the odds ratio
            score = round(len(corrects) / len(selection), 3)

            # add the score to the results
            results[group] = score

            if verbose:
                print(f'Target [{target}] and group [{group}]: {score} ')
    
        class_differences = np.std(list(results.values()))
        
        if verbose:
            print(results)
            print(f'Class differences std: {class_differences}')
                

        # sum up differences or take average
        total_class_difference += class_differences

        all_results[target] = results

    if verbose:
        print(f'Total class difference: {total_class_difference}')

    print(f'Total class difference: {total_class_difference}')
    return total_class_difference, all_results

preds = [0,1,1,1,1]
preds = ['No', 'Yes', 'Yes', 'Yes', 'Yes']
groups = ["A", "A", "B", "B", "C"]
test = [0,0,1,1,1]
test = ['No', 'No', 'Yes', 'Yes', 'Yes']

print(equailized_odds(preds, groups, test, verbose=True, sum_of_differences=True))

Target [No] and group [A]: 0.5 
{'A': 0.5}
Class differences std: 0.0
Target [Yes] and group [B]: 1.0 
Target [Yes] and group [C]: 1.0 
{'A': 0, 'B': 1.0, 'C': 1.0}
Class differences std: 0.4714045207910317
Total class difference: 0.4714045207910317
Total class difference: 0.4714045207910317
(0.4714045207910317, {'No': {'A': 0.5}, 'Yes': {'A': 0, 'B': 1.0, 'C': 1.0}})


# Optuna Study

In [87]:
optuna.logging.set_verbosity(1)


class Objective(object):
    def __init__(
        self,
        X_train: np.ndarray,
        X_val: np.ndarray,
        y_train: np.ndarray,
        y_val: np.ndarray,
        group_val: np.ndarray,
        evaluation_func: Callable,
    ):
        self.X_train = X_train
        self.X_val = X_val
        self.y_train = y_train
        self.y_val = y_val
        self.group_val = group_val
        self.evaluation_func = evaluation_func

    def __call__(self, trial) -> float:
        """This method is called by Optuna to compute the objective
        function."""
        # Initialize general hyper parameters

        params = {
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
            "max_depth": trial.suggest_int("max_depth", 5, 29, step=2),
            #"min_samples_split": trial.suggest_loguniform("min_samples_split", 1e-3, 0.01),
            "min_samples_split": trial.suggest_float("min_samples_split", 1e-5, 0.01),
            #"min_samples_leaf": trial.suggest_loguniform("min_samples_leaf", 1e-3, 0.01),
            "min_samples_leaf": trial.suggest_float("min_samples_leaf", 1e-5, 0.01),
        }

        # parameters for fitting a model
        whitebox_model = DecisionTreeClassifier(
            **params, random_state=42).fit(self.X_train, self.y_train)

        preds: np.ndarray = whitebox_model.predict(self.X_val)

        return self.evaluation_func(preds, self.group_val, self.y_val, verbose = True)[0], metrics.f1_score(self.y_val, preds, labels = ['Yes'], pos_label = 'Yes')
        #return metrics.f1_score(self.y_val, preds, labels = ['Yes'], pos_label = 'Yes')

In [91]:
# Run optimization
sampler = optuna.samplers.TPESampler() 

study = optuna.create_study(directions=["minimize", "maximize"],
#study = optuna.create_study(direction="maximize",
                            sampler=sampler,
                            pruner=optuna.pruners.MedianPruner(
                                n_startup_trials=2, n_warmup_steps=5, interval_steps=3
                                ),
                            )

# Scaling
# Scale continuous variables
scaler = ColumnTransformer(
    [('scaler', StandardScaler(), continuos_features)], remainder='passthrough')
X_train = scaler.fit_transform(train_processed)
X_val = scaler.transform(val_processed)

ros = RandomOverSampler(random_state =42)
X_train_resampled , y_train_resampled , = ros.fit_resample(X_train , y_train)
# Define objective
#objective = Objective(X_train, X_val, y_train, y_val, val_race, equailized_odds)
objective = Objective(X_train_resampled, X_val, y_train_resampled, y_val, val_race, equailized_odds)

# Make a study to optimize the objective.
study.optimize(objective, n_trials=10, n_jobs=1, show_progress_bar=True)

print(study.best_trials)

#best_params = study.best_params

scaler = ColumnTransformer(
    [('scaler', StandardScaler(), continuos_features)], remainder='passthrough')
    
whitebox_model = Pipeline(steps=[('scaler', scaler), ('clf', DecisionTreeClassifier(**best_params, random_state = 42))])
# whitebox_model = Pipeline(steps=[('scaler', scaler), ('clf', DecisionTreeClassifier(random_state = 42))])

whitebox_model.fit(train_processed, y_train)
y_pred_whitebox = whitebox_model.predict(val_processed)
y_pred_proba_whitebox = whitebox_model.predict_proba(val_processed)

print(Counter(y_pred_whitebox))

print(classification_report(y_val, y_pred_whitebox))

print(equailized_odds(y_pred_whitebox, val_race, y_val, verbose=False))

[32m[I 2022-05-22 13:41:40,909][0m A new study created in memory with name: no-name-4bad823c-959c-4307-8c5a-a2306bf867ad[0m
  0%|          | 0/10 [00:00<?, ?it/s]

Target [No] and group [White]: 0.319 
Target [No] and group [Black]: 0.332 
Target [No] and group [Other]: 0.265 
Target [No] and group [Asian]: 0.176 
Target [No] and group [Hispanic]: 0.215 
Target [No] and group [American Indian/Alaskan Native]: 0.338 
{'White': 0.319, 'Black': 0.332, 'Other': 0.265, 'Asian': 0.176, 'Hispanic': 0.215, 'American Indian/Alaskan Native': 0.338}
Class differences std: 0.06144215888857495
Target [Yes] and group [White]: 0.811 
Target [Yes] and group [Black]: 0.802 
Target [Yes] and group [Other]: 0.766 
Target [Yes] and group [Asian]: 0.786 
Target [Yes] and group [Hispanic]: 0.696 
Target [Yes] and group [American Indian/Alaskan Native]: 0.767 
{'White': 0.811, 'Black': 0.802, 'Other': 0.766, 'Asian': 0.786, 'Hispanic': 0.696, 'American Indian/Alaskan Native': 0.767}
Class differences std: 0.037530728151150466
Total class difference: 0.09897288703972543
Total class difference: 0.09897288703972543


 10%|█         | 1/10 [00:01<00:16,  1.78s/it]

[32m[I 2022-05-22 13:41:43,099][0m Trial 0 finished with values: [0.09897288703972543, 0.31636625811103103] and parameters: {'criterion': 'gini', 'max_depth': 23, 'min_samples_split': 0.0012424539745925075, 'min_samples_leaf': 0.003999971355412032}. [0m
Target [No] and group [White]: 0.321 
Target [No] and group [Black]: 0.331 
Target [No] and group [Other]: 0.267 
Target [No] and group [Asian]: 0.184 
Target [No] and group [Hispanic]: 0.22 
Target [No] and group [American Indian/Alaskan Native]: 0.346 
{'White': 0.321, 'Black': 0.331, 'Other': 0.267, 'Asian': 0.184, 'Hispanic': 0.22, 'American Indian/Alaskan Native': 0.346}
Class differences std: 0.0600039350561463
Target [Yes] and group [White]: 0.797 
Target [Yes] and group [Black]: 0.814 
Target [Yes] and group [Other]: 0.766 
Target [Yes] and group [Asian]: 0.786 
Target [Yes] and group [Hispanic]: 0.688 
Target [Yes] and group [American Indian/Alaskan Native]: 0.8 
{'White': 0.797, 'Black': 0.814, 'Other': 0.766, 'Asian': 0.78

 20%|██        | 2/10 [00:03<00:13,  1.73s/it]

[32m[I 2022-05-22 13:41:44,788][0m Trial 1 finished with values: [0.10163892301365895, 0.3114030881365685] and parameters: {'criterion': 'entropy', 'max_depth': 21, 'min_samples_split': 0.0013850459199764274, 'min_samples_leaf': 0.0061142762710445905}. [0m
Target [No] and group [White]: 0.306 
Target [No] and group [Black]: 0.323 
Target [No] and group [Other]: 0.26 
Target [No] and group [Asian]: 0.18 
Target [No] and group [Hispanic]: 0.209 
Target [No] and group [American Indian/Alaskan Native]: 0.327 
{'White': 0.306, 'Black': 0.323, 'Other': 0.26, 'Asian': 0.18, 'Hispanic': 0.209, 'American Indian/Alaskan Native': 0.327}
Class differences std: 0.05662375826453062
Target [Yes] and group [White]: 0.768 
Target [Yes] and group [Black]: 0.776 
Target [Yes] and group [Other]: 0.681 
Target [Yes] and group [Asian]: 0.786 
Target [Yes] and group [Hispanic]: 0.674 
Target [Yes] and group [American Indian/Alaskan Native]: 0.756 
{'White': 0.768, 'Black': 0.776, 'Other': 0.681, 'Asian': 

 30%|███       | 3/10 [00:04<00:11,  1.58s/it]

[32m[I 2022-05-22 13:41:46,201][0m Trial 2 finished with values: [0.10188012675437841, 0.31055406346364267] and parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 0.002607186152009599, 'min_samples_leaf': 0.00901664241812518}. [0m
Target [No] and group [White]: 0.331 
Target [No] and group [Black]: 0.346 
Target [No] and group [Other]: 0.287 
Target [No] and group [Asian]: 0.191 
Target [No] and group [Hispanic]: 0.229 
Target [No] and group [American Indian/Alaskan Native]: 0.368 
{'White': 0.331, 'Black': 0.346, 'Other': 0.287, 'Asian': 0.191, 'Hispanic': 0.229, 'American Indian/Alaskan Native': 0.368}
Class differences std: 0.06378087487640789
Target [Yes] and group [White]: 0.804 
Target [Yes] and group [Black]: 0.806 
Target [Yes] and group [Other]: 0.716 
Target [Yes] and group [Asian]: 0.81 
Target [Yes] and group [Hispanic]: 0.683 
Target [Yes] and group [American Indian/Alaskan Native]: 0.811 
{'White': 0.804, 'Black': 0.806, 'Other': 0.716, 'Asian': 0.8

 40%|████      | 4/10 [00:06<00:09,  1.61s/it]

[32m[I 2022-05-22 13:41:47,849][0m Trial 3 finished with values: [0.11574453739380772, 0.30579635443512493] and parameters: {'criterion': 'gini', 'max_depth': 23, 'min_samples_split': 0.002822126005113079, 'min_samples_leaf': 0.008652830657081662}. [0m
Target [No] and group [White]: 0.331 
Target [No] and group [Black]: 0.346 
Target [No] and group [Other]: 0.287 
Target [No] and group [Asian]: 0.191 
Target [No] and group [Hispanic]: 0.229 
Target [No] and group [American Indian/Alaskan Native]: 0.368 
{'White': 0.331, 'Black': 0.346, 'Other': 0.287, 'Asian': 0.191, 'Hispanic': 0.229, 'American Indian/Alaskan Native': 0.368}
Class differences std: 0.06378087487640789
Target [Yes] and group [White]: 0.804 
Target [Yes] and group [Black]: 0.806 
Target [Yes] and group [Other]: 0.716 
Target [Yes] and group [Asian]: 0.81 
Target [Yes] and group [Hispanic]: 0.683 
Target [Yes] and group [American Indian/Alaskan Native]: 0.811 
{'White': 0.804, 'Black': 0.806, 'Other': 0.716, 'Asian': 0

 50%|█████     | 5/10 [00:08<00:07,  1.60s/it]

[32m[I 2022-05-22 13:41:49,428][0m Trial 4 finished with values: [0.11574453739380772, 0.30579635443512493] and parameters: {'criterion': 'gini', 'max_depth': 13, 'min_samples_split': 0.003985694243953256, 'min_samples_leaf': 0.008620218004672955}. [0m
Target [No] and group [White]: 0.311 
Target [No] and group [Black]: 0.333 
Target [No] and group [Other]: 0.267 
Target [No] and group [Asian]: 0.176 
Target [No] and group [Hispanic]: 0.227 
Target [No] and group [American Indian/Alaskan Native]: 0.342 
{'White': 0.311, 'Black': 0.333, 'Other': 0.267, 'Asian': 0.176, 'Hispanic': 0.227, 'American Indian/Alaskan Native': 0.342}
Class differences std: 0.05959865770300537
Target [Yes] and group [White]: 0.82 
Target [Yes] and group [Black]: 0.825 
Target [Yes] and group [Other]: 0.787 
Target [Yes] and group [Asian]: 0.786 
Target [Yes] and group [Hispanic]: 0.723 
Target [Yes] and group [American Indian/Alaskan Native]: 0.811 
{'White': 0.82, 'Black': 0.825, 'Other': 0.787, 'Asian': 0.

 60%|██████    | 6/10 [00:09<00:06,  1.65s/it]

[32m[I 2022-05-22 13:41:51,189][0m Trial 5 finished with values: [0.09389151410196986, 0.32397030999854465] and parameters: {'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 0.005823028851292857, 'min_samples_leaf': 0.000844805460130891}. [0m
Target [No] and group [White]: 0.323 
Target [No] and group [Black]: 0.332 
Target [No] and group [Other]: 0.269 
Target [No] and group [Asian]: 0.177 
Target [No] and group [Hispanic]: 0.218 
Target [No] and group [American Indian/Alaskan Native]: 0.338 
{'White': 0.323, 'Black': 0.332, 'Other': 0.269, 'Asian': 0.177, 'Hispanic': 0.218, 'American Indian/Alaskan Native': 0.338}
Class differences std: 0.061104873419028996
Target [Yes] and group [White]: 0.813 
Target [Yes] and group [Black]: 0.795 
Target [Yes] and group [Other]: 0.759 
Target [Yes] and group [Asian]: 0.786 
Target [Yes] and group [Hispanic]: 0.696 
Target [Yes] and group [American Indian/Alaskan Native]: 0.778 
{'White': 0.813, 'Black': 0.795, 'Other': 0.759, 'Asian':

 70%|███████   | 7/10 [00:11<00:05,  1.67s/it]

[32m[I 2022-05-22 13:41:52,902][0m Trial 6 finished with values: [0.09848319452751231, 0.31453807591310573] and parameters: {'criterion': 'entropy', 'max_depth': 17, 'min_samples_split': 0.007767384187416649, 'min_samples_leaf': 0.00421397225896351}. [0m
Target [No] and group [White]: 0.33 
Target [No] and group [Black]: 0.321 
Target [No] and group [Other]: 0.274 
Target [No] and group [Asian]: 0.175 
Target [No] and group [Hispanic]: 0.215 
Target [No] and group [American Indian/Alaskan Native]: 0.352 
{'White': 0.33, 'Black': 0.321, 'Other': 0.274, 'Asian': 0.175, 'Hispanic': 0.215, 'American Indian/Alaskan Native': 0.352}
Class differences std: 0.06406095187831318
Target [Yes] and group [White]: 0.799 
Target [Yes] and group [Black]: 0.749 
Target [Yes] and group [Other]: 0.681 
Target [Yes] and group [Asian]: 0.786 
Target [Yes] and group [Hispanic]: 0.656 
Target [Yes] and group [American Indian/Alaskan Native]: 0.811 
{'White': 0.799, 'Black': 0.749, 'Other': 0.681, 'Asian': 

 80%|████████  | 8/10 [00:13<00:03,  1.65s/it]

[32m[I 2022-05-22 13:41:54,501][0m Trial 7 finished with values: [0.12316819903690336, 0.30556082748149554] and parameters: {'criterion': 'entropy', 'max_depth': 27, 'min_samples_split': 0.0027131624511479743, 'min_samples_leaf': 0.009848968399388196}. [0m
Target [No] and group [White]: 0.319 
Target [No] and group [Black]: 0.34 
Target [No] and group [Other]: 0.278 
Target [No] and group [Asian]: 0.186 
Target [No] and group [Hispanic]: 0.222 
Target [No] and group [American Indian/Alaskan Native]: 0.361 
{'White': 0.319, 'Black': 0.34, 'Other': 0.278, 'Asian': 0.186, 'Hispanic': 0.222, 'American Indian/Alaskan Native': 0.361}
Class differences std: 0.06294618512842715
Target [Yes] and group [White]: 0.793 
Target [Yes] and group [Black]: 0.802 
Target [Yes] and group [Other]: 0.695 
Target [Yes] and group [Asian]: 0.81 
Target [Yes] and group [Hispanic]: 0.683 
Target [Yes] and group [American Indian/Alaskan Native]: 0.789 
{'White': 0.793, 'Black': 0.802, 'Other': 0.695, 'Asian':

 90%|█████████ | 9/10 [00:14<00:01,  1.60s/it]

[32m[I 2022-05-22 13:41:56,000][0m Trial 8 finished with values: [0.11510619535337505, 0.3091328147507923] and parameters: {'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 0.00031897135755917165, 'min_samples_leaf': 0.008464050882476109}. [0m
Target [No] and group [White]: 0.331 
Target [No] and group [Black]: 0.346 
Target [No] and group [Other]: 0.287 
Target [No] and group [Asian]: 0.191 
Target [No] and group [Hispanic]: 0.229 
Target [No] and group [American Indian/Alaskan Native]: 0.368 
{'White': 0.331, 'Black': 0.346, 'Other': 0.287, 'Asian': 0.191, 'Hispanic': 0.229, 'American Indian/Alaskan Native': 0.368}
Class differences std: 0.06378087487640789
Target [Yes] and group [White]: 0.804 
Target [Yes] and group [Black]: 0.806 
Target [Yes] and group [Other]: 0.716 
Target [Yes] and group [Asian]: 0.81 
Target [Yes] and group [Hispanic]: 0.683 
Target [Yes] and group [American Indian/Alaskan Native]: 0.811 
{'White': 0.804, 'Black': 0.806, 'Other': 0.716, 'Asian'

100%|██████████| 10/10 [00:16<00:00,  1.63s/it]


[32m[I 2022-05-22 13:41:57,622][0m Trial 9 finished with values: [0.11574453739380772, 0.30579635443512493] and parameters: {'criterion': 'gini', 'max_depth': 25, 'min_samples_split': 0.0038181379959557047, 'min_samples_leaf': 0.008371904399506127}. [0m
[FrozenTrial(number=5, values=[0.09389151410196986, 0.32397030999854465], datetime_start=datetime.datetime(2022, 5, 22, 13, 41, 49, 430214), datetime_complete=datetime.datetime(2022, 5, 22, 13, 41, 51, 189863), params={'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 0.005823028851292857, 'min_samples_leaf': 0.000844805460130891}, distributions={'criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'max_depth': IntUniformDistribution(high=29, low=5, step=2), 'min_samples_split': UniformDistribution(high=0.01, low=1e-05), 'min_samples_leaf': UniformDistribution(high=0.01, low=1e-05)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=5, state=TrialState.COMPLETE, value=None)]
Counter({'No': 47632,

In [30]:
from collections import Counter
Counter(val_race)

Counter({'White': 36714,
         'Black': 3401,
         'Other': 1697,
         'Asian': 1189,
         'Hispanic': 4139,
         'American Indian/Alaskan Native': 829})

# EDA

In [None]:
plt.rcParams["figure.figsize"] = (20,20)

# Checking for correlation between variables
from dython import nominal

data_test = train_original.copy()

# Encode the object as an enumerated type or categorical variable.
data_test[categorical_features] = data_test[categorical_features].apply(lambda x : pd.factorize(x)[0])
nominal.associations(data_test)

In [None]:
# Train whitebox model

# Scale continuous variables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# scaler = ColumnTransformer([('scaler', StandardScaler(), continuos_features)], remainder='passthrough')

# Whitebox model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

scaler = ColumnTransformer(
    [('scaler', StandardScaler(), continuos_features)], remainder='passthrough')
    
# whitebox_model = Pipeline(steps=[('scaler', scaler), ('clf', DecisionTreeClassifier(min_samples_split = 0.01, min_samples_leaf= 0.01, max_features="auto", max_depth = 5, criterion = "gini", random_state = 42))])
whitebox_model = Pipeline(steps=[('scaler', scaler), ('clf', DecisionTreeClassifier(random_state = 42))])

whitebox_model.fit(train_processed, y_train)
y_pred_whitebox = whitebox_model.predict(val_processed)
y_pred_proba_whitebox = whitebox_model.predict_proba(val_processed)

print(Counter(y_pred_whitebox))

print(classification_report(y_val, y_pred_whitebox))

print(equailized_odds(y_pred_whitebox, val_race, y_val, verbose=False))

# # plot tree
# plt.figure(figsize=(25,20))  # set plot size (denoted in inches)
# tree.plot_tree(whitebox_model['clf'], fontsize=9, feature_names=df_processed.columns)
# plt.show()