In [1]:
import optuna
from sklearn import metrics
from typing import Callable, List
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load data
# df = pd.read_csv("data/heart_2020_cleaned.csv")
# train, val = train_test_split(df, test_size=0.3, random_state=42, stratify=df["HeartDisease"])
# val, test = train_test_split(val, test_size=0.5, random_state=42, stratify=val["HeartDisease"])
# train.to_csv("data/heart_train.csv", index=False)
# val.to_csv("data/heart_val.csv", index=False)
# test.to_csv("data/heart_test.csv", index=False)

In [49]:
train = pd.read_csv('data/heart_train.csv').iloc[:150000, :]
val = pd.read_csv('data/heart_val.csv')
test = pd.read_csv('data/heart_test.csv')

In [50]:
# Preprocessing data

categorical_features = [
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "DiffWalking",
    "Sex",
    "AgeCategory",
    "Race",
    "Diabetic",
    "PhysicalActivity",
    "GenHealth",
    "Asthma",
    "KidneyDisease",
    "SkinCancer"
]

continuos_features = [
    "BMI",
    "PhysicalHealth",
    "MentalHealth"
]

target_variable = "HeartDisease"

def data_preprocessing(data: pd.DataFrame, categorical_features: List[str], continuous_features: List[str], target_variable: str):

    df = data.copy()

    # protected variables
    sex = df["Sex"].values
    age = df["AgeCategory"].values
    race = df["Race"].values

    # target
    target = df[target_variable].values

    df_processed = df[categorical_features + continuous_features]
    df_processed = pd.get_dummies(df_processed, prefix=None, prefix_sep='_', dummy_na=False, columns=categorical_features, drop_first=True)

    return df_processed, df, target, sex, age, race

#df_processed, df_original, target, sex, age, race = data_preprocessing(df, categorical_features=categorical_features, continuous_features=continuos_features, target_variable=target_variable)
train_processed, train_original, train_target, train_sex, train_age, train_race = data_preprocessing(train, categorical_features=categorical_features, continuous_features=continuos_features, target_variable=target_variable)
val_processed, val_original, val_target, val_sex, val_age, val_race = data_preprocessing(val, categorical_features=categorical_features, continuous_features=continuos_features, target_variable=target_variable)

In [51]:
# just renaming stuff
# X_train, X_val = train_processed.values, val_processed.values
y_train, y_val = train_target, val_target

In [56]:
# Fairness metric function
import itertools

def equailized_odds(preds: np.ndarray, groups: np.ndarray, test: np.ndarray, sum_of_differences: bool = True, verbose: bool = False):
    """
    Calculates the equailized odds of a binary classification problem.
    preds: predictions of the model
    groups: group labels of the test data
    test: test data
    sum_of_differences: if True, the sum of the differences is returned, else the mean of the differences is returned
    verbose: if True, prints the results
    """

    df = pd.DataFrame(list(zip(preds, groups, test)), columns=['preds', 'groups', 'test'])
    
    total_class_difference = 0
    for target in df['test'].unique():
        results = {}
        for group in df['groups'].unique():
            
            # get the group and amount of corrects in the group
            selection = df.loc[(df['test'] == target) & (df['groups'] == group)]
            corrects = selection.loc[selection['preds'] == "Yes"]
    
            # if there are no corrects in the group, skip
            if len(corrects) == 0 or len(selection) == 0:
                results[group] = float('inf')
                #results[group] = 1
                continue

            # get the odds ratio
            score = round(len(corrects) / len(selection), 3)

            # add the score to the results
            results[group] = score

            if verbose:
                print(f'Target [{target}] and group [{group}]: {score} ')
    
        if verbose:
            print(results)
        group_combinations = list(itertools.combinations(list(results.keys()), 2))
    
        # save differences between groups (pairwise)
        class_differences = 0
    
        # for each combination of groups
        for combination in group_combinations:
            difference = abs(results[combination[0]] - results[combination[1]])
            class_differences += difference

        # sum up differences or take average
        if sum_of_differences:
            total_class_difference += class_differences
        else:
            total_class_difference += class_differences / len(group_combinations)

    if verbose:
        print(f'Total class difference: {total_class_difference}')

    return total_class_difference

preds = [0,1,1,1,1]
groups = ["A", "A", "B", "B", "C"]
test = [0,0,1,1,1]

print(equailized_odds(preds, groups, test))

0


In [53]:
# Train whitebox model

# Scale continuous variables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# scaler = ColumnTransformer([('scaler', StandardScaler(), continuos_features)], remainder='passthrough')

# Whitebox model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

scaler = ColumnTransformer(
    [('scaler', StandardScaler(), continuos_features)], remainder='passthrough')
    
whitebox_model = Pipeline(steps=[('scaler', scaler), ('clf', DecisionTreeClassifier(min_samples_split = 0.01, min_samples_leaf= 0.01, max_features="auto", max_depth = 5, criterion = "gini", random_state = 42))])
# whitebox_model = Pipeline(steps=[('scaler', scaler), ('clf', DecisionTreeClassifier(random_state = 42))])

whitebox_model.fit(train_processed, y_train)
y_pred_whitebox = whitebox_model.predict(val_processed)
y_pred_proba_whitebox = whitebox_model.predict_proba(val_processed)

print(Counter(y_pred_whitebox))

print(classification_report(y_val, y_pred_whitebox))

print(equailized_odds(y_pred_whitebox, val_race, y_val, verbose=True))

# # plot tree
# plt.figure(figsize=(25,20))  # set plot size (denoted in inches)
# tree.plot_tree(whitebox_model['clf'], fontsize=9, feature_names=df_processed.columns)
# plt.show()



Counter({'No': 47969})


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          No       0.91      1.00      0.96     43863
         Yes       0.00      0.00      0.00      4106

    accuracy                           0.91     47969
   macro avg       0.46      0.50      0.48     47969
weighted avg       0.84      0.91      0.87     47969

{'White': inf, 'Black': inf, 'Other': inf, 'Asian': inf, 'Hispanic': inf, 'American Indian/Alaskan Native': inf}
{'White': inf, 'Black': inf, 'Other': inf, 'Asian': inf, 'Hispanic': inf, 'American Indian/Alaskan Native': inf}
Total class difference: nan
nan


  _warn_prf(average, modifier, msg_start, len(result))


# Optuna Study

In [57]:
optuna.logging.set_verbosity(1)


class Objective(object):
    def __init__(
        self,
        X_train: np.ndarray,
        X_val: np.ndarray,
        y_train: np.ndarray,
        y_val: np.ndarray,
        group_val: np.ndarray,
        evaluation_func: Callable,
    ):
        self.X_train = X_train
        self.X_val = X_val
        self.y_train = y_train
        self.y_val = y_val
        self.group_val = group_val
        self.evaluation_func = evaluation_func

    def __call__(self, trial) -> float:
        """This method is called by Optuna to compute the objective
        function."""
        # Initialize general hyper parameters

        params = {
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
            "max_depth": trial.suggest_int("max_depth", 5, 15, step=2),
            "min_samples_split": trial.suggest_loguniform("min_samples_split", 1e-3, 0.01),
            "min_samples_leaf": trial.suggest_loguniform("min_samples_leaf", 1e-3, 0.01),
        }

        # parameters for fitting a model
        whitebox_model = DecisionTreeClassifier(
            **params, random_state=42).fit(self.X_train, self.y_train)

        preds: np.ndarray = whitebox_model.predict(self.X_val)

        return self.evaluation_func(preds, self.group_val, self.y_val)

In [58]:
# Run optimization
study = optuna.create_study(direction="minimize")

# Scaling
# Scale continuous variables
scaler = ColumnTransformer(
    [('scaler', StandardScaler(), continuos_features)], remainder='passthrough')
X_train = scaler.fit_transform(train_processed)
X_val = scaler.transform(val_processed)

# Define objective
objective = Objective(X_train, X_val, y_train, y_val, val_race, equailized_odds)

# Make a study to optimize the objective.
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)

best_params = study.best_params

print(best_params)

scaler = ColumnTransformer(
    [('scaler', StandardScaler(), continuos_features)], remainder='passthrough')
    
whitebox_model = Pipeline(steps=[('scaler', scaler), ('clf', DecisionTreeClassifier(**best_params, random_state = 42))])
# whitebox_model = Pipeline(steps=[('scaler', scaler), ('clf', DecisionTreeClassifier(random_state = 42))])

whitebox_model.fit(train_processed, y_train)
y_pred_whitebox = whitebox_model.predict(val_processed)
y_pred_proba_whitebox = whitebox_model.predict_proba(val_processed)

print(Counter(y_pred_whitebox))

print(classification_report(y_val, y_pred_whitebox))

print(equailized_odds(y_pred_whitebox, val_race, y_val, verbose=True))

[32m[I 2022-05-21 15:53:55,079][0m A new study created in memory with name: no-name-fc3c9665-9697-45e6-a8b0-e145f622d5d0[0m
[32m[I 2022-05-21 15:53:56,342][0m Trial 7 finished with value: 0.552 and parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 0.002890827925115521, 'min_samples_leaf': 0.004398391722995249}. Best is trial 7 with value: 0.552.[0m
[32m[I 2022-05-21 15:53:56,369][0m Trial 2 finished with value: 0.552 and parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 0.0014690761337544014, 'min_samples_leaf': 0.0033885231400634552}. Best is trial 7 with value: 0.552.[0m
[32m[I 2022-05-21 15:53:56,379][0m Trial 1 finished with value: 0.485 and parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 0.0014312015195407566, 'min_samples_leaf': 0.0012098629736921607}. Best is trial 1 with value: 0.485.[0m
[32m[I 2022-05-21 15:53:56,668][0m Trial 5 finished with value: 0.552 and parameters: {'criterion': 'entropy

{'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 0.008515532694739222, 'min_samples_leaf': 0.009570977132308832}
Counter({'No': 47969})


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          No       0.91      1.00      0.96     43863
         Yes       0.00      0.00      0.00      4106

    accuracy                           0.91     47969
   macro avg       0.46      0.50      0.48     47969
weighted avg       0.84      0.91      0.87     47969

{'White': 1, 'Black': 1, 'Other': 1, 'Asian': 1, 'Hispanic': 1, 'American Indian/Alaskan Native': 1}
{'White': 1, 'Black': 1, 'Other': 1, 'Asian': 1, 'Hispanic': 1, 'American Indian/Alaskan Native': 1}
Total class difference: 0
0


  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
from collections import Counter
Counter(val_race)

Counter({'White': 36714,
         'Black': 3401,
         'Other': 1697,
         'Asian': 1189,
         'Hispanic': 4139,
         'American Indian/Alaskan Native': 829})

# EDA

In [None]:
plt.rcParams["figure.figsize"] = (20,20)

# Checking for correlation between variables
from dython import nominal

data_test = train_original.copy()

# Encode the object as an enumerated type or categorical variable.
data_test[categorical_features] = data_test[categorical_features].apply(lambda x : pd.factorize(x)[0])
nominal.associations(data_test)