In [2]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import pandas as pd

# DiCE imports
import dice_ml
from dice_ml.utils import helpers  # helper functions


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
dataset = pd.read_csv('data_clean.csv')
#dataset = dataset.drop("Unnamed: 0", axis=1)
dataset.head()

Unnamed: 0,Gender,Age,BMI,Smoking,Alcohol,Sleep,Exercise,Fruit,Diabetes,Kidney,Stroke,Heartdis
0,Male,Age 18 to 24,33.284898,Every day,12,6,900.0,7.0,Yes,No,No,0
1,Female,Age 18 to 24,23.294675,Every day,10,7,21.0,7.0,No,No,No,0
2,Female,Age 75 to 79,27.249276,Not at all,2,6,84.0,14.0,Yes,No,No,0
3,Male,Age 60 to 64,28.126081,Not at all,2,8,560.0,0.933333,Yes,No,No,0
4,Male,Age 65 to 69,26.496727,Not at all,6,8,90.0,5.833333,No,No,No,0


In [5]:
dataset.info

<bound method DataFrame.info of      Gender              Age        BMI     Smoking  Alcohol  Sleep  Exercise  \
0      Male     Age 18 to 24  33.284898   Every day       12      6     900.0   
1    Female     Age 18 to 24  23.294675   Every day       10      7      21.0   
2    Female     Age 75 to 79  27.249276  Not at all        2      6      84.0   
3      Male     Age 60 to 64  28.126081  Not at all        2      8     560.0   
4      Male     Age 65 to 69  26.496727  Not at all        6      8      90.0   
..      ...              ...        ...         ...      ...    ...       ...   
395    Male     Age 60 to 64  26.578450  Not at all        4      7     960.0   
396    Male  Age 80 or older  26.289704  Not at all        2      9     250.0   
397  Female     Age 60 to 64  25.790533  Not at all        1      8     420.0   
398  Female     Age 50 to 54  24.933391   Every day        2      5      90.0   
399  Female     Age 55 to 59  31.614201   Every day        2      6     140.0

In [7]:
# Split the dataset into two
target = dataset["Heartdis"]
train_dataset, test_dataset, y_train, y_test = train_test_split(dataset,
                                                                target,
                                                                test_size=0.2,
                                                                random_state=0,
                                                                stratify=target)
x_train = train_dataset.drop('Heartdis', axis=1)
x_test = test_dataset.drop('Heartdis', axis=1)

In [11]:
numerical = ["BMI", "Alcohol", "Sleep", "Exercise", "Fruit"]
categorical = x_train.columns.difference(numerical)

In [12]:
# Step 1: dice_ml.Data
d = dice_ml.Data(dataframe=train_dataset, continuous_features=numerical, outcome_name='Heartdis')

In [13]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier())])
model = clf.fit(x_train, y_train)

In [14]:
# Using sklearn backend
m = dice_ml.Model(model=model, backend="sklearn")
# Using method=random for generating CFs
# method : random, genetic, kdtree (a verif)
exp = dice_ml.Dice(d, m, method="random")

In [15]:
# Find clues where the value is “Yes”.
yes_indices = y_test.index[y_test == 1].tolist()
# Choose a random index among them
random_index = np.random.choice(yes_indices)
print(y_test.loc[random_index])

1


In [16]:
# ?exp.generate_counterfactuals

In [25]:
# Gender 	Age 	BMI 	Smoking 	Alcohol 	Sleep 	Exercise 	Fruit 	Diabetes 	Kidney 	Stroke 	Heartdis
query_instance = {'Gender': 'Female',
                  'Age': 'Age 55 to 59',
                  'BMI': 100,
                  'Smoking': 'Every day',
                  'Alcohol': 20,
                  'Sleep': 5,
                  'Exercise': 3,
                  'Fruit': 1,
                  'Diabetes': 'Yes',
                  'Kidney': 'No',
                  'Stroke': 'No'
                 }

query_instance = pd.DataFrame([query_instance])

In [30]:
print(x_test.loc[random_index])
print(y_test.loc[random_index])

Gender              Male
Age         Age 60 to 64
BMI            23.745543
Smoking        Every day
Alcohol                2
Sleep                  6
Exercise          2520.0
Fruit           1.166667
Diabetes              No
Kidney                No
Stroke                No
Name: 202, dtype: object
1


In [28]:
query_instance = x_test.loc[[random_index]]
e1 = exp.generate_counterfactuals(query_instance, total_CFs=2, desired_class=0, features_to_vary=["BMI", "Smoking", "Alcohol", "Sleep", "Exercise", "Fruit"])
e1.visualize_as_dataframe(show_only_changes=True)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  4.43it/s]

Query instance (original outcome : 0)





Unnamed: 0,Gender,Age,BMI,Smoking,Alcohol,Sleep,Exercise,Fruit,Diabetes,Kidney,Stroke,Heartdis
0,Male,Age 60 to 64,23.745543,Every day,2,6,2520.0,1.166667,No,No,No,0



Diverse Counterfactual set (new outcome: 0)


Unnamed: 0,Gender,Age,BMI,Smoking,Alcohol,Sleep,Exercise,Fruit,Diabetes,Kidney,Stroke,Heartdis
0,-,-,-,-,-,9,-,28.8,-,-,-,-
1,-,-,-,Some days,-,12,-,-,-,-,-,-
