In [49]:
pip install shap

Note: you may need to restart the kernel to use updated packages.


In [2]:
#importing packages
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests                                      # reading data
from io import StringIO

from sklearn.datasets import fetch_openml            # common data set access
from sklearn.preprocessing import StandardScaler     # scaling transform
from sklearn.model_selection import train_test_split # validation tools
from sklearn.metrics import zero_one_loss as J01
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

import sklearn.tree as tree

# Fix the random seed for reproducibility
# !! Important !! : do not change this
seed = 1234
np.random.seed(seed)  

In [15]:
# Reading in the CSV file
dia_data = pd.read_csv("data/diabetic_data.csv")
X = dia_data.iloc[:, :-1]  # All columns except the last one
Y = dia_data.iloc[:, -1]   # Last column

In [16]:
X = X.drop(columns=['diag_1', 'diag_2', 'diag_3'])

In [17]:
X = X.replace('?', np.nan)


#Spliting in the data into 70% training 10% validation 20% test data
Xtr, Xtemp, Ytr, Ytemp = train_test_split(X, Y, test_size=0.3, random_state=seed, shuffle=True)
Xva, Xtest, Yva, Ytest = train_test_split(Xtemp, Ytemp, test_size=0.67, random_state=seed, shuffle=True)
# checking the dimensions
print(f"Training data: {Xtr.shape} {Ytr.shape}")
print(f"Validation data: {Xva.shape} {Yva.shape}")
print(f"Testing data: {Xtest.shape} {Ytest.shape}")


Training data: (71236, 46) (71236,)
Validation data: (10074, 46) (10074,)
Testing data: (20456, 46) (20456,)


In [18]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer


numerical_columns = Xtr.select_dtypes(include=['number']).columns
categorical_columns = Xtr.select_dtypes(include=['object']).columns

numerical_imputer = SimpleImputer(strategy='median')
Xtr[numerical_columns] = numerical_imputer.fit_transform(Xtr[numerical_columns])


categorical_imputer = SimpleImputer(strategy='most_frequent')
Xtr[categorical_columns] = categorical_imputer.fit_transform(Xtr[categorical_columns])


In [23]:
Xtr=Xtr.reset_index()
Xtr

Unnamed: 0,level_0,index,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,0,47866,146752038.0,25174899.0,AfricanAmerican,Female,[10-20),[75-100),1.0,1.0,...,No,No,Down,No,No,No,No,No,Ch,Yes
1,1,24673,83354610.0,298521.0,AfricanAmerican,Female,[90-100),[75-100),1.0,14.0,...,No,No,No,No,No,No,No,No,No,No
2,2,61033,170636856.0,77827617.0,Caucasian,Male,[80-90),[75-100),2.0,6.0,...,No,No,No,No,No,No,No,No,No,Yes
3,3,74444,221787444.0,38516562.0,Caucasian,Male,[80-90),[75-100),1.0,1.0,...,No,No,No,No,No,No,No,No,No,No
4,4,56971,163020096.0,46248840.0,AfricanAmerican,Female,[30-40),[75-100),1.0,1.0,...,No,No,Down,No,No,No,No,No,Ch,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71231,71231,89460,289025640.0,50693553.0,Caucasian,Male,[80-90),[75-100),1.0,6.0,...,No,No,No,No,No,No,No,No,Ch,Yes
71232,71232,60620,169681170.0,66287709.0,Caucasian,Male,[50-60),[75-100),5.0,1.0,...,No,No,No,No,No,No,No,No,No,Yes
71233,71233,34086,106962006.0,24745671.0,Caucasian,Female,[80-90),[75-100),2.0,1.0,...,No,No,Steady,No,No,No,No,No,No,Yes
71234,71234,58067,164976492.0,59602239.0,Caucasian,Female,[80-90),[75-100),1.0,1.0,...,No,No,No,No,No,No,No,No,No,No


In [26]:
Xtr = pd.get_dummies(Xtr, columns=categorical_columns)

In [27]:
Xtr

Unnamed: 0,level_0,index,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,0,47866,146752038.0,25174899.0,1.0,1.0,7.0,3.0,37.0,1.0,...,1,0,1,0,1,0,1,0,0,1
1,1,24673,83354610.0,298521.0,1.0,14.0,7.0,3.0,57.0,0.0,...,1,0,1,0,1,0,0,1,1,0
2,2,61033,170636856.0,77827617.0,2.0,6.0,1.0,4.0,71.0,0.0,...,1,0,1,0,1,0,0,1,0,1
3,3,74444,221787444.0,38516562.0,1.0,1.0,7.0,1.0,29.0,0.0,...,1,0,1,0,1,0,0,1,1,0
4,4,56971,163020096.0,46248840.0,1.0,1.0,7.0,10.0,49.0,1.0,...,1,0,1,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71231,71231,89460,289025640.0,50693553.0,1.0,6.0,7.0,9.0,72.0,4.0,...,1,0,1,0,1,0,1,0,0,1
71232,71232,60620,169681170.0,66287709.0,5.0,1.0,1.0,3.0,1.0,3.0,...,1,0,1,0,1,0,0,1,0,1
71233,71233,34086,106962006.0,24745671.0,2.0,1.0,4.0,5.0,14.0,4.0,...,1,0,1,0,1,0,0,1,0,1
71234,71234,58067,164976492.0,59602239.0,1.0,1.0,7.0,3.0,45.0,6.0,...,1,0,1,0,1,0,0,1,1,0


In [78]:
Xtr.to_csv('data/X_TRAINING_SET_Diabetes_ENCODED.csv')

In [29]:
Xva.to_csv('data/X_VALIDATION_SET_NOT-ENCODED.csv')

In [30]:
Xtest.to_csv('data/X_TESTING_SET_NOT-ENCODED.csv')

In [31]:
Ytr.to_csv('data/Y_TRAINING_SET.csv')

In [32]:
Yva.to_csv('data/Y_VALIDATION_SET.csv')

In [None]:
Ytest.to_csv('data/Y_TESTING_SET_NOT-ENCODED.csv')