In [49]:
pip install shap

Note: you may need to restart the kernel to use updated packages.


In [2]:
#importing packages
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests                                      # reading data
from io import StringIO

from sklearn.datasets import fetch_openml            # common data set access
from sklearn.preprocessing import StandardScaler     # scaling transform
from sklearn.model_selection import train_test_split # validation tools
from sklearn.metrics import zero_one_loss as J01
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

import sklearn.tree as tree

# Fix the random seed for reproducibility
# !! Important !! : do not change this
seed = 1234
np.random.seed(seed)  

In [76]:
# Reading in the CSV file
dia_data = pd.read_csv("data/diabetic_data.csv")
X = dia_data.iloc[:, :-1]  # All columns except the last one
Y = dia_data.iloc[:, -1]   # Last column

In [77]:
X = X.drop(columns=['diag_1', 'diag_2', 'diag_3'])

In [78]:
X = X.replace('?', np.nan)



In [79]:
numerical_columns = X.select_dtypes(include=['number']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

numerical_imputer = SimpleImputer(strategy='median')
X[numerical_columns] = numerical_imputer.fit_transform(X[numerical_columns])


categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_columns] = categorical_imputer.fit_transform(X[categorical_columns])

X=X.reset_index()
X = pd.get_dummies(X, columns=categorical_columns)


In [80]:
X

Unnamed: 0,index,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,0,2278392.0,8222157.0,6.0,25.0,1.0,1.0,41.0,0.0,1.0,...,1,0,1,0,1,0,0,1,1,0
1,1,149190.0,55629189.0,1.0,1.0,7.0,3.0,59.0,0.0,18.0,...,1,0,1,0,1,0,1,0,0,1
2,2,64410.0,86047875.0,1.0,1.0,7.0,2.0,11.0,5.0,13.0,...,1,0,1,0,1,0,0,1,0,1
3,3,500364.0,82442376.0,1.0,1.0,7.0,2.0,44.0,1.0,16.0,...,1,0,1,0,1,0,1,0,0,1
4,4,16680.0,42519267.0,1.0,1.0,7.0,1.0,51.0,0.0,8.0,...,1,0,1,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,101761,443847548.0,100162476.0,1.0,3.0,7.0,3.0,51.0,0.0,16.0,...,1,0,1,0,1,0,1,0,0,1
101762,101762,443847782.0,74694222.0,1.0,4.0,5.0,5.0,33.0,3.0,18.0,...,1,0,1,0,1,0,0,1,0,1
101763,101763,443854148.0,41088789.0,1.0,1.0,7.0,1.0,53.0,0.0,9.0,...,1,0,1,0,1,0,1,0,0,1
101764,101764,443857166.0,31693671.0,2.0,3.0,7.0,10.0,45.0,2.0,21.0,...,1,0,1,0,1,0,1,0,0,1


In [82]:

#Spliting in the data into 70% training 10% validation 20% test data
Xtr, Xtemp, Ytr, Ytemp = train_test_split(X, Y, test_size=0.3, random_state=seed, shuffle=True)
Xva, Xtest, Yva, Ytest = train_test_split(Xtemp, Ytemp, test_size=0.67, random_state=seed, shuffle=True)
# checking the dimensions
print(f"Training data: {Xtr.shape} {Ytr.shape}")
print(f"Validation data: {Xva.shape} {Yva.shape}")
print(f"Testing data: {Xtest.shape} {Ytest.shape}")


Training data: (71236, 213) (71236,)
Validation data: (10074, 213) (10074,)
Testing data: (20456, 213) (20456,)


In [34]:
Xtr.to_csv('data/X_TRAINING_SET_Diabetes_ENCODED.csv')

In [29]:
Xva.to_csv('data/X_VALIDATION_SET_ENCODED.csv')

In [30]:
Xtest.to_csv('data/X_TESTING_SET_ENCODED.csv')

In [31]:
Ytr.to_csv('data/Y_TRAINING_SET.csv')

In [32]:
Yva.to_csv('data/Y_VALIDATION_SET.csv')

In [33]:
Ytest.to_csv('data/Y_TESTING_SET.csv')