In [49]:
pip install shap

Note: you may need to restart the kernel to use updated packages.


In [50]:
#importing packages
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests                                      # reading data
from io import StringIO

from sklearn.datasets import fetch_openml            # common data set access
from sklearn.preprocessing import StandardScaler     # scaling transform
from sklearn.model_selection import train_test_split # validation tools
from sklearn.metrics import zero_one_loss as J01
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

import sklearn.tree as tree

# Fix the random seed for reproducibility
# !! Important !! : do not change this
seed = 1234
np.random.seed(seed)  

In [51]:
# Reading in the CSV file
dia_data = pd.read_csv("data/diabetic_data.csv")
X = dia_data.iloc[:, :-1]  # All columns except the last one
Y = dia_data.iloc[:, -1]   # Last column

In [52]:
#Spliting in the data into 70% training 10% validation 20% test data
Xtr, Xtemp, Ytr, Ytemp = train_test_split(X, Y, test_size=0.3, random_state=seed, shuffle=True)
Xva, Xtest, Yva, Ytest = train_test_split(Xtemp, Ytemp, test_size=0.67, random_state=seed, shuffle=True)
# checking the dimensions
print(f"Training data: {Xtr.shape} {Ytr.shape}")
print(f"Validation data: {Xva.shape} {Yva.shape}")
print(f"Testing data: {Xtest.shape} {Ytest.shape}")


Training data: (71236, 49) (71236,)
Validation data: (10074, 49) (10074,)
Testing data: (20456, 49) (20456,)


In [53]:
#feature cleanup
Xtr_f1=Xtr.iloc[:,1:17]
Xtr_f2=Xtr.iloc[:,17:33]

In [54]:
Xtr_f1 = Xtr_f1.replace('?', np.nan)

numerical_columns = Xtr_f1.select_dtypes(include=['number']).columns
categorical_columns = Xtr_f1.select_dtypes(include=['object']).columns

numerical_imputer = SimpleImputer(strategy='median')
Xtr_f1[numerical_columns] = numerical_imputer.fit_transform(Xtr_f1[numerical_columns])


categorical_imputer = SimpleImputer(strategy='most_frequent')
Xtr_f1[categorical_columns] = categorical_imputer.fit_transform(Xtr_f1[categorical_columns])
Xtr_f1=Xtr_f1.reset_index()

In [57]:
#abhi can you check this ?
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
Xtr_categorical = Xtr_f1[categorical_columns]
Xtr_encoded = encoder.fit_transform(Xtr_categorical)
concatDF=pd.concat([Xtr_Abhinand, Xtr_encoded_df],axis=1)
concatDF=concatDF.drop(categorical_columns, axis=1)

`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


KeyError: "['race', 'gender', 'age', 'weight', 'payer_code', 'medical_specialty'] not found in axis"

In [56]:
Xtr

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,103,104,105,106,107,108,109,110,111,112
0,146752038,25174899,AfricanAmerican,Female,[10-20),?,1,1,7,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,83354610,298521,AfricanAmerican,Female,[90-100),?,1,14,7,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,170636856,77827617,Caucasian,Male,[80-90),[75-100),2,6,1,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,221787444,38516562,?,Male,[80-90),?,1,1,7,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,163020096,46248840,AfricanAmerican,Female,[30-40),?,1,1,7,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71231,289025640,50693553,Caucasian,Male,[80-90),?,1,6,7,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71232,169681170,66287709,Caucasian,Male,[50-60),?,5,1,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71233,106962006,24745671,Caucasian,Female,[80-90),?,2,1,4,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71234,164976492,59602239,Caucasian,Female,[80-90),?,1,1,7,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#dia_1, dia_2 and dia_3 have too many distinct number so should not do one hot encoding 
categorical_columns =['max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide']
#, 'number_diagnoses',
#       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
#       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
#       'glyburide', 'tolbutamide']
Xtr_categorical = Xtr[categorical_columns]
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
Xtr_encoded = encoder.fit_transform(Xtr_categorical)
Xtr_encoded_df = pd.DataFrame(Xtr_encoded, columns=encoder.get_feature_names_out(categorical_columns))
Xtr = pd.concat([Xtr.reset_index(drop=True), Xtr_encoded_df], axis='columns')
Xtr = Xtr.drop(categorical_columns, axis=1)

`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.


In [27]:
Xtr

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
47866,146752038,25174899,AfricanAmerican,Female,[10-20),?,1,1,7,3,...,No,No,Down,No,No,No,No,No,Ch,Yes
24673,83354610,298521,AfricanAmerican,Female,[90-100),?,1,14,7,3,...,No,No,No,No,No,No,No,No,No,No
61033,170636856,77827617,Caucasian,Male,[80-90),[75-100),2,6,1,4,...,No,No,No,No,No,No,No,No,No,Yes
74444,221787444,38516562,?,Male,[80-90),?,1,1,7,1,...,No,No,No,No,No,No,No,No,No,No
56971,163020096,46248840,AfricanAmerican,Female,[30-40),?,1,1,7,10,...,No,No,Down,No,No,No,No,No,Ch,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89460,289025640,50693553,Caucasian,Male,[80-90),?,1,6,7,9,...,No,No,No,No,No,No,No,No,Ch,Yes
60620,169681170,66287709,Caucasian,Male,[50-60),?,5,1,1,3,...,No,No,No,No,No,No,No,No,No,Yes
34086,106962006,24745671,Caucasian,Female,[80-90),?,2,1,4,5,...,No,No,Steady,No,No,No,No,No,No,Yes
58067,164976492,59602239,Caucasian,Female,[80-90),?,1,1,7,3,...,No,No,No,No,No,No,No,No,No,No


In [None]:
Xtr.to_csv('data/Diabetes_ENCODED.csv')