In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [68]:
#load the data
diabetic_data = pd.read_csv(r"C:\Users\aasha\Downloads\diabetic_data_cleaned.csv")

In [116]:
#Transposing the data
diabetic_data.head().T

Unnamed: 0,0,1,2,3,4
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian
gender,Female,Female,Female,Male,Male
age,5,15,25,35,45
admission_type_id,6,1,1,1,1
discharge_disposition_id,25,1,1,1,1
time_in_hospital,1,3,2,2,1
medical_specialty,Pediatrics-Endocrinology,0,0,0,0
num_lab_procedures,41,59,11,44,51
num_procedures,0,0,5,1,0
num_medications,1,18,13,16,8


In [118]:
#checking the rows and columns
diabetic_data.shape

(99493, 44)

In [120]:
#identifying the categorical columns
diabetic_data.select_dtypes(include=['object', 'category'])

Unnamed: 0,race,gender,medical_specialty,diag_1,diag_2,diag_3,metformin,repaglinide,nateglinide,chlorpropamide,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,Pediatrics-Endocrinology,250.83,0,0,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,0,276,250.01,255,No,No,No,No,...,No,Up,No,No,No,No,No,Yes,Yes,>30
2,AfricanAmerican,Female,0,648,250,V27,No,No,No,No,...,No,No,No,No,No,No,No,No,Yes,No
3,Caucasian,Male,0,8,250.43,403,No,No,No,No,...,No,Up,No,No,No,No,No,Yes,Yes,No
4,Caucasian,Male,0,197,157,250,No,No,No,No,...,No,Steady,No,No,No,No,No,Yes,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99488,AfricanAmerican,Male,0,250.13,291,458,Steady,No,No,No,...,No,Down,No,No,No,No,No,Yes,Yes,>30
99489,AfricanAmerican,Female,0,560,276,787,No,No,No,No,...,No,Steady,No,No,No,No,No,No,Yes,No
99490,Caucasian,Male,0,38,590,296,Steady,No,No,No,...,No,Down,No,No,No,No,No,Yes,Yes,No
99491,Caucasian,Female,Surgery-General,996,285,998,No,No,No,No,...,No,Up,No,No,No,No,No,Yes,Yes,No


In [122]:
# Convert categorical variables into dummy/indicator variables to enable machine learning models to interpret them.
diabetic_data_dummies = pd.get_dummies(diabetic_data, drop_first=True)

In [124]:
#checking the top 5 rows of the dataset
diabetic_data_dummies.head()

Unnamed: 0,age,admission_type_id,discharge_disposition_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-pioglitazone_Steady,change_Yes,diabetesMed_Yes,readmitted_>30,readmitted_No
0,5,6,25,1,41,0,1,0,0,0,...,True,False,False,False,False,False,False,False,False,True
1,15,1,1,3,59,0,18,0,0,0,...,True,False,False,False,False,False,True,True,True,False
2,25,1,1,2,11,5,13,2,0,1,...,True,False,False,False,False,False,False,True,False,True
3,35,1,1,2,44,1,16,0,0,0,...,True,False,False,False,False,False,True,True,False,True
4,45,1,1,1,51,0,8,0,0,0,...,True,False,False,False,False,False,True,True,False,True


In [126]:
#identifying the numeric columns
numeric_cols = diabetic_data_dummies.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [132]:
#incase our targeted column is numeric
numeric_cols = [col for col in numeric_cols if col != 'readmitted']

In [134]:
diabetic_data_dummies[numeric_cols].head()

Unnamed: 0,age,admission_type_id,discharge_disposition_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted_binary
0,5,6,25,1,41,0,1,0,0,0,1,0
1,15,1,1,3,59,0,18,0,0,0,9,0
2,25,1,1,2,11,5,13,2,0,1,6,0
3,35,1,1,2,44,1,16,0,0,0,7,0
4,45,1,1,1,51,0,8,0,0,0,5,0


In [146]:
# Standardize numeric columns to ensure equal weighting across features.
# Especially useful for distance-based or gradient-based algorithms.scaler = StandardScaler()
diabetic_data_dummies[numeric_cols] = scaler.fit_transform(diabetic_data_dummies[numeric_cols])

In [138]:
#Sanity check after our scaling
diabetic_data_dummies[numeric_cols].describe().T[['mean', 'std']]

Unnamed: 0,mean,std
age,-2.18534e-16,1.000005
admission_type_id,1.828259e-17,1.000005
discharge_disposition_id,-2.856654e-18,1.000005
time_in_hospital,-9.027027000000001e-17,1.000005
num_lab_procedures,9.284126e-17,1.000005
num_procedures,1.542593e-17,1.000005
num_medications,1.384049e-16,1.000005
number_outpatient,2.3995900000000003e-17,1.000005
number_emergency,-2.028224e-17,1.000005
number_inpatient,-4.6991960000000005e-17,1.000005


In [140]:
#our feature X
X = diabetic_data_dummies.drop('readmitted_binary', axis=1)
#our target Y
y = diabetic_data_dummies['readmitted_binary']

In [148]:
# Split the dataset into training and test sets (80/20) to evaluate model performance on unseen data.
# Stratify on target to preserve class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [144]:
diabetic_data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,readmitted_binary
0,Caucasian,Female,5,6,25,1,Pediatrics-Endocrinology,41,0,1,...,No,No,No,No,No,No,No,No,No,0
1,Caucasian,Female,15,1,1,3,0,59,0,18,...,Up,No,No,No,No,No,Yes,Yes,>30,0
2,AfricanAmerican,Female,25,1,1,2,0,11,5,13,...,No,No,No,No,No,No,No,Yes,No,0
3,Caucasian,Male,35,1,1,2,0,44,1,16,...,Up,No,No,No,No,No,Yes,Yes,No,0
4,Caucasian,Male,45,1,1,1,0,51,0,8,...,Steady,No,No,No,No,No,Yes,Yes,No,0
