## Simple Implementation of Deep Learning (Neural Network)

In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df.head()

Unnamed: 0,id,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
0,1,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,...,No,No,No,No,No,No,No,No,No,NO
1,2,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,3,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,...,No,No,No,No,No,No,No,No,Yes,NO
3,4,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,5,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
df.shape

(101766, 51)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 51 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   id                        101766 non-null  int64 
 1   encounter_id              101766 non-null  int64 
 2   patient_nbr               101766 non-null  int64 
 3   race                      101766 non-null  object
 4   gender                    101766 non-null  object
 5   age                       101766 non-null  object
 6   weight                    101766 non-null  object
 7   admission_type_id         101766 non-null  int64 
 8   discharge_disposition_id  101766 non-null  int64 
 9   admission_source_id       101766 non-null  int64 
 10  time_in_hospital          101766 non-null  int64 
 11  payer_code                101766 non-null  object
 12  medical_specialty         101766 non-null  object
 13  num_lab_procedures        101766 non-null  int64 
 14  num_

In [7]:
df.isna().sum()

id                          0
encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
weight                      0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
payer_code                  0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamid

In [9]:
for col in df.columns:
    if df[col].dtype == 'object':
        print(f'Column {col} has {df[col].nunique()} variables. The variables are:')
        print(df[col].unique())
        print()
        print()

Column race has 6 variables. The variables are:
['Caucasian' 'AfricanAmerican' '?' 'Other' 'Asian' 'Hispanic']


Column gender has 3 variables. The variables are:
['Female' 'Male' 'Unknown/Invalid']


Column age has 10 variables. The variables are:
['[0-10)' '[10-20)' '[20-30)' '[30-40)' '[40-50)' '[50-60)' '[60-70)'
 '[70-80)' '[80-90)' '[90-100)']


Column weight has 10 variables. The variables are:
['?' '[75-100)' '[50-75)' '[0-25)' '[100-125)' '[25-50)' '[125-150)'
 '[175-200)' '[150-175)' '>200']


Column payer_code has 18 variables. The variables are:
['?' 'MC' 'MD' 'HM' 'UN' 'BC' 'SP' 'CP' 'SI' 'DM' 'CM' 'CH' 'PO' 'WC' 'OT'
 'OG' 'MP' 'FR']


Column medical_specialty has 73 variables. The variables are:
['Pediatrics-Endocrinology' '?' 'InternalMedicine'
 'Family/GeneralPractice' 'Cardiology' 'Surgery-General' 'Orthopedics'
 'Gastroenterology' 'Surgery-Cardiovascular/Thoracic' 'Nephrology'
 'Orthopedics-Reconstructive' 'Psychiatry' 'Emergency/Trauma'
 'Pulmonology' 'Surgery-Neuro

['No']


Column citoglipton has 1 variables. The variables are:
['No']


Column insulin has 4 variables. The variables are:
['No' 'Up' 'Steady' 'Down']


Column glyburide.metformin has 4 variables. The variables are:
['No' 'Steady' 'Down' 'Up']


Column glipizide.metformin has 2 variables. The variables are:
['No' 'Steady']


Column glimepiride.pioglitazone has 2 variables. The variables are:
['No' 'Steady']


Column metformin.rosiglitazone has 2 variables. The variables are:
['No' 'Steady']


Column metformin.pioglitazone has 2 variables. The variables are:
['No' 'Steady']


Column change has 2 variables. The variables are:
['No' 'Ch']


Column diabetesMed has 2 variables. The variables are:
['No' 'Yes']


Column readmitted has 3 variables. The variables are:
['NO' '>30' '<30']




In [10]:
df = df.replace('?', np.NaN)

In [11]:
df.isna().sum()

id                              0
encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride   

In [12]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])

In [13]:
df.isna().sum()

id                          0
encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
weight                      0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
payer_code                  0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamid

# Preprocessing

In [14]:
y = df['diabetesMed']
X = df.drop(['diabetesMed','readmitted', 'id', 'encounter_id', 'patient_nbr' ], axis = 1)

In [16]:
X

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,tolazamide,examide,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change
0,Caucasian,Female,[0-10),[75-100),6,25,1,1,MC,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,[10-20),[75-100),1,1,7,3,MC,InternalMedicine,...,No,No,No,Up,No,No,No,No,No,Ch
2,AfricanAmerican,Female,[20-30),[75-100),1,1,7,2,MC,InternalMedicine,...,No,No,No,No,No,No,No,No,No,No
3,Caucasian,Male,[30-40),[75-100),1,1,7,2,MC,InternalMedicine,...,No,No,No,Up,No,No,No,No,No,Ch
4,Caucasian,Male,[40-50),[75-100),1,1,7,1,MC,InternalMedicine,...,No,No,No,Steady,No,No,No,No,No,Ch
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),[75-100),1,3,7,3,MC,InternalMedicine,...,No,No,No,Down,No,No,No,No,No,Ch
101762,AfricanAmerican,Female,[80-90),[75-100),1,4,5,5,MC,InternalMedicine,...,No,No,No,Steady,No,No,No,No,No,No
101763,Caucasian,Male,[70-80),[75-100),1,1,7,1,MC,InternalMedicine,...,No,No,No,Down,No,No,No,No,No,Ch
101764,Caucasian,Female,[80-90),[75-100),2,3,7,10,MC,Surgery-General,...,No,No,No,Up,No,No,No,No,No,Ch


In [17]:
le = LabelEncoder()

In [18]:
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

In [19]:
X

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,tolazamide,examide,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change
0,2,0,0,8,6,25,1,1,7,37,...,0,0,0,1,1,0,0,0,0,1
1,2,0,1,8,1,1,7,3,7,18,...,0,0,0,3,1,0,0,0,0,0
2,0,0,2,8,1,1,7,2,7,18,...,0,0,0,1,1,0,0,0,0,1
3,2,1,3,8,1,1,7,2,7,18,...,0,0,0,3,1,0,0,0,0,0
4,2,1,4,8,1,1,7,1,7,18,...,0,0,0,2,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,0,1,7,8,1,3,7,3,7,18,...,0,0,0,0,1,0,0,0,0,0
101762,0,0,8,8,1,4,5,5,7,18,...,0,0,0,2,1,0,0,0,0,1
101763,2,1,7,8,1,1,7,1,7,18,...,0,0,0,0,1,0,0,0,0,0
101764,2,0,8,8,2,3,7,10,7,62,...,0,0,0,3,1,0,0,0,0,0


In [21]:
y = le.fit_transform(y)

In [23]:
y.shape

(101766,)

In [25]:
df['diabetesMed'].value_counts()

Yes    78363
No     23403
Name: diabetesMed, dtype: int64

# Split The Dataset

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3)

In [31]:
y_test.shape

(30530,)

In [32]:
# Initialize the SMOTE instance
sm = SMOTE()

In [33]:
X_train, y_train = sm.fit_resample(X_train, y_train)

In [34]:
X_train.shape

(109752, 46)

In [38]:
yes =0
no = 0

for value in y_train:
    if value == 0:
        no += 1
    else:
        yes +=1
            

In [39]:
print(yes, no)

54876 54876


## Scale The Train and Test data

In [40]:
sc = StandardScaler()

In [41]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)

In [42]:
X_train

array([[-1.90222516, -0.77388918,  0.66515364, ..., -0.00301853,
         0.        , -1.52804234],
       [ 0.47305661,  1.29197368,  1.31605589, ..., -0.00301853,
         0.        , -1.52804234],
       [ 0.47305661, -0.77388918, -1.93845538, ..., -0.00301853,
         0.        ,  0.65443213],
       ...,
       [ 0.47305661,  1.29197368, -0.63665087, ..., -0.00301853,
         0.        ,  0.65443213],
       [ 0.47305661, -0.77388918,  0.01425139, ..., -0.00301853,
         0.        ,  0.65443213],
       [-1.90222516, -0.77388918,  0.01425139, ..., -0.00301853,
         0.        ,  0.65443213]])

# Modelling

In [43]:
X_train.shape

(109752, 46)

In [45]:
model = keras.Sequential([
    keras.layers.Dense(16, input_shape =(46,), activation = 'relu' ),
    keras.layers.Dense(8,  activation = 'relu' ),
    keras.layers.Dense(1,  activation = 'sigmoid')    
])

In [46]:
model.compile(optimizer= 'adam', loss = 'binary_crossentropy', metrics= ['accuracy'])

In [48]:
model.fit(X_train, y_train, epochs= 5, batch_size = 8)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1b289d2c710>

In [50]:
model.evaluate(X_test, y_test)



[1.5815496112736582e-07, 1.0]

In [51]:
predicted = model.predict(X_test)



In [59]:
real_pred = []
for value in predicted:
    if value == 1:
        real_pred.append(1)
    else:
        real_pred.append(0)
    

In [61]:
set(real_pred)

{0, 1}

In [62]:
print('accuracy is :', accuracy_score(real_pred, y_test))

accuracy is : 0.9897477890599411
