# **Diabetes Prediction With Deep Learning**

In [None]:
# import libraries
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import metrics
import plotly.express as px
import plotly.graph_objects as go
# Reproducibility: pick one seed value and apply it everywhere it is consumed.
import os

seed = 100
# environment variables only hold text, hence the str() conversion
os.environ.update(PYTHONHASHSEED=str(seed))
# one call that seeds Python, NumPy, and TensorFlow together
keras.utils.set_random_seed(seed)

In [None]:
# Read the training set from disk and take a first look at it
df = pd.read_csv(filepath_or_buffer='train.csv')
n_entries = len(df)
print(f'Number of entries: {n_entries}')
df.head(n=5)

Number of entries: 668


Unnamed: 0,id,A1,A2,A3,A4,A5,A6,A7,A8,class
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Tally the rows per class and lay the result out as a two-column frame
# (condition, count) that plotly can consume directly.
counts = (
    df['class']
    .value_counts()
    .rename_axis('condition')
    .reset_index(name='count')
)

# Bubble chart: one bubble per class, sized and coloured by its row count
fig = px.scatter(
    counts,
    x="condition",
    y="count",
    text="condition",
    hover_name="count",
    color="count",
    size="count",
    size_max=200,
    title="Diabetes countplot",
)

# Axis captions, canvas size and theme
fig.update_layout(
    xaxis_title="Positive or negative",
    yaxis_title="count",
    height=600,
    width=1000,
    template="plotly_dark",
)

# Render the figure
fig.show()

In [None]:
# The target vector is the `class` column; print how many rows each label has
y = df.loc[:, 'class']
class_balance = y.value_counts()
print(class_balance)

0    437
1    231
Name: class, dtype: int64


In [None]:
# Build the feature matrix: drop `id` (not needed as a model input) and
# `class` (the target, already extracted as `y`).
X = df.drop(columns=['id', 'class'])

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
X.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      668 non-null    int64  
 1   A2      668 non-null    int64  
 2   A3      668 non-null    int64  
 3   A4      668 non-null    int64  
 4   A5      668 non-null    int64  
 5   A6      668 non-null    float64
 6   A7      668 non-null    float64
 7   A8      668 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 41.9 KB
None


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


## Model Building
### Let's start with a simple model: 8 input features and 2 hidden layers

## Splitting the data

In [None]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the
# partition the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=100,
    test_size=0.3,
)
print(f'training data set size: {len(X_train)}')
print(f'validation data set size: {len(X_val)}')

training data set size: 467
validation data set size: 201


In [None]:
# Model 1: 8 inputs -> Dense(12, relu) -> Dense(8, relu) -> Dense(1, sigmoid)
model1 = keras.Sequential([
    layers.Dense(12, input_dim=8, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                108       
                                                                 
 dense_1 (Dense)             (None, 8)                 104       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training configuration: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit for 100 epochs in mini-batches of 45, passing the held-out split as
# validation data.
model1.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=45,
    validation_data=(X_val, y_val),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f68a0cc63a0>

## Let's calculate the validation accuracy

In [None]:
# Score the validation rows with model1 and peek at the first 20 outputs
y_pred = model1.predict(x=X_val)
print(y_pred[:20])

[[0.18297721]
 [0.09044032]
 [0.16465206]
 [0.24072677]
 [0.40788078]
 [0.00923567]
 [0.4318643 ]
 [0.22168751]
 [0.3195449 ]
 [0.11932524]
 [0.27842098]
 [0.11713725]
 [0.05053964]
 [0.19609019]
 [0.22153142]
 [0.3060912 ]
 [0.7089891 ]
 [0.2467063 ]
 [0.21907848]
 [0.17753765]]


In [None]:
# Threshold the scores at 0.5: strictly greater becomes class 1, anything else 0.
# The flattened result is converted back to a plain list of Python ints.
y_pred_categorical = (y_pred.ravel() > 0.5).astype(int).tolist()
print(y_pred_categorical[:20])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]


In [None]:
# Fraction of validation rows whose thresholded prediction matches the label
accuracy = metrics.accuracy_score(y_true=y_val, y_pred=y_pred_categorical)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7263681592039801


In [None]:
# clear session: reset the global state Keras has accumulated so far, before
# the next model is defined
keras.backend.clear_session()

# set random seed: re-apply the same `seed` value that was set in the first
# cell of the notebook
keras.utils.set_random_seed(seed) # set all random seeds for the program (Python, NumPy, and TensorFlow)

## Model 2
### Wider hidden layers (64 and 32 neurons)

In [None]:
# Model 2: 8 inputs -> Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid)
model2 = keras.Sequential([
    layers.Dense(64, input_dim=8, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model2.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                576       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,689
Trainable params: 2,689
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training configuration: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit for 100 epochs in mini-batches of 45, passing the held-out split as
# validation data.
model2.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=45,
    validation_data=(X_val, y_val),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f68a0930c40>

In [None]:
# Score the validation rows with model2 and peek at the first 10 outputs
y_pred = model2.predict(x=X_val)
print(y_pred[:10])

[[2.4484600e-01]
 [2.1778463e-01]
 [6.5356886e-01]
 [2.6924753e-01]
 [9.5903951e-01]
 [5.6636939e-04]
 [6.6409713e-01]
 [2.9108113e-01]
 [7.9367244e-01]
 [1.2974343e-01]]


In [None]:
# Threshold the scores at 0.5: strictly greater becomes class 1, anything else 0.
# The flattened result is converted back to a plain list of Python ints.
y_pred_categorical = (y_pred.ravel() > 0.5).astype(int).tolist()
print(y_pred_categorical[:10])

[0, 0, 1, 0, 1, 0, 1, 0, 1, 0]


In [None]:
# Fraction of validation rows whose thresholded prediction matches the label
accuracy = metrics.accuracy_score(y_true=y_val, y_pred=y_pred_categorical)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6865671641791045


## Model 3
### Early Stopping

In [None]:
# clear session: reset the global state Keras has accumulated so far, before
# the next model is defined
keras.backend.clear_session()

# set random seed: re-apply the same `seed` value that was set in the first
# cell of the notebook
keras.utils.set_random_seed(seed) # set all random seeds for the program (Python, NumPy, and TensorFlow)

In [None]:
# Early stopping: watch validation accuracy, stop after 7 epochs without
# improvement, and restore the best weights seen.
callback = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=7,
    restore_best_weights=True,
)

# Model 3: same layer layout as model1 (12 -> 8 -> 1)
model3 = keras.Sequential([
    layers.Dense(12, input_dim=8, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model3.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                108       
                                                                 
 dense_1 (Dense)             (None, 8)                 104       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training configuration: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model3.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Up to 100 epochs in mini-batches of 45; the early-stopping callback may end
# the run sooner.
model3.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=45,
    validation_data=(X_val, y_val),
    callbacks=[callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


<keras.callbacks.History at 0x7f689bc4d040>

In [None]:
# Score the validation rows with model3
y_pred = model3.predict(x=X_val)

# Threshold at 0.5 (strictly greater -> 1) and keep a plain list of ints
y_pred_categorical = (y_pred.ravel() > 0.5).astype(int).tolist()
print(y_pred_categorical[:10])

# Fraction of validation rows predicted correctly
accuracy = metrics.accuracy_score(y_true=y_val, y_pred=y_pred_categorical)
print(f'Accuracy: {accuracy}')

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Accuracy: 0.6616915422885572


## Model 4
### L1 and L2 Regularization

In [None]:
# Import from tensorflow.keras, like `layers` at the top of the notebook, so
# the model does not mix objects from two different Keras packages.
from tensorflow.keras import regularizers

# Start from a clean Keras state and the shared seed, as is done before
# model2 and model3, so this model is built under the same conditions.
keras.backend.clear_session()
keras.utils.set_random_seed(seed)

# Model 4: three hidden layers (12 -> 8 -> 4), each with a combined L1 + L2
# penalty on its kernel weights.
model4 = keras.Sequential()
model4.add(layers.Dense(12, input_dim=8, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.001, l2=0.001)))
model4.add(layers.Dense(8, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.001, l2=0.001)))
model4.add(layers.Dense(4, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.001, l2=0.001)))
# The output must be a probability: binary_crossentropy and the 0.5 decision
# threshold used downstream both assume values in [0, 1]. A relu output is
# unbounded above and clamps to exactly 0 below, so sigmoid is used here, as
# in the other three models.
model4.add(layers.Dense(1, activation='sigmoid'))

model4.summary()

# Compile the model
model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                108       
                                                                 
 dense_1 (Dense)             (None, 8)                 104       
                                                                 
 dense_2 (Dense)             (None, 4)                 36        
                                                                 
 dense_3 (Dense)             (None, 1)                 5         
                                                                 
Total params: 253
Trainable params: 253
Non-trainable params: 0
_________________________________________________________________


In [None]:
# No early stopping here: all 125 epochs run, in mini-batches of 42, with
# the held-out split passed as validation data.
model4.fit(
    x=X_train,
    y=y_train,
    epochs=125,
    batch_size=42,
    validation_data=(X_val, y_val),
)

Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78

<keras.callbacks.History at 0x7fdcedf82e80>

In [None]:
# Score the validation rows with model4
y_pred = model4.predict(x=X_val)

# Threshold at 0.5 (strictly greater -> 1) and keep a plain list of ints
y_pred_categorical = (y_pred.ravel() > 0.5).astype(int).tolist()
print(y_pred_categorical[:10])

# Fraction of validation rows predicted correctly
accuracy = metrics.accuracy_score(y_true=y_val, y_pred=y_pred_categorical)
print(f'Accuracy: {accuracy}')

[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]
Accuracy: 0.8208955223880597


#### Summary

|Model | Hidden layers (neurons per layer) | Epochs |Batch size|Early Stopping| Accuracy|
|------|---------------|------------|---------|--------|--------|
|M1 | 2 (12, 8) | 100|45|No| 0.7264|
|M2 | 2 (64, 32) | 100|45|No| 0.6866|
|M3 | 2 (12, 8) | 100 max (stopped after 8)|45|Yes (patience 7)| 0.6617|
|M4 | 3 (12, 8, 4) | 125|42|No, with L1 and L2 Regularization| 0.8209|

## Predict

In [None]:
# Read the unlabeled test set
df_test = pd.read_csv('test.csv')

# summarise the details
print(f'Number of entries: {len(df_test)}')

# Drop `id` so the test features have the same columns as the training features
X_test = df_test.drop(columns=['id'])

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
X_test.info()

Number of entries: 100
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      100 non-null    int64  
 1   A2      100 non-null    int64  
 2   A3      100 non-null    int64  
 3   A4      100 non-null    int64  
 4   A5      100 non-null    int64  
 5   A6      100 non-null    float64
 6   A7      100 non-null    float64
 7   A8      100 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 6.4 KB
None


In [None]:
# Score every test row with model4 and show the raw outputs
test_pred = model4.predict(x=X_test)
print(test_pred)

# Threshold at 0.5 (strictly greater -> 1) and keep a plain list of ints
test_pred_categorical = (test_pred.ravel() > 0.5).astype(int).tolist()
print(test_pred_categorical)


[[0.41292864]
 [0.6882069 ]
 [0.8273219 ]
 [0.07378696]
 [0.27512586]
 [0.4985994 ]
 [0.26091456]
 [0.8078294 ]
 [0.49760455]
 [0.1316018 ]
 [0.39264482]
 [0.16876067]
 [0.        ]
 [0.9194529 ]
 [0.20593901]
 [0.31075448]
 [0.09307902]
 [0.36548233]
 [0.23612909]
 [0.2770449 ]
 [0.26270628]
 [0.5252879 ]
 [0.17658709]
 [0.7290636 ]
 [0.3893944 ]
 [0.6423561 ]
 [0.        ]
 [0.48078978]
 [0.6586178 ]
 [0.29798377]
 [0.32806903]
 [0.46343052]
 [0.36294013]
 [0.46863353]
 [0.6328267 ]
 [0.33841938]
 [0.16748242]
 [0.        ]
 [0.        ]
 [0.4034347 ]
 [0.6783767 ]
 [0.27450132]
 [0.48341995]
 [0.3699587 ]
 [1.0486205 ]
 [0.312001  ]
 [0.12916245]
 [0.85423994]
 [0.7248579 ]
 [0.20992891]
 [0.30959392]
 [0.4736246 ]
 [0.13721918]
 [0.3528223 ]
 [0.5274674 ]
 [0.33057368]
 [0.16183977]
 [0.3327793 ]
 [0.30218327]
 [0.367445  ]
 [0.4318698 ]
 [0.1056485 ]
 [0.3477416 ]
 [0.2661841 ]
 [0.81649244]
 [0.24296407]
 [0.09048007]
 [0.24014111]
 [0.13488041]
 [0.03497447]
 [0.2668293 ]
 [0.24

In [None]:
# Submission table: the test ids paired with their predicted class
df_test = df_test['id'].to_frame().assign(prediction=test_pred_categorical)
# Write it out as CSV, leaving the row index out of the file
df_test.to_csv(path_or_buf='/content/test-predictions.csv', index=False)

In [None]:
import json

import pandas as pd

# Re-read the predictions CSV, keeping only the two columns that go into
# the submission.
test_file_path = "/content/test-predictions.csv"
df_test = pd.read_csv(test_file_path).loc[:, ["id", "prediction"]]

# One record per row, with both values converted to plain Python ints
data = [
    {'id': int(row_id), 'prediction': int(label)}
    for row_id, label in zip(df_test['id'], df_test['prediction'])
]

print(data[:5])

# Write one JSON object per line, with no trailing newline after the last one
submission_file_path = "submission.json"
json_lines = [json.dumps(record) for record in data]
with open(submission_file_path, 'w') as fp:
    fp.write('\n'.join(json_lines))

[{'id': 1, 'prediction': 0}, {'id': 2, 'prediction': 1}, {'id': 3, 'prediction': 1}, {'id': 4, 'prediction': 0}, {'id': 5, 'prediction': 0}]
