# 1. Prepare the environment

In [None]:
# List all NVIDIA GPUs available on this computer (or in the Colab session)
!nvidia-smi -L

In [None]:
# Import the core libraries and print their versions so results can be tied to a specific environment
import sys
print( f"Python {sys.version}\n" )

import numpy as np
print( f"NumPy {np.__version__}\n" )

import pandas as pd
print( f"Pandas {pd.__version__}\n" )

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output
%matplotlib inline

import tensorflow as tf
print( f"TensorFlow {tf.__version__}" )
# Show the image data format reported by the Keras backend
print( f"tf.keras.backend.image_data_format() = {tf.keras.backend.image_data_format()}" )

# Count the number of GPUs as detected by tensorflow
gpus = tf.config.list_physical_devices('GPU')
print( f"TensorFlow detected { len(gpus) } GPU(s):" )
# Print one line per detected GPU with its device name and type
for i, gpu in enumerate(gpus):
  print( f".... GPU No. {i}: Name = {gpu.name} , Type = {gpu.device_type}" )

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# 2. Prepare the data in NumPy 


1. Load train and test data
2. Data preprocessing
3. Format the data (e.g., shape, dtype) to suit the model's requirements (e.g., MLP)

In [None]:
# Download the diabetes health-indicators CSV from GitHub and load it into a DataFrame
# NOTE(review): no random seed is set in this cell
url = 'https://raw.githubusercontent.com/Bolympus1/DADS7202/main/diabetes_binary_health_indicators_BRFSS2015.csv'

df = pd.read_csv(url,sep=",")
# Preview the first rows
df.head()

In [None]:
# Histogram of every column
# The theme call had been fused onto the comment line above (lost newline), so it never ran.
sns.set_theme(style="white",palette="pastel")
df.hist(figsize=(20,15));

In [None]:
# One box plot per feature column (every column except the target), laid out on a 7x3 grid
feature_names = [name for name in df.columns if name != 'Diabetes_binary']

plt.figure(figsize=(15, 28))
for position, feature in enumerate(feature_names, start=1):
  plt.subplot(7, 3, position)
  sns.boxplot(x=feature, data=df)
plt.show()

Oversampling

In [None]:
# Separate the target column from the feature columns
y = df['Diabetes_binary']
X = df.drop(columns=['Diabetes_binary'])

# Show which features remain and how many rows/columns there are
print( X.columns )
print( X.shape )

In [None]:
## SMOTEENN
## Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours.

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [None]:
df_resampled = pd.concat([pd.DataFrame(y_resampled), pd.DataFrame(X_resampled)], axis=1)

In [None]:
df_resampled['Diabetes_binary'].value_counts().plot.pie(autopct='%.2f')
print(df_resampled['Diabetes_binary'].value_counts()) # 290,010

In [None]:
df = df_resampled

In [None]:
x = df.iloc[:,1:].to_numpy()
y = df.iloc[:,:1].to_numpy().squeeze()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=35) #80/20

In [None]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Derive the input width from the data instead of hard-coding it (21 features for this dataset)
input_dim = x_train.shape[1]     # the number of features per one input
output_dim = 1     # a single sigmoid output unit (binary target)

In [None]:
# Data format: data type
# Cast both feature matrices to float32, the default data type of most DL frameworks
x_train, x_test = ( features.astype(np.float32) for features in (x_train, x_test) )

# Report shape, dtype and value range of each split
print( f"x_train.shape={x_train.shape} , x_train.dtype={x_train.dtype} , min(x_train)={np.min(x_train)} , max(x_train)={np.max(x_train)}" )
print( f"x_test.shape={x_test.shape} , x_test.dtype={x_test.dtype} , min(x_test)={np.min(x_test)} , max(x_test)={np.max(x_test)}" )

# 3. Create the network architecture

In [None]:
model = tf.keras.models.Sequential()

# Input layer: one vector of input_dim features per sample
model.add( tf.keras.Input(shape=(input_dim,)) )

# Hidden layers: each Dense+ReLU layer is followed by batch normalization over the last axis.
# Dense layers use the default weight initialization and no regularization.
hidden_units = (512, 1024, 64, 32, 512, 512)
for layer_no, units in enumerate(hidden_units, start=1):
  model.add( tf.keras.layers.Dense(units, activation='relu', name=f'hidden{layer_no}') )
  model.add( tf.keras.layers.BatchNormalization(axis=-1, name=f'bn{layer_no}') )

# Dropout applied once, just before the output layer
model.add( tf.keras.layers.Dropout(0.2) )                        # drop rate = 20%

# Output layer: sigmoid activation over output_dim units
model.add( tf.keras.layers.Dense(output_dim, activation='sigmoid', name='output') )

model.summary()

# 4. Compile the model

Examples of compiling the model (only run one cell provided below)

In [None]:
# Compile + hyperparameter tuning
# Adam optimizer with learning rate 1e-4, binary cross-entropy loss, accuracy reported under the key 'acc'
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=adam_optimizer, loss="binary_crossentropy", metrics=['acc'])

In [None]:
# Record the wall-clock start time; it is subtracted from the end time after training to report the duration
from datetime import datetime
start_time = datetime.now()

# 5. Train the model on train set

In [None]:
# Save weights only, and only when validation accuracy reaches a new maximum.
# The file name embeds the epoch number and the validation loss of that epoch.
checkpoint_filepath = "bestmodel_epoch{epoch:02d}_valloss{val_loss:.2f}.hdf5"
checkpoint_options = dict(
    save_weights_only=True,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath, **checkpoint_options)

In [None]:
# Train for 300 epochs in batches of 512; 20% of the training arrays are held out for validation
# and the checkpoint callback runs after each epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=300,
    verbose=1,
    validation_split=0.2,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Report the wall-clock time elapsed since start_time was recorded
end_time = datetime.now()
elapsed = end_time - start_time
print(f'Duration: {elapsed}')

In [None]:
# Inspect the files in the working directory, including the saved checkpoints
# (long listing, sorted by modification date, newest first)
!ls -lt

In [None]:
# Check the keys in 'history' (the plotting cell below reads 'acc', 'val_acc', 'loss' and 'val_loss')
history.history.keys()

In [None]:
# Summarize history for accuracy and for loss: one figure per metric,
# each showing the training curve and the validation curve across epochs
curve_specs = (
    ('acc', 'accuracy', 'upper left'),
    ('loss', 'loss', 'upper right'),
)
for metric_key, metric_label, legend_loc in curve_specs:
  plt.figure(figsize=(15,5))
  plt.plot(history.history[metric_key])
  plt.plot(history.history['val_' + metric_key])
  plt.title('Train ' + metric_label)
  plt.ylabel(metric_label)
  plt.xlabel('epoch')
  plt.legend(['train', 'val'], loc=legend_loc)
  plt.grid()
  plt.show()

# 6. Evaluate the model on test set

In [None]:
# Evaluate the model (with the weights it holds after the last training epoch) on the test set
results = model.evaluate(x_test, y_test, batch_size=512)
# Print the metric names next to their values
print( f"{model.metrics_names} = {results}" )

# 7. Inference

In [None]:
# Test using the model on x_test[i]
i = 0
y_pred = model.predict( x_test[i].reshape(1,-1) )    # reshape x_test[i] from (n_feature,) to (1, n_feature)

print( f"{'#'*10} Input: x_test[{i}] {'#'*10}" )
print( f"shape={x_test[i].shape}\nvalue={x_test[i]}\n" )

print( f"{'#'*10} Ground truth: y_test[{i}] {'#'*10}" )
print( f"shape={y_test[i].shape}\nvalue={y_test[i]}\n" )

print( f"{'#'*10} Prediction: y_pred {'#'*10}" )
print( f"type={type(y_pred)}\ndtype={y_pred.dtype}\nshape={y_pred.shape}" )
print( f"value={y_pred}" )
# The model has a single sigmoid output, so y_pred has shape (1, 1) and np.argmax would
# always return 0. Threshold the probability at 0.5 to get the predicted class instead.
print( f"predicted class (y_pred > 0.5)={int(y_pred[0, 0] > 0.5)}" )

In [None]:
# extract the predicted probabilities
# model.predict returns one column per output unit; flatten to a 1-D vector with one probability per test sample
p_pred = model.predict(x_test)
p_pred = p_pred.flatten()
print(p_pred.round(4))


# extract the predicted class labels (probability above 0.5 -> class 1, otherwise class 0)
y_pred = np.where(p_pred > 0.5, 1, 0)
print(y_pred)

# confusion matrix of ground truth vs. predicted labels
print(confusion_matrix(y_test, y_pred))


# per-class precision / recall / F1, printed with 6 decimal places
print(classification_report(y_test, y_pred, digits=6))