In [50]:
# Import the modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
import tensorflow as tf
import ydf
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

## Dataset 2 - use one hot encoding with StandardScaler to deal with dataset

In [40]:
# Read in the cleaned/prepared csv file
# c:\Users\schre\Downloads\diabetes_prediction_cleaned_sn.csv
dm_prediction_df = pd.read_csv("Resources/diabetes_prediction_cleaned_one_hot_encoding.csv")
dm_prediction_df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,hba1c_level,blood_glucose_level,diabetes_status,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1.691761,0,1,-0.282234,0.787897,0.047704,0,1,0,0,0,0,0,0,1,0
1,0.538015,0,0,0.018883,0.787897,-1.42621,0,1,0,0,1,0,0,0,0,0
2,-0.61573,0,0,0.018883,-0.120279,0.489878,0,0,1,0,0,0,0,0,1,0
3,-0.260731,0,0,-0.58335,-0.120279,0.416183,0,1,0,0,0,1,0,0,0,0
4,1.514261,1,1,-1.035025,-1.028455,0.416183,0,0,1,0,0,1,0,0,0,0


# Create the labels set (y) from the “diabetes_status” column, and then create the features (X) DataFrame from the remaining columns.

In [41]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = dm_prediction_df['diabetes_status']

# Separate the X variable, the features
X = dm_prediction_df.drop(columns=['diabetes_status'])

In [42]:
# Review the y variable Series
display(y.head())
display(y.tail())

0    0
1    0
2    0
3    0
4    0
Name: diabetes_status, dtype: int64

99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes_status, dtype: int64

In [43]:
# Review the X variable DataFrame
display(X.head())
display(X.tail())

Unnamed: 0,age,hypertension,heart_disease,bmi,hba1c_level,blood_glucose_level,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,1.691761,0,1,-0.282234,0.787897,0.047704,1,0,0,0,0,0,0,1,0
1,0.538015,0,0,0.018883,0.787897,-1.42621,1,0,0,1,0,0,0,0,0
2,-0.61573,0,0,0.018883,-0.120279,0.489878,0,1,0,0,0,0,0,1,0
3,-0.260731,0,0,-0.58335,-0.120279,0.416183,1,0,0,0,1,0,0,0,0
4,1.514261,1,1,-1.035025,-1.028455,0.416183,0,1,0,0,1,0,0,0,0


Unnamed: 0,age,hypertension,heart_disease,bmi,hba1c_level,blood_glucose_level,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
99995,1.691761,0,0,0.018883,0.787897,-1.180558,1,0,0,1,0,0,0,0,0
99996,-1.769475,0,0,-1.4867,0.787897,-0.934905,1,0,0,1,0,0,0,0,0
99997,1.070513,0,0,0.018883,-0.120279,0.416183,0,1,0,0,0,0,1,0,0
99998,-0.793229,0,0,1.22335,-1.028455,-0.934905,1,0,0,0,0,0,0,1,0
99999,0.67114,0,0,-0.733909,0.787897,-1.180558,1,0,0,0,1,0,0,0,0


In [44]:
print(X.shape)

(100000, 15)


## Split the data into training and testing datasets by using train_test_split.

In [45]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split. Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

# Create a Support Vector Machine Model with the Original Data
## Fit a Support Vector Machine model by using the training data (X_train and y_train).

In [27]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

## Make predictions on the test set and compute accuracy / classification report

In [28]:
predictions = model.predict(X_test)
predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     22875
           1       0.93      0.56      0.70      2125

    accuracy                           0.96     25000
   macro avg       0.95      0.78      0.84     25000
weighted avg       0.96      0.96      0.95     25000



## Try a nonlinear kernel for the support vector classifier

In [30]:
model = SVC(kernel='poly')
model.fit(X_train, y_train)

predictions = model.predict(X_test)
predictions

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     22875
           1       1.00      0.62      0.76      2125

    accuracy                           0.97     25000
   macro avg       0.98      0.81      0.87     25000
weighted avg       0.97      0.97      0.96     25000



## Neural Network Models

In [44]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = 15
hidden_nodes_layer1 =  30
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [45]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [46]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=100)

Epoch 1/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 376us/step - accuracy: 0.9243 - loss: 0.2005
Epoch 2/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 372us/step - accuracy: 0.9636 - loss: 0.1128
Epoch 3/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 374us/step - accuracy: 0.9660 - loss: 0.1010
Epoch 4/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 378us/step - accuracy: 0.9670 - loss: 0.0985
Epoch 5/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 390us/step - accuracy: 0.9675 - loss: 0.0947
Epoch 6/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 372us/step - accuracy: 0.9679 - loss: 0.0918
Epoch 7/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 379us/step - accuracy: 0.9680 - loss: 0.0931
Epoch 8/100
[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 376us/step - accuracy: 0.9692 - loss: 0.0897


In [47]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

782/782 - 0s - 393us/step - accuracy: 0.9676 - loss: 0.0943
Loss: 0.09434571117162704, Accuracy: 0.9676399827003479


## Step 2 - using keras tuner to select the highest performing model

In [48]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=2), activation=activation, input_dim=15))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [49]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [50]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train,y_train,epochs=20,validation_data=(X_test,y_test))

Trial 60 Complete [00h 00m 33s]
val_accuracy: 0.959559977054596

Best val_accuracy So Far: 0.968280017375946
Total elapsed time: 00h 13m 01s


In [51]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 19,
 'num_layers': 2,
 'units_0': 5,
 'units_1': 5,
 'units_2': 7,
 'units_3': 5,
 'units_4': 9,
 'units_5': 9,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 2,
 'tuner/round': 2,
 'tuner/trial_id': '0012'}

In [52]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  model.build_from_config(
  saveable.load_own_variables(weights_store.get(inner_path))


782/782 - 0s - 463us/step - accuracy: 0.9683 - loss: 0.0930
Loss: 0.09303949028253555, Accuracy: 0.968280017375946


## Yggdrasil decision tree model

In [27]:
#need X_train, y_train combined into one dataframe
train_df=pd.concat([X_train,y_train],axis=1)
train_df.head()

test_df=pd.concat([X_test,y_test],axis=1)
test_df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,hba1c_level,blood_glucose_level,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,diabetes_status
39679,-0.837604,0,0,-0.131675,-1.028455,0.539009,0,1,0,0,0,0,0,1,0,0
56089,-1.547601,0,0,-1.637259,-1.028455,0.539009,0,1,0,1,0,0,0,0,0,0
86698,1.070513,0,0,-0.131675,-1.028455,0.514444,0,1,0,1,0,0,0,0,0,0
64026,-1.769475,0,0,-2.088934,0.787897,-1.42621,1,0,0,1,0,0,0,0,0,0
33740,-0.216357,0,0,4.234517,-1.028455,1.521618,0,1,0,0,0,1,0,0,0,0


In [28]:
model = ydf.RandomForestLearner(label='diabetes_status',
                                task=ydf.Task.CLASSIFICATION).train(train_df)

Train model on 75000 examples
Model trained in 0:00:01.364545


In [29]:
evaluation = model.evaluate(test_df)

print(evaluation)

accuracy: 0.96784
confusion matrix:
    label (row) \ prediction (col)
    +-------+-------+-------+
    |       |     0 |     1 |
    +-------+-------+-------+
    |     0 | 22862 |    13 |
    +-------+-------+-------+
    |     1 |   791 |  1334 |
    +-------+-------+-------+
characteristics:
    name: '1' vs others
    ROC AUC: 0.931697
    PR AUC: 0.834077
    Num thresholds: 182
loss: 0.36414
num examples: 25000
num examples (weighted): 25000



## Gaussian Naive Bayes

In [46]:
gnb = GaussianNB()
predictions = gnb.fit(X_train, y_train).predict(X_test)

In [47]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.61      0.75     22875
           1       0.18      0.96      0.31      2125

    accuracy                           0.64     25000
   macro avg       0.59      0.78      0.53     25000
weighted avg       0.92      0.64      0.71     25000



## XGBoost model

In [51]:
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# fit model
bst.fit(X_train, y_train)
# make predictions
predictions = bst.predict(X_test)

In [52]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     22875
           1       1.00      0.62      0.76      2125

    accuracy                           0.97     25000
   macro avg       0.98      0.81      0.87     25000
weighted avg       0.97      0.97      0.96     25000

