In [1]:
!pip install tensorflow
!pip install pyspark



In [2]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.metrics import accuracy_score
import tensorflow as tf
import numpy as np

In [3]:
#import data into dataframe
diabetes_df = pd.read_csv('https://raw.githubusercontent.com/ComfyKoala/diabetes-classification/main/JV/Resources/first_half.csv')
diabetes_df.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [4]:
#renaming column names
diabetes_df = diabetes_df.rename(columns={
    'hbA1c_level': 'A1C',
    'blood_glucose_level': 'RBG',
    'hypertension': 'HTN',
    'race:AfricanAmerican': 'AfricanAmerican',
    'race:Asian': 'Asian',
    'race:Caucasian': 'Caucasian',
    'race:Hispanic': 'Hispanic',
    'race:Other': 'Other'
})
diabetes_df.head()



Unnamed: 0,year,gender,age,location,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,smoking_history,bmi,A1C,RBG,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


## Prepping data for nn Model

In [5]:
#dropping 'location' and 'smoking_history'
diabetes_df.drop(columns=['location', 'smoking_history'])

Unnamed: 0,year,gender,age,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,bmi,A1C,RBG,diabetes
0,2020,Female,32.0,0,0,0,0,1,0,0,27.32,5.0,100,0
1,2015,Female,29.0,0,1,0,0,0,0,0,19.95,5.0,90,0
2,2015,Male,18.0,0,0,0,0,1,0,0,23.76,4.8,160,0
3,2015,Male,41.0,0,0,1,0,0,0,0,27.32,4.0,159,0
4,2016,Female,52.0,1,0,0,0,0,0,0,23.75,6.5,90,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2016,Female,60.0,0,1,0,0,0,0,0,27.32,6.2,155,1
49996,2015,Male,33.0,0,1,0,0,0,0,0,24.92,4.8,140,0
49997,2016,Male,6.0,0,0,0,1,0,0,0,15.72,4.0,85,0
49998,2016,Male,67.0,1,0,0,0,0,0,0,31.78,5.0,80,0


In [6]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
cutoff_value = 10
counts = diabetes_df['bmi'].value_counts()
classifications_to_replace = counts[counts < cutoff_value].index.tolist()


# Replace in dataframe
for cls in classifications_to_replace:
    diabetes_df['bmi'] = diabetes_df['bmi'].replace(cls,"Other")

# Check to make sure replacement was successful
diabetes_df['bmi'].value_counts()

Unnamed: 0_level_0,count
bmi,Unnamed: 1_level_1
27.32,12778
Other,8713
25.0,62
22.4,56
24.5,52
...,...
22.18,10
20.99,10
33.38,10
24.35,10


In [7]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = diabetes_df.select_dtypes(include=['object']).columns
diabetes_df = pd.get_dummies(diabetes_df, columns=categorical_columns, drop_first=True)

In [11]:
#checking columns
print(diabetes_df.columns)

Index(['year', 'age', 'AfricanAmerican', 'Asian', 'Caucasian', 'Hispanic',
       'Other', 'HTN', 'heart_disease', 'A1C',
       ...
       'bmi_39.2', 'bmi_39.22', 'bmi_39.4', 'bmi_39.5', 'bmi_39.65',
       'bmi_39.75', 'bmi_40.0', 'bmi_40.02', 'bmi_41.15', 'bmi_Other'],
      dtype='object', length=1563)


In [12]:
# Split our preprocessed data into our features and target arrays
X = diabetes_df.drop('diabetes', axis=1)
y = diabetes_df['diabetes']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate Model


In [14]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
layer1 = 9
layer2 = 3
layer3 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=layer1, input_dim=number_input_features, activation='relu')
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=layer2, activation='sigmoid'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=layer2, activation='sigmoid'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.4212 - loss: 0.7609
Epoch 2/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9147 - loss: 0.3127
Epoch 3/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9126 - loss: 0.2692
Epoch 4/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9133 - loss: 0.2184
Epoch 5/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9510 - loss: 0.1511
Epoch 6/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9583 - loss: 0.1287
Epoch 7/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9627 - loss: 0.1166
Epoch 8/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9654 - loss: 0.1074
Epoch 9/100
[1m

In [17]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

391/391 - 1s - 2ms/step - accuracy: 0.9430 - loss: 0.2466
Loss: 0.24657724797725677, Accuracy: 0.9429600238800049


In [18]:
#export model to h5 file
nn.save('diabetes_model.h5')



## 94.5% accuracy w/ 24.4% loss