In [1]:
!pip install tensorflow
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=eb3dbed6a33e17228afa47b74612510ac0b936ce6916d060f5e643fd3eaab23a
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
#Import Dependencies
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
#import data into dataframe
diabetes_df = pd.read_csv('https://raw.githubusercontent.com/ComfyKoala/diabetes-classification/main/JV/Resources/first_half.csv')
diabetes_df.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [4]:
#renaming column names
diabetes_df = diabetes_df.rename(columns={
    'hbA1c_level': 'A1C',
    'blood_glucose_level': 'RBG',
    'hypertension': 'HTN',
    'race:AfricanAmerican': 'AfricanAmerican',
    'race:Asian': 'Asian',
    'race:Caucasian': 'Caucasian',
    'race:Hispanic': 'Hispanic',
    'race:Other': 'Other'
})
diabetes_df.head()



Unnamed: 0,year,gender,age,location,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,smoking_history,bmi,A1C,RBG,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


## Prepping data for nn Model

In [5]:
#dropping 'location' and 'smoking_history'
diabetes_df=diabetes_df.drop(columns=['location', 'smoking_history'])

In [6]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
#cutoff_value = 10
#counts = diabetes_df['bmi'].value_counts()
#classifications_to_replace = counts[counts < cutoff_value].index.tolist()


# Replace in dataframe
#for cls in classifications_to_replace:
    #diabetes_df['bmi'] = diabetes_df['bmi'].replace(cls,"Other")

# Check to make sure replacement was successful
#diabetes_df['bmi'].value_counts()

# Binning was attempted but we kept running into issues with the 2nd dataset.

In [7]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = diabetes_df.select_dtypes(include=['object']).columns
diabetes_df = pd.get_dummies(diabetes_df, columns=categorical_columns, drop_first=True)

In [8]:
diabetes_df.head()

Unnamed: 0,year,age,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,bmi,A1C,RBG,diabetes,gender_Male,gender_Other
0,2020,32.0,0,0,0,0,1,0,0,27.32,5.0,100,0,False,False
1,2015,29.0,0,1,0,0,0,0,0,19.95,5.0,90,0,False,False
2,2015,18.0,0,0,0,0,1,0,0,23.76,4.8,160,0,True,False
3,2015,41.0,0,0,1,0,0,0,0,27.32,4.0,159,0,True,False
4,2016,52.0,1,0,0,0,0,0,0,23.75,6.5,90,0,False,False


In [9]:
#checking columns
print(diabetes_df.columns)

Index(['year', 'age', 'AfricanAmerican', 'Asian', 'Caucasian', 'Hispanic',
       'Other', 'HTN', 'heart_disease', 'bmi', 'A1C', 'RBG', 'diabetes',
       'gender_Male', 'gender_Other'],
      dtype='object')


In [10]:
diabetes_df["diabetes"].dtype

dtype('int64')

In [11]:
# Split our preprocessed data into our features and target arrays
X = diabetes_df.drop('diabetes', axis=1)
y = diabetes_df['diabetes']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate Model


In [13]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
layer1 = 9
layer2 = 3
layer3 = 5

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=layer1, input_dim=number_input_features, activation='relu')
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=layer2, activation='sigmoid'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=layer3, activation='sigmoid'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.7673 - loss: 0.4636
Epoch 2/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.9208 - loss: 0.1792
Epoch 3/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9591 - loss: 0.1391
Epoch 4/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9594 - loss: 0.1276
Epoch 5/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9592 - loss: 0.1241
Epoch 6/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9595 - loss: 0.1210
Epoch 7/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9602 - loss: 0.1189
Epoch 8/100
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9606 - loss: 0.1157
Epoch 9/100
[1

In [16]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

391/391 - 1s - 2ms/step - accuracy: 0.9682 - loss: 0.0953
Loss: 0.09531337767839432, Accuracy: 0.9681599736213684


In [17]:
#export model to h5 file
nn.save('diabetes_model.h5')



## 96.7% accuracy w/ 9.5% loss