## Running the model with 2nd half data

In [1]:
!pip install tensorflow
!pip install pyspark



In [2]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.metrics import accuracy_score
import tensorflow as tf
import numpy as np

In [3]:
#import data into dataframe
diabetes2_df = pd.read_csv('https://raw.githubusercontent.com/ComfyKoala/diabetes-classification/main/JV/Resources/second_half.csv')
diabetes2_df.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2015,Male,38.0,Mississippi,0,0,0,0,1,0,0,No Info,22.87,6.5,80,0
1,2015,Female,4.0,Mississippi,1,0,0,0,0,0,0,No Info,14.98,5.8,126,0
2,2016,Female,17.0,Mississippi,0,0,0,0,1,0,0,No Info,23.97,4.0,85,0
3,2015,Male,2.0,Mississippi,0,0,0,1,0,0,0,never,27.32,4.5,200,0
4,2016,Female,28.0,Mississippi,0,1,0,0,0,0,0,No Info,25.0,4.8,126,0


## Data prep

In [4]:
#renaming column names
diabetes2_df = diabetes2_df.rename(columns={
    'hbA1c_level': 'A1C',
    'blood_glucose_level': 'RBG',
    'hypertension': 'HTN',
    'race:AfricanAmerican': 'AfricanAmerican',
    'race:Asian': 'Asian',
    'race:Caucasian': 'Caucasian',
    'race:Hispanic': 'Hispanic',
    'race:Other': 'Other'
})
diabetes2_df.head()


Unnamed: 0,year,gender,age,location,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,smoking_history,bmi,A1C,RBG,diabetes
0,2015,Male,38.0,Mississippi,0,0,0,0,1,0,0,No Info,22.87,6.5,80,0
1,2015,Female,4.0,Mississippi,1,0,0,0,0,0,0,No Info,14.98,5.8,126,0
2,2016,Female,17.0,Mississippi,0,0,0,0,1,0,0,No Info,23.97,4.0,85,0
3,2015,Male,2.0,Mississippi,0,0,0,1,0,0,0,never,27.32,4.5,200,0
4,2016,Female,28.0,Mississippi,0,1,0,0,0,0,0,No Info,25.0,4.8,126,0


In [5]:
#dropping 'location' and 'smoking_history'
diabetes2_df.drop(columns=['location', 'smoking_history'])

Unnamed: 0,year,gender,age,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,bmi,A1C,RBG,diabetes
0,2015,Male,38.0,0,0,0,0,1,0,0,22.87,6.5,80,0
1,2015,Female,4.0,1,0,0,0,0,0,0,14.98,5.8,126,0
2,2016,Female,17.0,0,0,0,0,1,0,0,23.97,4.0,85,0
3,2015,Male,2.0,0,0,0,1,0,0,0,27.32,4.5,200,0
4,2016,Female,28.0,0,1,0,0,0,0,0,25.00,4.8,126,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2018,Female,33.0,0,0,0,0,1,0,0,21.21,6.5,90,0
49996,2016,Female,80.0,0,1,0,0,0,0,0,36.66,5.7,100,0
49997,2018,Male,46.0,0,1,0,0,0,0,0,36.12,6.2,158,0
49998,2018,Female,51.0,1,0,0,0,0,0,0,29.29,6.0,155,0


In [6]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
cutoff_value = 10
counts = diabetes2_df['bmi'].value_counts()
classifications_to_replace = counts[counts < cutoff_value].index.tolist()


# Replace in dataframe
for cls in classifications_to_replace:
    diabetes2_df['bmi'] = diabetes2_df['bmi'].replace(cls,"Other")

# Check to make sure replacement was successful
diabetes2_df['bmi'].value_counts()

Unnamed: 0_level_0,count
bmi,Unnamed: 1_level_1
27.32,12717
Other,8448
24.96,56
23.0,55
22.05,53
...,...
26.94,10
17.17,10
26.61,10
30.91,10


In [7]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = diabetes2_df.select_dtypes(include=['object']).columns
diabetes2_df = pd.get_dummies(diabetes2_df, columns=categorical_columns, drop_first=True)

In [8]:
diabetes2_df.head()

Unnamed: 0,year,age,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,A1C,...,bmi_39.3,bmi_39.47,bmi_40.0,bmi_40.1,bmi_40.31,bmi_40.5,bmi_40.75,bmi_41.96,bmi_43.09,bmi_Other
0,2015,38.0,0,0,0,0,1,0,0,6.5,...,False,False,False,False,False,False,False,False,False,False
1,2015,4.0,1,0,0,0,0,0,0,5.8,...,False,False,False,False,False,False,False,False,False,False
2,2016,17.0,0,0,0,0,1,0,0,4.0,...,False,False,False,False,False,False,False,False,False,False
3,2015,2.0,0,0,0,1,0,0,0,4.5,...,False,False,False,False,False,False,False,False,False,False
4,2016,28.0,0,1,0,0,0,0,0,4.8,...,False,False,False,False,False,False,False,False,False,False


## Import model

In [9]:
#import model
model = load_model('diabetes_model.h5')



In [10]:
#pass diabetes2_df into model
predictions = model.predict(diabetes2_df)

ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 1562, but received input with shape (32, 1592)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 1592), dtype=float32)
  • training=False
  • mask=None