## Running the model with 2nd half data

In [1]:
!pip install tensorflow
!pip install pyspark



In [2]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.metrics import accuracy_score
import tensorflow as tf
import numpy as np

In [3]:
#import data into dataframe
diabetes2_df = pd.read_csv('https://raw.githubusercontent.com/ComfyKoala/diabetes-classification/main/JV/Resources/second_half.csv')
diabetes2_df.head()

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes
0,2015,Male,38.0,Mississippi,0,0,0,0,1,0,0,No Info,22.87,6.5,80,0
1,2015,Female,4.0,Mississippi,1,0,0,0,0,0,0,No Info,14.98,5.8,126,0
2,2016,Female,17.0,Mississippi,0,0,0,0,1,0,0,No Info,23.97,4.0,85,0
3,2015,Male,2.0,Mississippi,0,0,0,1,0,0,0,never,27.32,4.5,200,0
4,2016,Female,28.0,Mississippi,0,1,0,0,0,0,0,No Info,25.0,4.8,126,0


## Data prep

In [4]:
#renaming column names
diabetes2_df = diabetes2_df.rename(columns={
    'hbA1c_level': 'A1C',
    'blood_glucose_level': 'RBG',
    'hypertension': 'HTN',
    'race:AfricanAmerican': 'AfricanAmerican',
    'race:Asian': 'Asian',
    'race:Caucasian': 'Caucasian',
    'race:Hispanic': 'Hispanic',
    'race:Other': 'Other'
})
diabetes2_df.head()


Unnamed: 0,year,gender,age,location,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,smoking_history,bmi,A1C,RBG,diabetes
0,2015,Male,38.0,Mississippi,0,0,0,0,1,0,0,No Info,22.87,6.5,80,0
1,2015,Female,4.0,Mississippi,1,0,0,0,0,0,0,No Info,14.98,5.8,126,0
2,2016,Female,17.0,Mississippi,0,0,0,0,1,0,0,No Info,23.97,4.0,85,0
3,2015,Male,2.0,Mississippi,0,0,0,1,0,0,0,never,27.32,4.5,200,0
4,2016,Female,28.0,Mississippi,0,1,0,0,0,0,0,No Info,25.0,4.8,126,0


In [5]:
#dropping 'location' and 'smoking_history'
diabetes2_df=diabetes2_df.drop(columns=['location', 'smoking_history'])

In [6]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
#cutoff_value = 10
#counts = diabetes2_df['bmi'].value_counts()
#classifications_to_replace = counts[counts < cutoff_value].index.tolist()


# Replace in dataframe
#for cls in classifications_to_replace:
    #diabetes2_df['bmi'] = diabetes2_df['bmi'].replace(cls,"Other")

# Check to make sure replacement was successful
#diabetes2_df['bmi'].value_counts()

In [7]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = diabetes2_df.select_dtypes(include=['object']).columns
diabetes2_df = pd.get_dummies(diabetes2_df, columns=categorical_columns, drop_first=True)

In [9]:
diabetes2_df.head()

Unnamed: 0,year,age,AfricanAmerican,Asian,Caucasian,Hispanic,Other,HTN,heart_disease,bmi,A1C,RBG,diabetes,gender_Male,gender_Other
0,2015,38.0,0,0,0,0,1,0,0,22.87,6.5,80,0,1,0
1,2015,4.0,1,0,0,0,0,0,0,14.98,5.8,126,0,0,0
2,2016,17.0,0,0,0,0,1,0,0,23.97,4.0,85,0,0,0
3,2015,2.0,0,0,0,1,0,0,0,27.32,4.5,200,0,1,0
4,2016,28.0,0,1,0,0,0,0,0,25.0,4.8,126,0,0,0


In [10]:
#checking columns
print(diabetes2_df.columns)

Index(['year', 'age', 'AfricanAmerican', 'Asian', 'Caucasian', 'Hispanic',
       'Other', 'HTN', 'heart_disease', 'bmi', 'A1C', 'RBG', 'diabetes',
       'gender_Male', 'gender_Other'],
      dtype='object')


## Import model

In [19]:
#import model
model = load_model('diabetes_model.h5')



In [20]:
#Separate the features and target: Use the 'diabetes' column for the target variable.
X_new = diabetes2_df.drop(columns=['diabetes'])
y_new = diabetes2_df['diabetes'].astype(int)

In [21]:
diabetes2_df["diabetes"].dtype

dtype('int64')

In [22]:
#make predictions
predictions = model.predict(X_new)

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step


In [23]:
predictions.dtype

dtype('float32')

In [24]:
# Evaluate the accuracy of the predictions from the model
predictions_binary=predictions.astype(int)

In [25]:
accuracy = accuracy_score(y_new, predictions_binary)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 91.58%
