Tensorflow exploration<br /><br />

Tensorflow went up to 82% model Accuracy on the test data, but was not as good as the other models in predicting diabetes in the training sets.

In [116]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.10.1


In [117]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn

from imblearn.under_sampling import NearMiss 
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import recall_score

In [118]:
def OverSample(df, target):
    over_sampler = SMOTE(random_state=42, sampling_strategy='minority')
    
    train_labels = df[target].copy()
    train_features = df.drop(columns=target)

    over_sampled_features, over_sampled_labels = over_sampler.fit_resample(train_features, train_labels)

    over_sampled_data = over_sampled_features
    over_sampled_data[target] = over_sampled_labels
    return over_sampled_data

In [119]:
df = pd.read_csv("diabetes_categories.csv")

In [120]:
df.head(3)

Unnamed: 0,diabetes,bmi,smoker,stroke,asthma,physical_activity,heavy_drinking,no_doctor_due_to_cost,any_healthcare_insurance,general_health_status,mental_health_status,physical_health_status,difficulty_walking,gender,age,education,income,race,routine_checkup,sleep_time,heart_related
0,No,Overweight,No,No,Never,No,Yes,No,Medicare,Excellent,0,0,No,Female,80 or older,High School,"$25,000 to < $35,000",White,Never,6.0,No
1,No,Overweight,No,No,Never,Yes,Yes,No,Employer,Very good,1-13,1-13,No,Female,55 to 59,Graduated College,"$100,000 to < $200,000",White,Within Last Year,5.0,No
2,No,Normal,Some,No,Current,Yes,Yes,No,Refused,Excellent,0,0,No,Female,Don’t know,High School,Don’t know,White,Within Last Year,7.0,No


In [121]:
pd.set_option("display.max_columns", 999)

In [123]:
df.columns

Index(['diabetes', 'bmi', 'smoker', 'stroke', 'asthma', 'physical_activity',
       'heavy_drinking', 'no_doctor_due_to_cost', 'any_healthcare_insurance',
       'general_health_status', 'mental_health_status',
       'physical_health_status', 'difficulty_walking', 'gender', 'age',
       'education', 'income', 'race', 'routine_checkup', 'sleep_time',
       'heart_related'],
      dtype='object')

In [124]:
def PrepareData(df):
    
    #Update binary data
    df.loc[df.diabetes== "Yes", "diabetes"] = 1
    df.loc[df.diabetes=="No", "diabetes"] = 0
    
    df.loc[df.heart_related=="Yes","heart_related"] = 1
    df.loc[df.heart_related=="No","heart_related"] = 0
           
           
    categorical_columns = [ 'bmi', 'smoker', 'stroke', 'asthma', 'physical_activity',
       'heavy_drinking', 'no_doctor_due_to_cost', 'any_healthcare_insurance',
       'general_health_status', 'mental_health_status',
       'physical_health_status', 'difficulty_walking', 'gender', 'age',
       'education', 'income', 'race', 'routine_checkup', 'sleep_time']
    
    df = pd.get_dummies(df, columns=categorical_columns, prefix=categorical_columns,dtype=float)
    
    #Change everything to floats    
    df = df.astype(float)

    return df

prepared = PrepareData(df)

In [125]:
mpl.rcParams['figure.figsize'] = (6, 5)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [126]:
prepared.head()

Unnamed: 0,diabetes,heart_related,bmi_Normal,bmi_Obese,bmi_Overweight,bmi_Under,smoker_Current,smoker_Former,smoker_No,smoker_Some,stroke_No,stroke_Refused,stroke_Unsure,stroke_Yes,asthma_Current,asthma_Don't know,asthma_Former,asthma_Never,physical_activity_Don't know,physical_activity_No,physical_activity_Yes,heavy_drinking_Don't know,heavy_drinking_No,heavy_drinking_Yes,no_doctor_due_to_cost_No,no_doctor_due_to_cost_Refused,no_doctor_due_to_cost_Unsure,no_doctor_due_to_cost_Yes,any_healthcare_insurance_CHIP,any_healthcare_insurance_Employer,any_healthcare_insurance_Indian,any_healthcare_insurance_Medicaid,any_healthcare_insurance_Medicare,any_healthcare_insurance_Medigap,any_healthcare_insurance_Military,any_healthcare_insurance_No,any_healthcare_insurance_Other,any_healthcare_insurance_Private,any_healthcare_insurance_Refused,any_healthcare_insurance_State,any_healthcare_insurance_Unsure,general_health_status_Don’t know,general_health_status_Excellent,general_health_status_Fair,general_health_status_Good,general_health_status_Poor,general_health_status_Refused,general_health_status_Very good,mental_health_status_0,mental_health_status_1-13,mental_health_status_14+,mental_health_status_Unknown,physical_health_status_0,physical_health_status_1-13,physical_health_status_14+,physical_health_status_Unknown,difficulty_walking_Don’t know,difficulty_walking_No,difficulty_walking_Refused,difficulty_walking_Yes,gender_Female,gender_Male,age_18 to 24,age_25 to 29,age_30 to 34,age_35 to 39,age_40 to 44,age_45 to 49,age_50 to 54,age_55 to 59,age_60 to 64,age_65 to 69,age_70 to 74,age_75 to 79,age_80 or older,age_Don’t know,education_Graduated College,education_High School,education_Middle School,education_Not sure,education_Some College,"income_$100,000 to < $200,000","income_$15,000 to < $25,000","income_$25,000 to < $35,000","income_$35,000 to < $50,000","income_$50,000 to < $100,000","income_< $15,000","income_> $200,000",income_Don’t know,race_American Indian or Alaskan Native,race_Asian,race_Black or African American,race_Don’t know,race_Multiracial,race_Native Hawaiian or other Pacific Islander,race_No race,race_Refused,race_White,routine_checkup_5 or more years,routine_checkup_Don't know,routine_checkup_Last 2 Years,routine_checkup_Last 5 Years,routine_checkup_Never,routine_checkup_Refused,routine_checkup_Within Last Year,sleep_time_1.0,sleep_time_10.0,sleep_time_11.0,sleep_time_12.0,sleep_time_13.0,sleep_time_14.0,sleep_time_15.0,sleep_time_16.0,sleep_time_17.0,sleep_time_18.0,sleep_time_19.0,sleep_time_2.0,sleep_time_20.0,sleep_time_21.0,sleep_time_22.0,sleep_time_23.0,sleep_time_24.0,sleep_time_3.0,sleep_time_4.0,sleep_time_5.0,sleep_time_6.0,sleep_time_7.0,sleep_time_8.0,sleep_time_9.0,sleep_time_Don't Know,sleep_time_Refused
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [131]:
train_df, test_df = train_test_split(prepared, test_size=0.2)

In [132]:
train_labels = train_df['diabetes']
train_features = train_df.drop(columns='diabetes')

over_sampler = RandomOverSampler(random_state=42)
X_over, y_over = over_sampler.fit_resample(train_features, train_labels)
sampled_data = X_over
sampled_data['diabetes'] = y_over
sampled_data.diabetes.value_counts()

  sampled_data['diabetes'] = y_over


diabetes
0.0    214086
1.0    214086
Name: count, dtype: int64

In [129]:
#We might need to reduce years smoked into categories
sampled_data.describe()


Unnamed: 0,heart_related,bmi_Normal,bmi_Obese,bmi_Overweight,bmi_Under,smoker_Current,smoker_Former,smoker_No,smoker_Some,stroke_No,stroke_Refused,stroke_Unsure,stroke_Yes,asthma_Current,asthma_Don't know,asthma_Former,asthma_Never,physical_activity_Don't know,physical_activity_No,physical_activity_Yes,heavy_drinking_Don't know,heavy_drinking_No,heavy_drinking_Yes,no_doctor_due_to_cost_No,no_doctor_due_to_cost_Refused,no_doctor_due_to_cost_Unsure,no_doctor_due_to_cost_Yes,any_healthcare_insurance_CHIP,any_healthcare_insurance_Employer,any_healthcare_insurance_Indian,any_healthcare_insurance_Medicaid,any_healthcare_insurance_Medicare,any_healthcare_insurance_Medigap,any_healthcare_insurance_Military,any_healthcare_insurance_No,any_healthcare_insurance_Other,any_healthcare_insurance_Private,any_healthcare_insurance_Refused,any_healthcare_insurance_State,any_healthcare_insurance_Unsure,general_health_status_Don’t know,general_health_status_Excellent,general_health_status_Fair,general_health_status_Good,general_health_status_Poor,general_health_status_Refused,general_health_status_Very good,mental_health_status_0,mental_health_status_1-13,mental_health_status_14+,mental_health_status_Unknown,physical_health_status_0,physical_health_status_1-13,physical_health_status_14+,physical_health_status_Unknown,difficulty_walking_Don’t know,difficulty_walking_No,difficulty_walking_Refused,difficulty_walking_Yes,gender_Female,gender_Male,age_18 to 24,age_25 to 29,age_30 to 34,age_35 to 39,age_40 to 44,age_45 to 49,age_50 to 54,age_55 to 59,age_60 to 64,age_65 to 69,age_70 to 74,age_75 to 79,age_80 or older,age_Don’t know,education_Graduated College,education_High School,education_Middle School,education_Not sure,education_Some College,"income_$100,000 to < $200,000","income_$15,000 to < $25,000","income_$25,000 to < $35,000","income_$35,000 to < $50,000","income_$50,000 to < $100,000","income_< $15,000","income_> $200,000",income_Don’t know,race_American Indian or Alaskan Native,race_Asian,race_Black or African American,race_Don’t know,race_Multiracial,race_Native Hawaiian or other Pacific Islander,race_No race,race_Refused,race_White,routine_checkup_5 or more years,routine_checkup_Don't know,routine_checkup_Last 2 Years,routine_checkup_Last 5 Years,routine_checkup_Never,routine_checkup_Refused,routine_checkup_Within Last Year,sleep_time_1.0,sleep_time_10.0,sleep_time_11.0,sleep_time_12.0,sleep_time_13.0,sleep_time_14.0,sleep_time_15.0,sleep_time_16.0,sleep_time_17.0,sleep_time_18.0,sleep_time_19.0,sleep_time_2.0,sleep_time_20.0,sleep_time_21.0,sleep_time_22.0,sleep_time_23.0,sleep_time_24.0,sleep_time_3.0,sleep_time_4.0,sleep_time_5.0,sleep_time_6.0,sleep_time_7.0,sleep_time_8.0,sleep_time_9.0,sleep_time_Don't Know,sleep_time_Refused,diabetes
count,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0,428322.0
mean,0.137705,0.235612,0.414837,0.336793,0.012757,0.064064,0.042013,0.571297,0.322626,0.935107,6.8e-05,0.002694,0.062131,0.117409,0.008368,0.044277,0.829946,0.00197,0.285895,0.712135,0.014977,0.053609,0.931414,0.916829,0.000579,0.002227,0.080365,0.00029,0.31657,0.003871,0.06963,0.37465,0.001347,0.038625,0.03905,0.023326,0.074386,0.011891,0.028992,0.017372,0.00183,0.113291,0.186486,0.343821,0.066548,0.000628,0.287396,0.596981,0.238669,0.145157,0.019194,0.551711,0.252245,0.171504,0.02454,0.00289,0.767932,0.000392,0.228786,0.520242,0.479758,0.036596,0.031131,0.03926,0.048223,0.055314,0.058374,0.075427,0.090609,0.113636,0.128378,0.124448,0.094812,0.094219,0.009572,0.395777,0.251,0.063973,0.002281,0.286969,0.156772,0.096488,0.109658,0.11641,0.253905,0.061244,0.045211,0.160312,0.025383,0.028903,0.101421,0.009164,0.003507,0.010609,0.017022,0.01454,0.78945,0.029585,0.008171,0.072646,0.04032,0.003544,0.000738,0.844995,0.002197,0.028432,0.001812,0.008342,0.000453,0.000761,0.000869,0.000892,8.6e-05,0.000486,6.8e-05,0.003455,0.000353,2e-06,4.4e-05,2.3e-05,9.6e-05,0.008585,0.03171,0.072873,0.217535,0.278725,0.279724,0.050653,0.010859,0.000967,0.5
std,0.34459,0.424381,0.492695,0.472614,0.112223,0.244867,0.200619,0.494891,0.467482,0.246337,0.008228,0.051836,0.241393,0.321908,0.091091,0.205711,0.375681,0.044346,0.45184,0.452768,0.121461,0.225245,0.25275,0.276141,0.024056,0.047142,0.271857,0.017012,0.465139,0.062096,0.254522,0.484033,0.036678,0.1927,0.193714,0.150937,0.262398,0.108394,0.167785,0.130655,0.042744,0.316948,0.389499,0.474983,0.249238,0.025053,0.452548,0.490505,0.42627,0.35226,0.137205,0.497319,0.434302,0.376949,0.154718,0.053684,0.422153,0.019801,0.420052,0.499591,0.499591,0.187769,0.173671,0.194214,0.214238,0.228591,0.23445,0.264079,0.287053,0.31737,0.33451,0.330093,0.292955,0.292133,0.097369,0.489018,0.433589,0.244705,0.047705,0.452347,0.363586,0.29526,0.312464,0.320716,0.435244,0.239777,0.207768,0.366895,0.157285,0.167535,0.301886,0.095288,0.059114,0.102452,0.129354,0.119704,0.407699,0.16944,0.090026,0.259555,0.196709,0.059427,0.027152,0.36191,0.04682,0.166203,0.042526,0.090952,0.021277,0.027578,0.029458,0.029851,0.009294,0.022031,0.008228,0.058681,0.018773,0.001528,0.00666,0.004832,0.009783,0.092255,0.175227,0.259928,0.41257,0.448372,0.448864,0.219289,0.103638,0.031075,0.500001
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [133]:
# Form np arrays of labels and features.
train_labels = np.array(sampled_data['diabetes'])
test_labels = np.array(test_df['diabetes'])

In [134]:
train_features = np.array(sampled_data.drop(columns='diabetes'))
test_features = np.array(test_df.drop(columns='diabetes'))

In [135]:
print(f'Average class probability in training set:   {train_labels.mean():.4f}')
print(f'Average class probability in test set:       {test_labels.mean():.4f}')

Average class probability in training set:   0.5000
Average class probability in test set:       0.1708


In [101]:
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

train_features = np.clip(train_features, -5, 5)
val_features = np.clip(val_features, -5, 5)
test_features = np.clip(test_features, -5, 5)


print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

Training labels shape: (207320,)
Validation labels shape: (51831,)
Test labels shape: (64788,)
Training features shape: (207320, 130)
Validation features shape: (51831, 130)
Test features shape: (64788, 130)


In [136]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)


In [137]:
layer_sizes = [2000]
for layer_size in layer_sizes:
    feature_size = train_features.shape[-1]
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(layer_size, activation='relu',input_shape=(train_features.shape[-1],)),
        keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1000, activation='relu'),
        keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(500, activation='relu'),
        keras.layers.Dropout(0.1),

        tf.keras.layers.Dense(100, activation='relu'),
        keras.layers.Dropout(0.1),


        # Classification  of 3 
        # #tf.keras.layers.Dense(3, activation='softmax')  
        
        #Binary
        tf.keras.layers.Dense(1,activation="sigmoid")  
    ])
    model.summary()

    model.compile(optimizer='adam',
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                metrics=['Recall']
                )

    model.fit(train_features, train_labels, callbacks=[callback], epochs=20)

    #test_loss, test_acc = model.evaluate(test_features,  test_labels, verbose=2)
    predictions = model.predict(test_features)
    
    predicted = np.array([1 if x >= 0.5 else 0 for x in predictions])

    recall_score(test_labels, predicted)

    

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_35 (Dense)            (None, 2000)              262000    
                                                                 
 dropout_28 (Dropout)        (None, 2000)              0         
                                                                 
 dense_36 (Dense)            (None, 1000)              2001000   
                                                                 
 dropout_29 (Dropout)        (None, 1000)              0         
                                                                 
 dense_37 (Dense)            (None, 500)               500500    
                                                                 
 dropout_30 (Dropout)        (None, 500)               0         
                                                                 
 dense_38 (Dense)            (None, 100)              

In [140]:
recall_score(test_labels, predicted)

0.48025305015815634

In [146]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_labels, predicted)

array([[44147,  9576],
       [ 5751,  5314]], dtype=int64)