In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import optuna
from ml_optfit.ml_optfit import HyperOptimNN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
# Load the raw diabetes dataset from the working directory and preview the
# first two rows to confirm the columns were parsed as expected.
csv_path = 'diabetes_prediction_dataset.csv'
df_data = pd.read_csv(csv_path)
df_data.head(n=2)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0


In [2]:
# Fixed seed: without it every kernel restart produces different splits, so
# none of the numbers below (split statistics, tuned model score) can be
# reproduced.
SPLIT_SEED = 42

# Composite key joining the three categorical risk factors and the label, so
# that each split preserves their joint distribution, not just the label rate.
df_data['stratification_key'] = (
    df_data['hypertension'].astype(str)
    + '-' + df_data['heart_disease'].astype(str)
    + '-' + df_data['smoking_history'].astype(str)
    + '-' + df_data['diabetes'].astype(str)
)

# 70% of the rows go to train; the remaining 30% is split 66.7% / 33.3%,
# i.e. roughly 20% validation and 10% test of the full dataset.
train, test = train_test_split(
    df_data,
    train_size=0.7,
    stratify=df_data['stratification_key'],
    random_state=SPLIT_SEED,
)
valid, test = train_test_split(
    test,
    train_size=0.667,
    stratify=test['stratification_key'],
    random_state=SPLIT_SEED,
)

# Report size and positive-class rate of each split; the rates should be
# nearly identical because the label is part of the stratification key.
for split_name, split in (('Train', train), ('Valid', valid), ('Test', test)):
    n_rows = split.shape[0]
    diabetes_pct = round(100*split.diabetes.sum()/n_rows, 2)
    print(f'{split_name} Size:', n_rows, '--- Diabetes Frequency:', f'{diabetes_pct}%')

Train Size: 70000 --- Diabetes Frequency: 8.5%
Valid Size: 20010 --- Diabetes Frequency: 8.49%
Test Size: 9990 --- Diabetes Frequency: 8.51%


In [3]:
def _label_encode_splits(column):
    """Integer-encode one categorical column in train, valid and test.

    The encoder is fitted on the training split only and then applied to all
    three splits, overwriting the column in place. The fitted encoder is
    returned so the original labels can be recovered later.

    Note: LabelEncoder.transform raises ValueError for a label that did not
    occur in the training split.
    """
    encoder = LabelEncoder()
    encoder.fit(train[column])
    for split in (train, valid, test):
        split[column] = encoder.transform(split[column])
    return encoder


gender_encoder = _label_encode_splits('gender')
smoking_history_encoder = _label_encode_splits('smoking_history')

In [4]:
# Model inputs, in the column order used to build the feature matrix.
features = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'smoking_history',
    'HbA1c_level',
    'blood_glucose_level',
    'bmi',
]

# Kept as a one-element list so that frame[target] stays two-dimensional.
target = [
    'diabetes',
]

In [5]:
def _feature_and_target_datasets(frame):
    """Build the per-row feature and target tf.data datasets for one split.

    Both column groups are converted to float32 NumPy arrays first, so each
    dataset element is one row: a vector of len(features) values and a
    vector of len(target) values respectively.
    """
    feature_matrix = frame[features].to_numpy(dtype=np.float32)
    target_matrix = frame[target].to_numpy(dtype=np.float32)
    return (
        tf.data.Dataset.from_tensor_slices(feature_matrix),
        tf.data.Dataset.from_tensor_slices(target_matrix),
    )


# Zip features with targets so every element is a (features, target) pair.
train_features, train_target = _feature_and_target_datasets(train)
train_df = tf.data.Dataset.zip((train_features, train_target))

valid_features, valid_target = _feature_and_target_datasets(valid)
valid_df = tf.data.Dataset.zip((valid_features, valid_target))

In [6]:
# Batched pipelines handed to the optimiser. Training and validation are both
# shuffled through a 1000-element buffer and grouped into batches of 500.
train_batches = train_df.shuffle(buffer_size=1000).batch(500)
valid_batches_shuffled = valid_df.shuffle(buffer_size=1000).batch(500)

# Ground-truth labels in the row order of `valid`, plus an unshuffled batched
# copy of the validation set in that same order.
# NOTE(review): presumably the unshuffled copy is what gets predicted on so the
# predictions line up with y_valid for evaluation_func - confirm in ml_optfit.
valid_labels = valid[target].to_numpy(dtype=np.float32)
valid_batches_ordered = valid_df.batch(500)

opt_nn = HyperOptimNN(
    direction='maximize',
    train=train_batches,
    valid=valid_batches_shuffled,
    y_valid=valid_labels,
    unshuffled_valid=valid_batches_ordered,
    evaluation_func=f1_score,
    loss_func='binary_crossentropy',
    epochs=100,
)

In [7]:
# Sanity check of the training feature matrix: (rows, feature columns). The
# column count has to match the `input_shape` given in the search space.
train.loc[:, features].shape

(70000, 8)

In [8]:
def _layer_search_space():
    """Return a fresh search-space dict shared by every network section.

    A new dict is built on each call so the input, common and output
    sections never share mutable state.

    NOTE(review): the recorded best trial reports unit counts of 4 and 32
    for bounds 2..5, so the library appears to treat the sampled `units`
    integer as a power-of-two exponent - confirm in the ml_optfit docs.
    """
    return {
        'n_hidden_layers': {'type': 'int', 'low': 1, 'high': 3},
        'units': {'type': 'int', 'low': 2, 'high': 5},
        'activation': {'type': 'class', 'vals': ['relu', 'tanh', 'selu']},
        'dropouts': {'type': 'float', 'low': 0.01, 'high': 0.9},
    }


# The input width is derived from the feature list instead of being
# hard-coded, so it cannot drift out of sync when `features` is edited.
input_hyper = {'input_1': {
            'input_shape': (len(features),),
            **_layer_search_space(),
            }}
common_hyper = {'common': _layer_search_space()}
# Single output unit for the binary diabetes label.
output_hyper = {'output_1': {
            'n_outputs': 1,
            **_layer_search_space(),
            }}


# Run the hyper-parameter search. The recorded run of 30 trials took roughly
# nine and a half minutes.
study, best_hyper, best_model = opt_nn.optimize_nn(input_hyper=input_hyper,
                           common_hyper=common_hyper,
                           output_hyper=output_hyper,
                           study_name='TF Test',
                           n_trials=30,
                           multivariate=False)

Best trial: 10. Best value: 0.743374: 100%|██████████| 30/30 [09:30<00:00, 19.01s/it] 


In [9]:
# Show the three optimisation artefacts together as one tuple: the Optuna
# study, the best hyper-parameter set, and the corresponding model.
(
    study,
    best_hyper,
    best_model,
)

(<optuna.study.study.Study at 0x1ed5b646630>,
 defaultdict(collections.defaultdict,
             {'input_1': defaultdict(None,
                          {'input_shape': (8,),
                           'n_hidden_layers': 1,
                           'units': [32],
                           'activation': ['relu'],
                           'dropouts': [0.7288125046701953]}),
              'common': defaultdict(None,
                          {'n_hidden_layers': 1,
                           'units': [4],
                           'activation': ['relu'],
                           'dropouts': [0.5808562606830822]}),
              'output_1': defaultdict(None,
                          {'n_outputs': 1,
                           'n_hidden_layers': 2,
                           'units': [4, 4],
                           'activation': ['tanh', 'tanh'],
                           'dropouts': [0.360157122783144,
                            0.809005692237188]}),
              'best_thresh

In [11]:
# Print the architecture summary of the best model returned by the search
# (presumably a Keras model, given the TensorFlow pipeline - confirm).
best_model.summary()