In [1]:
import pandas as pd 
import numpy as np
import optuna
from ml_optfit.ml_optfit import HyperOptim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
# Read the raw dataset from the working directory and preview the first two
# rows as a quick sanity check of the columns.
df_data = pd.read_csv(filepath_or_buffer='diabetes_prediction_dataset.csv')
df_data.head(n=2)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0


In [2]:
# Fixed seed so both splits (and everything fitted on them afterwards) come
# out the same on every Restart & Run All.
SEED = 42

# Composite key built from the binary risk factors, the smoking history and
# the target; it is passed to `stratify` so the splits are stratified on the
# combination of these columns rather than on the target alone.
df_data['stratification_key'] = (
    df_data['hypertension'].astype(str)
    + '-' + df_data['heart_disease'].astype(str)
    + '-' + df_data['smoking_history'].astype(str)
    + '-' + df_data['diabetes'].astype(str)
)

# First carve out 70,000 training rows, then take 20,000 validation rows from
# the remainder; whatever is left becomes the test split.
train, holdout = train_test_split(
    df_data,
    train_size=70000,
    stratify=df_data['stratification_key'],
    random_state=SEED,
)
valid, test = train_test_split(
    holdout,
    train_size=20000,
    stratify=holdout['stratification_key'],
    random_state=SEED,
)

# Report the size and the positive-class rate of every split.
for split_name, split_df in (('Train', train), ('Valid', valid), ('Test', test)):
    diabetes_pct = round(100 * split_df.diabetes.sum() / split_df.shape[0], 2)
    print(f'{split_name} Size:', split_df.shape[0], '--- Diabetes Frequency:', f'{diabetes_pct}%')

Train Size: 70000 --- Diabetes Frequency: 8.5%
Valid Size: 20000 --- Diabetes Frequency: 8.49%
Test Size: 10000 --- Diabetes Frequency: 8.51%


In [3]:
# Integer-encode the two categorical columns. Each encoder is fitted on the
# training split only and then reused as-is on the validation and test splits.
gender_encoder = LabelEncoder()
smoking_history_encoder = LabelEncoder()

for col_name, col_encoder in (('gender', gender_encoder),
                              ('smoking_history', smoking_history_encoder)):
    train[col_name] = col_encoder.fit_transform(train[col_name])
    for frame in (valid, test):
        frame[col_name] = col_encoder.transform(frame[col_name])

In [4]:
# Model input columns, in the order they are handed to the tuner.
features = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'smoking_history',
    'HbA1c_level',
    'blood_glucose_level',
    'bmi',
]

# Name of the label column, kept as a one-element list.
target = ['diabetes']

In [5]:
# Search space for the random forest. Each entry maps a hyperparameter name to
# the spec given to HyperOptim: a 'type' plus either the candidate 'values' or
# a 'low'/'high' range with its 'log' and 'step' settings.
forest_hyper_dict = dict(
    class_weight=dict(
        type='class',
        values=['balanced', 'balanced_subsample', None],
    ),
    n_estimators=dict(type='int', low=100, high=600, log=False, step=100),
    min_impurity_decrease=dict(type='float', low=0, high=0.1, log=False, step=0.01),
)

# Tuner configured with the train/validation splits, the feature columns, the
# 'diabetes' label, f1_score as the evaluation function and a 'maximize'
# direction.
hyperopt = HyperOptim(
    direction='maximize',
    train=train,
    valid=valid,
    features=features,
    target='diabetes',
    evaluation_func=f1_score,
)

# Run 30 trials for RandomForestClassifier; the call hands back the study and
# the best hyperparameters it found.
study, best_hyper = hyperopt.optimize_model(
    model_type=RandomForestClassifier,
    study_name='randomforest',
    hyperparam_dict=forest_hyper_dict,
    multivariate=False,
    n_trials=30,
)

Best trial: 1. Best value: 0.816921: 100%|██████████| 30/30 [01:51<00:00,  3.70s/it]


In [6]:
# Display the study object together with the best hyperparameters returned by
# optimize_model.
study,best_hyper

(<optuna.study.study.Study at 0x18de0112f00>,
 {'class_weight': 'balanced',
  'n_estimators': 200,
  'min_impurity_decrease': 0.01,
  'best_threshold': 0.686868686868687})

In [7]:
# Plot the hyperparameter importances Optuna computes for this study.
optuna.visualization.plot_param_importances(study)

In [8]:
# Plot the optimization history of all trials in the study.
optuna.visualization.plot_optimization_history(study)