In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before
# each cell executes, so edits to local modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from Impute_Transformer import Imputer

In [3]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [4]:
# Read the feature table (CSV) and the target array (NumPy binary) from disk.
features_file = "data/X_large.csv"
target_file = "data/y_new.npy"

X = pd.read_csv(features_file)
y = np.load(target_file)

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# Names of the categorical variables; this list is handed to the Imputer so it
# knows which columns to treat as categorical.
imp_cat_cols = [
    'physical_activity',
    # platelets
    'platelets_isNormal',
    'platelets_isIncreased',
    'platelets_isDecreased',
    # urine albumin
    'urine_albumin_isNegative',
    'urine_albumin_is>=30',
    'urine_albumin_is>=100',
    'urine_albumin_is>=300',
    'urine_albumin_is>=1000',
    'urine_albumin_isTrace',
    # urine glucose
    'urine_glucose_isNegative',
    'urine_glucose_isLight',
    'urine_glucose_isMedium',
    'urine_glucose_isDark',
    'urine_glucose_isVerydark',
    'urine_glucose_isTrace',
    # urine hematest
    'urine_hematest_isNegative',
    'urine_hematest_isSmall',
    'urine_hematest_isModerate',
    'urine_hematest_isLarge',
    'urine_hematest_isVerylarge',
    'urine_hematest_isTrace',
]

In [7]:
# Imputation step: project-local transformer, given the categorical column names.
imputer = Imputer(imp_cat_cols)

# Random-forest classifier using the Gini criterion, seeded for reproducibility.
rf = RandomForestClassifier(criterion='gini', random_state=42)

# Chain imputation and classifier so both are fitted together inside each CV fold.
pipe_rf = make_pipeline(imputer, rf)

# Search space for the Bayesian optimiser. Keys are prefixed with the step name
# that make_pipeline derives from the estimator class ('randomforestclassifier').
param_search_rf = {
    'randomforestclassifier__n_estimators': Integer(200, 800),
    'randomforestclassifier__max_depth': Integer(2, 10),
    'randomforestclassifier__max_features': Real(0.1, 0.4),
}

# Shuffled, seeded 5-fold stratified cross-validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Bayesian hyper-parameter search: 50 sampled settings, 'EI' acquisition
# function, scored by negative log-loss, parallelised over all available cores,
# with the best setting refit at the end.
grid_rf = BayesSearchCV(
    estimator=pipe_rf,
    search_spaces=param_search_rf,
    n_iter=50,
    optimizer_kwargs={'acq_func': 'EI'},
    scoring='neg_log_loss',
    n_jobs=-1,
    refit=True,
    cv=skf,
    random_state=42,
)

In [8]:
# Run the Bayesian hyper-parameter search on the training split only; the test
# split is not passed in here. With n_iter = 50 and 5 CV folds this is the
# expensive cell of the notebook. Because refit = True, the search object is
# refit with the best setting found once the search finishes.
grid_rf.fit(X_train, y_train)

BayesSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
              estimator=Pipeline(steps=[('imputer',
                                         Imputer(categorical_features=['physical_activity',
                                                                       'platelets_isNormal',
                                                                       'platelets_isIncreased',
                                                                       'platelets_isDecreased',
                                                                       'urine_albumin_isNegative',
                                                                       'urine_albumin_is>=30',
                                                                       'urine_albumin_is>=100',
                                                                       'urine_albumin_is>=300',
                                                                       'urine_albumin_is>=100...
           

In [10]:
# Persist the fitted search object so it can be reloaded without re-running the
# search. The target directory is created first: joblib.dump fails when the
# directory is missing, which would discard the result of the long-running fit.
# NOTE(review): the recorded output of this cell shows 'Results_NEW/RF_grid.pkl',
# which does not match the path below - confirm which directory is intended.
results_path = 'Results/RF_grid.pkl'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
joblib.dump(grid_rf, results_path)

['Results_NEW/RF_grid.pkl']