In [None]:
# default_exp model_iterator

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Iterate over classifiers with Bayesian optimization

In [None]:
# exporti
import random
import numpy as np
import pandas as pd
from fastcore.test import *
from sklearn.ensemble import RandomForestClassifier
from Yikai_helper_funcs import * 
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_multilabel_classification, make_regression
from sklearn.metrics import make_scorer, roc_auc_score
from fastai.data.transforms import get_files
from pathlib import Path

# from fastcore.basics import store_attr  # has bugs currently

from Yikai_helper_funcs.params import *

In [None]:
def get_data(n_samples=100, n_features=45, random_state=None):
    """Synthetic binary classification dataset.

    Params:
        n_samples (int): number of rows to generate (default 100).
        n_features (int): number of feature columns to generate (default 45).
        random_state: forwarded to `make_classification`; pass an int to get the
            same dataset on every run. The default (None) keeps the previous
            unseeded behaviour.

    Returns:
        tuple: `(data, targets)` exactly as returned by `make_classification`.
    """
    data, targets = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state)
    return data, targets


# Sample features/labels shared by the cells below (passed to ModelIterator).
x, y = get_data()

In [None]:
# export 
def _compile_results(root_dir):
    '''
    Compile the results of different models stored under a root directory into one DataFrame.

    Results are stored in .json files, where each .json file corresponds to one call of
    an optimizer. Every file is read as line-delimited JSON and its rows are tagged with
    a `model` column holding the name of the directory that contains the file.

    Params:
        root_dir (str or Path): The root directory where all the .json files are stored.

    Returns:
        pd.DataFrame: the rows of every log file found by `get_files`. When more than one
        file is found the rows are sorted by `target` in descending order; a single file
        is returned in the order it was read.

    Raises:
        FileNotFoundError: if no .json file is found under `root_dir`.
    '''
    frames = []
    for file in get_files(root_dir, extensions= ".json"):
        parent = Path(file).parent
        # `.name` is separator-agnostic (the previous split("/") broke on Windows);
        # fall back to the textual parent when it has no name component (e.g. ".").
        model_name = parent.name or str(parent)
        df = pd.read_json(file, lines=True)
        df['model'] = model_name
        frames.append(df)

    if not frames:
        raise FileNotFoundError(f"No .json log files found under {root_dir}")

    if len(frames) == 1:
        return frames[0]

    # Concatenate ALL files at once: the earlier version rebuilt the result from the
    # first file plus the current one, dropping every file in between.
    return pd.concat(frames, ignore_index = True).sort_values('target', ascending= False)


In [None]:
# Preview previously logged runs, if any. The frame has to be the cell's last
# top-level expression: IPython does not render a bare name nested inside `if`,
# so the earlier form never displayed anything.
log_dir = Path('./bayes_opt_logs/')  # './bayes_opt_logs/' as a plain str works fine as well.
df = _compile_results(log_dir) if log_dir.exists() else None
df

# Iterate models

In [None]:
# export
class ModelIterator:
    """Run Bayesian hyperparameter searches over several classifiers on one dataset.

    Each `_run_*` method builds a search space with the matching parameter generator,
    wraps a model factory with `optimize_bayes_param`, and logs under a sub-directory
    of `log_path`.

    Params:
        x, y: training data, forwarded to `optimize_bayes_param(X=x, y=y)`.
        rf_params, xgboost_params, lightgbm_params (dict, optional): keyword arguments
            forwarded to the matching parameter generator to override its default ranges.
        log_path (str or Path): root directory for the optimizer logs.
        rf_init_points, xgboost_init_points, lightgbm_init_points (int): forwarded as
            `init_points` to the optimizer of the matching model.
        rf_n_iter, xgboost_n_iter, lightgbm_n_iter (int): forwarded as `n_iter` to the
            optimizer of the matching model.
    """

    def __init__(self, x, y, *, rf_params = None, xgboost_params = None, 
                 lightgbm_params = None, log_path = Path("./bayes_opt_logs"),
                rf_init_points = 10, rf_n_iter = 5,
                xgboost_init_points = 10, xgboost_n_iter = 5, 
                lightgbm_init_points = 10, lightgbm_n_iter = 5):
        # Keep the data on the instance so fit_predict does not depend on
        # notebook-level globals that happen to be called x and y.
        self.x = x
        self.y = y
        self.log_path = Path(log_path)
        # Always define the three override dicts; previously they were only set when
        # the argument was empty, so passing real overrides raised AttributeError later.
        self.rf_params = dict(rf_params) if rf_params else {}
        self.lightgbm_params = dict(lightgbm_params) if lightgbm_params else {}
        self.xgboost_params = dict(xgboost_params) if xgboost_params else {}
        self.rf_init_points = rf_init_points
        self.rf_n_iter = rf_n_iter
        self.xgboost_init_points = xgboost_init_points
        self.xgboost_n_iter = xgboost_n_iter
        self.lightgbm_init_points = lightgbm_init_points
        self.lightgbm_n_iter = lightgbm_n_iter

    # TODO: Add try except blocks
    def _run_rf(self, x, y, **kwargs_model ):
        """Run the random forest search and log it under self.log_path/"forest".

        Params:
            x, y: training data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default generated from 
                        RFParamGenerator().matrix_generation()

        Returns:
            Whatever the decorated optimizer returns (the best random forest found).
        """
        params_forest = RFParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_forest(n_estimators, min_samples_split, max_depth , ccp_alpha):
            # The optimizer proposes floats; integer-valued hyperparameters are truncated.
            return RandomForestClassifier(n_estimators=int(n_estimators), min_samples_split=int(min_samples_split),  
                                          max_depth = int(max_depth), ccp_alpha = float(ccp_alpha), n_jobs=-1) 

        best_rf = optimize_forest(init_points=self.rf_init_points, n_iter=self.rf_n_iter,
                                  pbounds=params_forest, log_dir= self.log_path/"forest")
        return best_rf

    def _run_xgboost(self, x, y, **kwargs_model ):
        """Run the XGBoost search and log it under self.log_path/"xgboost".

        Params:
            x, y: training data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default generated from 
                        XgboostParamGenerator().matrix_generation()

        Returns:
            Whatever the decorated optimizer returns (the best XGBoost model found).
        """
        # Forward the overrides like _run_rf does (they used to be silently ignored).
        # NOTE(review): assumes XgboostParamGenerator accepts the same kind of keyword
        # overrides as RFParamGenerator - confirm against Yikai_helper_funcs.params.
        params_xgboost = XgboostParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_xgboost(n_estimators, max_depth, min_child_weight, gamma,learning_rate, subsample):
            return XGBClassifier(n_estimators= int(n_estimators), max_depth = int(max_depth), 
            min_child_weight = min_child_weight , gamma = gamma, learning_rate = learning_rate,
            subsample = subsample,
            n_jobs=-1)

        best_xgboost = optimize_xgboost(init_points=self.xgboost_init_points, n_iter=self.xgboost_n_iter,
                                        pbounds=params_xgboost, log_dir=self.log_path/"xgboost")
        return best_xgboost

    def _run_lightgbm(self, x, y, **kwargs_model):
        """Run the LightGBM search and log it under self.log_path/"lgbm".

        Params:
            x, y: training data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default generated from 
                        LgbmParamGenerator().matrix_generation()

        Returns:
            Whatever the decorated optimizer returns (the best LightGBM model found).
        """
        # NOTE(review): assumes LgbmParamGenerator accepts keyword overrides like
        # RFParamGenerator - confirm against Yikai_helper_funcs.params.
        params_lgbm = LgbmParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_lgbm(num_leaves: int,learning_rate:float, min_child_samples, reg_alpha, reg_lambda, colsample_bytree):
            return LGBMClassifier(
                num_leaves=int(num_leaves),
                learning_rate=float(learning_rate),
                min_child_samples=int(min_child_samples),
                reg_alpha=float(reg_alpha),
                reg_lambda=float(reg_lambda),
                colsample_bytree=float(colsample_bytree),
            )

        # Use the constructor settings instead of the former hard-coded 5 / 10.
        best_lightgbm = optimize_lgbm(init_points=self.lightgbm_init_points, n_iter=self.lightgbm_n_iter,
                                      pbounds= params_lgbm, log_dir= self.log_path/"lgbm")
        return best_lightgbm

    def fit_predict(self, compile_results = True):
        """Run the random forest and XGBoost searches on the stored data.

        Params:
            compile_results: currently unused; kept so existing callers keep working.

        Returns:
            tuple: `(best_rf, best_xgboost)`.
        """
        best_rf = self._run_rf(self.x, self.y, **self.rf_params)
        best_xgboost = self._run_xgboost(self.x, self.y, **self.xgboost_params)
        # LightGBM is intentionally left out of the default run for now;
        # _run_lightgbm(self.x, self.y, **self.lightgbm_params) can be called directly.

        print("""
        ----------------------------------------------------
        Returned best_rf, best_xgboost
        
        """)
        return best_rf, best_xgboost

    def __call__(self):
        """Shortcut for `fit_predict()`."""
        return self.fit_predict()

    def compile_results(self):
        """Return the table built by `_compile_results` from the logs under self.log_path."""
        return _compile_results(self.log_path)
        

In [None]:
# Build the iterator with the default search settings and the default log path.
iterator = ModelIterator(x, y)

In [None]:
# Calling the iterator runs fit_predict(): a is best_rf, b is best_xgboost.
a,b = iterator()

The best combination of hyperparameters are {'ccp_alpha': 0.1, 'max_depth': 70.57988694377921, 'min_samples_split': 2.0, 'n_estimators': 20.0}
The best score for the hyperparameters are 0.85
The best combination of hyperparameters are {'gamma': 3.7136776609260598, 'learning_rate': 0.2167961016370373, 'max_depth': 7.539356769634978, 'min_child_weight': 0.5068658104168189, 'n_estimators': 33.88882170271247, 'subsample': 0.6212943010806828}
The best score for the hyperparameters are 0.8300000000000001

        ----------------------------------------------------
        Returned best_rf, best_xgboost
        
        


In [None]:
# Sanity check: the first returned model is a random forest classifier.
assert a.__class__.__name__ == "RandomForestClassifier"

In [None]:
# Gather the logged trials found under iterator.log_path into one table.
iterator.compile_results()

Unnamed: 0,target,params,datetime,model
14,0.85,"{'ccp_alpha': 0.1, 'max_depth': 70.57988694377921, 'min_samples_split': 2.0, 'n_estimators': 20.0}","{'datetime': '2020-10-30 11:19:04', 'elapsed': 11.703147, 'delta': 0.25911}",forest
10,0.84,"{'ccp_alpha': 0.054105622366733004, 'max_depth': 57.048673161898094, 'min_samples_split': 48.10446300913914, 'n_estimators': 279.32115719676585}","{'datetime': '2020-10-30 11:19:01', 'elapsed': 9.090092, 'delta': 1.138335}",forest
27,0.83,"{'gamma': 3.7136776609260593, 'learning_rate': 0.21679610163703703, 'max_depth': 7.539356769634978, 'min_child_weight': 0.506865810416818, 'n_estimators': 33.88882170271247, 'subsample': 0.6212943010806821}","{'datetime': '2020-10-30 11:19:05', 'elapsed': 1.236437, 'delta': 0.280117}",xgboost
8,0.83,"{'ccp_alpha': 0.034467543507090005, 'max_depth': 58.94063876431647, 'min_samples_split': 46.35661028585887, 'n_estimators': 282.45330643881437}","{'datetime': '2020-10-30 11:19:00', 'elapsed': 7.730209, 'delta': 1.025486}",forest
13,0.83,"{'ccp_alpha': 0.052129562762187005, 'max_depth': 105.0, 'min_samples_split': 2.0, 'n_estimators': 111.6100155243519}","{'datetime': '2020-10-30 11:19:03', 'elapsed': 11.444037, 'delta': 0.6602800000000001}",forest
4,0.82,"{'ccp_alpha': 0.079444386496789, 'max_depth': 89.81701419845686, 'min_samples_split': 40.04511099725938, 'n_estimators': 107.83760330551017}","{'datetime': '2020-10-30 11:18:56', 'elapsed': 3.904091, 'delta': 0.485176}",forest
11,0.82,"{'ccp_alpha': 0.090688706631161, 'max_depth': 81.67031636664547, 'min_samples_split': 30.579345722625956, 'n_estimators': 79.8789446662902}","{'datetime': '2020-10-30 11:19:01', 'elapsed': 9.620502, 'delta': 0.53041}",forest
17,0.81,"{'gamma': 9.477275494910353, 'learning_rate': 0.15320101738702502, 'max_depth': 4.863992233552129, 'min_child_weight': 0.40526383582332304, 'n_estimators': 11.0840109699008, 'subsample': 0.469092172621336}","{'datetime': '2020-10-30 11:19:04', 'elapsed': 0.12967099999999998, 'delta': 0.032591999999999996}",xgboost
23,0.81,"{'gamma': 11.682946644747425, 'learning_rate': 0.285787689887124, 'max_depth': 5.964892778826315, 'min_child_weight': 0.59792575807783, 'n_estimators': 7.273720969528306, 'subsample': 0.491952710919951}","{'datetime': '2020-10-30 11:19:04', 'elapsed': 0.37557799999999997, 'delta': 0.022227}",xgboost
20,0.81,"{'gamma': 2.356996174242245, 'learning_rate': 0.285225525983594, 'max_depth': 9.521255624586537, 'min_child_weight': 0.550847343570219, 'n_estimators': 39.682297752391584, 'subsample': 0.7604369695630081}","{'datetime': '2020-10-30 11:19:04', 'elapsed': 0.301013, 'delta': 0.07479000000000001}",xgboost


In [None]:
# Check that the best RF model matches the top "forest" row of the table above.
# Integer-valued hyperparameters are truncated with int() when the model is built,
# so e.g. a logged max_depth of 70.58 shows up here as 70.
a.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.1,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 70,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 20,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}