In [None]:
# default_exp model_iterator

In [None]:
%load_ext autoreload
%autoreload 2

# Iterate over classifiers with Bayesian optimization

In [None]:
# exporti
import random
import numpy as np
import pandas as pd
from fastcore.test import *
from sklearn.ensemble import RandomForestClassifier
from Yikai_helper_funcs import * 
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_multilabel_classification, make_regression
from sklearn.metrics import make_scorer, roc_auc_score
from fastai.data.transforms import get_files
from pathlib import Path

# from fastcore.basics import store_attr  # has bugs currently

from Yikai_helper_funcs.params import *

In [None]:
def get_data(n_samples=100, n_features=45, random_state=None):
    """Synthetic binary classification dataset.

    Thin wrapper around `sklearn.datasets.make_classification`.

    Params:
        n_samples (int): number of rows to generate (default 100).
        n_features (int): number of feature columns to generate (default 45).
        random_state (int or None): seed forwarded to `make_classification`.
            The default `None` keeps the original, non-reproducible behaviour;
            pass an integer to get the same dataset on every run.

    Returns:
        (data, targets): the feature matrix and the label vector, exactly as
        returned by `make_classification`.
    """
    data, targets = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state)
    return data, targets


x, y = get_data()

In [None]:
# export 
def _compile_results(root_dir):
    
    '''
    Compile the results of different models that are stored under a root directory.
    Results are stored in .json files (one JSON record per line), where each .json
    file corresponds to one call of an optimizer. The name of the directory that
    directly contains a file is used as the model name for every row of that file.
    
    Params:
        root_dir (str or Path ): The root directory where all the .json files were stored
    
    Returns:
        pd.DataFrame: the records of every .json file plus a 'model' column.
            * no file found   -> an empty DataFrame
            * exactly one file -> its rows, in file order
            * several files    -> all rows concatenated (fresh index) and then sorted by
                                  'target' in descending order
    '''
    # Scan the directory once; the listing is reused for every decision below.
    json_files = list(get_files(root_dir, extensions= ".json"))
    if not json_files:
        # Nothing has been logged yet: return an empty table instead of failing.
        return pd.DataFrame()

    frames = []
    for file in json_files:
        frame = pd.read_json(file, lines=True)
        # Name of the folder holding the file, independent of the OS path separator.
        frame['model'] = Path(file).parent.name
        frames.append(frame)

    if len(frames) == 1:
        return frames[0]

    # Concatenate ALL files at once; concatenating pairwise inside the loop would
    # keep only the first and the last file when there are more than two.
    return pd.concat(frames, ignore_index = True).sort_values('target', ascending= False)


In [None]:
# Only compile when an earlier optimisation run has left logs behind.
log_dir = Path('./bayes_opt_logs/')  # the plain string './bayes_opt_logs/' works fine as well.
df = _compile_results(log_dir) if log_dir.exists() else None
# Keep the frame as the last top-level expression of the cell so the notebook
# renders it; a bare expression nested inside an `if` block is never displayed.
df

# Iterate models

In [None]:
# export
class ModelIterator:
    """Run Bayesian hyper-parameter optimisation for several classifier families on one dataset.

    Each `_run_*` method builds the search bounds with the matching `*ParamGenerator`,
    wraps a model factory with `optimize_bayes_param` and writes the optimiser logs to a
    sub-directory of `log_path`. `compile_results` gathers those logs into one table.

    Params:
        x, y: features and labels handed to `optimize_bayes_param`.
        rf_params, xgboost_params, lightgbm_params (dict or None): keyword arguments
            forwarded to `RFParamGenerator`, `XgboostParamGenerator` and
            `LgbmParamGenerator` respectively to overwrite their default ranges.
            `None` (or an empty dict) means "use the generator defaults".
        log_path (str or Path): root directory of the optimiser logs.
        *_init_points, *_n_iter (int): values passed as `init_points` / `n_iter`
            to the optimiser of the corresponding model.
    """
    
    def __init__(self, x, y, *, rf_params = None, xgboost_params = None, 
                 lightgbm_params = None, log_path = Path("./bayes_opt_logs"),
                rf_init_points = 10, rf_n_iter = 5,
                xgboost_init_points = 10, xgboost_n_iter = 5, 
                lightgbm_init_points = 10, lightgbm_n_iter = 5):

        self.log_path = Path(log_path)
        # Always set the three attributes: fit_predict unpacks them with `**`, so they
        # must exist whether or not the caller supplied overrides. The dicts are copied
        # so later changes to the caller's objects do not leak into this instance.
        self.rf_params = dict(rf_params) if rf_params else {}
        self.lightgbm_params = dict(lightgbm_params) if lightgbm_params else {}
        self.xgboost_params = dict(xgboost_params) if xgboost_params else {} # These are passed in the fit_predict method
        self.rf_init_points = rf_init_points
        self.rf_n_iter = rf_n_iter
        self.xgboost_init_points = xgboost_init_points
        self.xgboost_n_iter = xgboost_n_iter
        self.lightgbm_init_points = lightgbm_init_points
        self.lightgbm_n_iter = lightgbm_n_iter
        self.x = x
        self.y = y

    
    # TODO: Add try except blocks
    def _run_rf(self, x, y, **kwargs_model ):
        """  Run RF and log the trials at self.log_path/"forest"
        
        Params:
            x, y: data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default generated from 
                        RFParamGenerator().matrix_generation()
        
        Returns:
            Whatever the decorated optimiser returns (the notebook output shows a
            RandomForestClassifier carrying the best parameters found).
        """ 
        params_forest = RFParamGenerator(**kwargs_model).matrix_generation()
        @optimize_bayes_param(X=x, y=y)
        def optimize_forest(n_estimators, min_samples_split, max_depth , ccp_alpha):
            # The optimiser proposes floats; the integer-valued settings are truncated with int().
            return RandomForestClassifier(n_estimators=int(n_estimators), min_samples_split=int(min_samples_split),  
                                          max_depth = int(max_depth), ccp_alpha = float(ccp_alpha), n_jobs=-1) 
        best_rf = optimize_forest(init_points=self.rf_init_points, n_iter=self.rf_n_iter, pbounds=params_forest, log_dir= self.log_path/"forest")
        return best_rf
    
    
    def _run_xgboost(self, x, y, **kwargs_model ):
        """  Run Xgboost and log the trials at self.log_path/"xgboost"
        
        Params:
            x, y: data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default generated from 
                        XgboostParamGenerator().matrix_generation()
        
        Returns:
            Whatever the decorated optimiser returns (presumably the best XGBClassifier,
            by analogy with `_run_rf`).
        """ 
        params_xgboost = XgboostParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_xgboost(n_estimators, max_depth, min_child_weight, gamma,learning_rate, subsample):
            # Only the two integer-valued settings are cast; the others are passed through as proposed.
            return XGBClassifier(n_estimators= int(n_estimators), max_depth = int(max_depth), 
            min_child_weight = min_child_weight , gamma = gamma, learning_rate = learning_rate,
            subsample = subsample,
            n_jobs=-1)

        best_xgboost = optimize_xgboost(init_points=self.xgboost_init_points, n_iter=self.xgboost_n_iter, pbounds=params_xgboost, log_dir=self.log_path/"xgboost")
        return best_xgboost
    
    def _run_lightgbm(self, x, y, **kwargs_model):
            
        """  Run Lightgbm and log the trials at self.log_path/"lgbm"
        
        Params:
            x, y: data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default generated from 
                        LgbmParamGenerator().matrix_generation()
        
        Returns:
            Whatever the decorated optimiser returns (presumably the best LGBMClassifier,
            by analogy with `_run_rf`).
            """

        params_lgbm = LgbmParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_lgbm(num_leaves: int,learning_rate:float, min_child_samples, reg_alpha, reg_lambda, colsample_bytree):
            return LGBMClassifier(  
                **{
                "num_leaves" : int(num_leaves),
                "learning_rate" : float(learning_rate),
               "min_child_samples" : int(min_child_samples),
                "reg_alpha" : float(reg_alpha),
                "reg_lambda" : float(reg_lambda),
               'colsample_bytree': float(colsample_bytree)   
            })

        # Use the budget configured in __init__, like the RF and XGBoost runners do,
        # instead of hard-coded values.
        best_lightgbm = optimize_lgbm(init_points=self.lightgbm_init_points, n_iter=self.lightgbm_n_iter, pbounds= params_lgbm, log_dir= self.log_path/"lgbm")
        return best_lightgbm
    
    def fit_predict(self, compile_results = True):
        """Optimise the random forest and the XGBoost model on self.x / self.y.

        Params:
            compile_results: currently unused; kept so existing callers keep working.

        Returns:
            (best_rf, best_xgboost): the results of `_run_rf` and `_run_xgboost`.
        """
        
        best_rf = self._run_rf(self.x, self.y, **self.rf_params)
        best_xgboost = self._run_xgboost(self.x, self.y, **self.xgboost_params)
        # LightGBM is disabled for now; enabling it would change the returned tuple.
        # best_lightgbm = self._run_lightgbm(self.x, self.y, **self.lightgbm_params)
        
        print("""
        ----------------------------------------------------
        Returned best_rf, best_xgboost
        
        """)
        return best_rf, best_xgboost #, best_lightgbm

    def __call__(self):
        """Shortcut for `fit_predict()`."""
        return self.fit_predict()
    
    def compile_results(self):
        """Return the table built by `_compile_results` from the logs under self.log_path."""
        return _compile_results(self.log_path)
        

In [None]:
# Build the iterator on the synthetic data with no parameter overrides and the
# default log directory (./bayes_opt_logs).
iterator = ModelIterator(x, y)

In [None]:
# Calling the iterator runs fit_predict(): `a` receives the random-forest result
# and `b` the XGBoost result.
a,b = iterator()

The best combination of hyperparameters are {'ccp_alpha': 0.029834902375002192, 'max_depth': 35.69860577029296, 'min_samples_split': 4.497178477874517, 'n_estimators': 406.4077438886299}
The best score for the hyperparameters are 0.99
The best combination of hyperparameters are {'gamma': 10.457964153104308, 'learning_rate': 0.27079083302557405, 'max_depth': 3.9195331071476, 'min_child_weight': 0.22851186079028146, 'n_estimators': 97.85382371807863, 'subsample': 0.5507585185908407}
The best score for the hyperparameters are 0.99

        ----------------------------------------------------
        Returned best_rf, best_xgboost
        
        


In [None]:
# Sanity check: the first object returned by the iterator is a random forest.
assert a.__class__.__name__ == "RandomForestClassifier"

In [None]:
# Collect the logged trials found under iterator.log_path into one table.
iterator.compile_results()

Unnamed: 0,target,params,datetime,model
0,0.99,"{'ccp_alpha': 0.029834902375002, 'max_depth': 35.69860577029296, 'min_samples_split': 4.497178477874517, 'n_estimators': 406.4077438886299}","{'datetime': '2020-10-30 12:15:31', 'elapsed': 0.0, 'delta': 0.0}",forest
14,0.99,"{'ccp_alpha': 0.049457167892975, 'max_depth': 21.34714460344915, 'min_samples_split': 2.097431305959691, 'n_estimators': 489.86987505735203}","{'datetime': '2020-10-30 12:15:45', 'elapsed': 14.030575, 'delta': 2.085842}",forest
4,0.99,"{'ccp_alpha': 0.021861492413113002, 'max_depth': 83.05304497803968, 'min_samples_split': 20.243912858025652, 'n_estimators': 270.7014108189465}","{'datetime': '2020-10-30 12:15:33', 'elapsed': 2.019139, 'delta': 0.823596}",forest
6,0.99,"{'ccp_alpha': 0.043781538376893005, 'max_depth': 62.67572775871829, 'min_samples_split': 46.88187614036777, 'n_estimators': 358.8362393777668}","{'datetime': '2020-10-30 12:15:35', 'elapsed': 4.171425, 'delta': 1.260813}",forest
10,0.99,"{'ccp_alpha': 0.01, 'max_depth': 44.14258634567429, 'min_samples_split': 2.0, 'n_estimators': 360.63113095974927}","{'datetime': '2020-10-30 12:15:40', 'elapsed': 8.294351, 'delta': 1.25263}",forest
11,0.99,"{'ccp_alpha': 0.1, 'max_depth': 105.0, 'min_samples_split': 25.40894051245977, 'n_estimators': 158.47752924077685}","{'datetime': '2020-10-30 12:15:40', 'elapsed': 9.178861, 'delta': 0.88451}",forest
12,0.99,"{'ccp_alpha': 0.069757657642516, 'max_depth': 75.70271526835617, 'min_samples_split': 2.0, 'n_estimators': 389.6688850547288}","{'datetime': '2020-10-30 12:15:42', 'elapsed': 10.656779, 'delta': 1.4779179999999998}",forest
13,0.99,"{'ccp_alpha': 0.1, 'max_depth': 105.0, 'min_samples_split': 2.0, 'n_estimators': 247.90684002652753}","{'datetime': '2020-10-30 12:15:43', 'elapsed': 11.944733, 'delta': 1.287954}",forest
27,0.89,"{'gamma': 5.7300257335118445, 'learning_rate': 0.22360059477888103, 'max_depth': 3.8748947524170463, 'min_child_weight': 0.341132165498487, 'n_estimators': 86.43925887285536, 'subsample': 0.499383142077564}","{'datetime': '2020-10-30 12:14:42', 'elapsed': 1.201651, 'delta': 0.12539799999999998}",xgboost
9,0.89,"{'ccp_alpha': 0.090029399554787, 'max_depth': 89.50191040914383, 'min_samples_split': 54.93897219851763, 'n_estimators': 160.88551700701495}","{'datetime': '2020-10-30 12:15:38', 'elapsed': 7.041721, 'delta': 0.594742}",forest


In [None]:
# Check that the best RF model matches the best forest parameters in the table above.
# The table logs the raw float proposals, while the model holds truncated values
# (e.g. max_depth 35.69... -> 35) because _run_rf casts them with int().
a.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.029834902375002192,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 35,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 406,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}