In [None]:
# default_exp model_iterator

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Iterate over classifiers with Bayesian optimization

In [None]:
# exporti
import random
import numpy as np
import pandas as pd
from fastcore.test import *
from sklearn.ensemble import RandomForestClassifier
from Yikai_helper_funcs import * 
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_multilabel_classification, make_regression
from sklearn.metrics import make_scorer, roc_auc_score
from fastai.data.transforms import get_files
from pathlib import Path

# from fastcore.basics import store_attr  # has bugs currently

from Yikai_helper_funcs.params import *

In [None]:
def get_data(n_samples=100, n_features=45, random_state=None):
    """Synthetic binary classification dataset.

    Params:
        n_samples (int): number of rows to generate.
        n_features (int): number of feature columns to generate.
        random_state (int, RandomState or None): seed forwarded to `make_classification`.
            The default (None) keeps the original, unseeded behaviour; pass an int for
            reproducible data.

    Returns:
        The (data, targets) pair produced by `make_classification`.
    """
    data, targets = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state)
    return data, targets


# Module-level sample features/targets; passed to ModelIterator in the demo cells below.
x, y = get_data()

In [None]:
# export 
def _compile_results(root_dir):
    '''
    Compile the optimizer results stored under a root directory into one DataFrame.

    Every .json file returned by `get_files(root_dir, extensions=".json")` is read as
    line-delimited JSON (one file corresponds to one call of an optimizer). Each row is
    tagged with a `model` column holding the name of the directory that contains the file,
    and all rows are returned sorted by `target`, best first.

    Params:
        root_dir (str or Path): The root directory where all the .json files were stored

    Returns:
        pd.DataFrame: rows of every log file plus a `model` column, sorted by `target`
        in descending order.

    Raises:
        FileNotFoundError: if no .json file is found under `root_dir`.
    '''
    # List the log files once instead of re-scanning the directory for every check.
    log_files = get_files(root_dir, extensions=".json")
    if len(log_files) == 0:
        raise FileNotFoundError(f"No .json log files found under '{root_dir}'")

    frames = []
    for file in log_files:
        frame = pd.read_json(file, lines=True)
        # The parent directory name identifies the model (e.g. "forest", "xgboost").
        frame['model'] = Path(file).parent.name
        frames.append(frame)

    # Concatenate every frame in one go: the previous pairwise concat kept only the
    # first and the last file when more than two logs were present.
    return pd.concat(frames, ignore_index=True).sort_values('target', ascending=False)


In [None]:
# Smoke-check `_compile_results`, but only when a log directory is already present.
if Path('./bayes_opt_logs/').exists():
    df = _compile_results(Path('./bayes_opt_logs/')) # a plain str such as './bayes_opt_logs/' works fine as well.
    # NOTE(review): a bare expression nested inside an `if` block is not rendered as cell output.
    df

# Iterate models

In [None]:
# export
class ModelIterator:
    """Run Bayesian hyperparameter searches for several classifiers on one dataset.

    Each `_run_*` helper builds its search bounds with the matching parameter generator,
    wraps a model factory with `optimize_bayes_param`, and logs under a model-specific
    sub-directory of `log_path`.

    Params:
        x, y: features and targets handed to `optimize_bayes_param`.
        rf_params, xgboost_params, lightgbm_params (dict or None): keyword arguments forwarded
            to RFParamGenerator / XgboostParamGenerator / LgbmParamGenerator to overwrite
            their default ranges. None or an empty dict means "no overrides".
        log_path (str or Path): root directory for the optimizer logs.
        rf_init_points, xgboost_init_points, lightgbm_init_points (int): forwarded as
            `init_points` to the decorated optimizer of the corresponding model.
        rf_n_iter, xgboost_n_iter, lightgbm_n_iter (int): forwarded as `n_iter` to the
            decorated optimizer of the corresponding model.
    """

    def __init__(self, x, y, *, rf_params = None, xgboost_params = None, 
                 lightgbm_params = None, log_path = Path("./bayes_opt_logs"),
                rf_init_points = 10, rf_n_iter = 5,
                xgboost_init_points = 10, xgboost_n_iter = 5, 
                lightgbm_init_points = 10, lightgbm_n_iter = 5):

        self.log_path = Path(log_path)
        # Always bind the three override dicts: previously they were only set when the
        # argument was empty, so user-supplied overrides ended in an AttributeError.
        # Copies are stored so later changes to the caller's dict do not leak in.
        self.rf_params = dict(rf_params) if rf_params else {}
        self.lightgbm_params = dict(lightgbm_params) if lightgbm_params else {}
        self.xgboost_params = dict(xgboost_params) if xgboost_params else {}  # These are passed in the fit_predict method
        self.rf_init_points = rf_init_points
        self.rf_n_iter = rf_n_iter
        self.xgboost_init_points = xgboost_init_points
        self.xgboost_n_iter = xgboost_n_iter
        self.lightgbm_init_points = lightgbm_init_points
        self.lightgbm_n_iter = lightgbm_n_iter
        self.x = x
        self.y = y

    # TODO: Add try except blocks
    def _run_rf(self, x, y, **kwargs_model):
        """Run the random forest search and log it at self.log_path/"forest".

        Params:
            x, y: data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default
                        generated from RFParamGenerator().matrix_generation()

        Returns:
            Whatever the decorated optimizer returns (the demo below receives a
            RandomForestClassifier).
        """
        params_forest = RFParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_forest(n_estimators, min_samples_split, max_depth, ccp_alpha):
            # The optimizer proposes floats; integer-valued hyperparameters are truncated.
            return RandomForestClassifier(n_estimators=int(n_estimators), min_samples_split=int(min_samples_split),
                                          max_depth=int(max_depth), ccp_alpha=float(ccp_alpha), n_jobs=-1)

        best_rf = optimize_forest(init_points=self.rf_init_points, n_iter=self.rf_n_iter,
                                  pbounds=params_forest, log_dir=self.log_path/"forest")
        return best_rf

    def _run_xgboost(self, x, y, **kwargs_model):
        """Run the XGBoost search and log it at self.log_path/"xgboost".

        Params:
            x, y: data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default
                        generated from XgboostParamGenerator().matrix_generation()

        Returns:
            Whatever the decorated optimizer returns.
        """
        params_xgboost = XgboostParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_xgboost(n_estimators, max_depth, min_child_weight, gamma, learning_rate, subsample):
            # Only the integer-valued hyperparameters are cast; the rest are passed through as given.
            return XGBClassifier(n_estimators=int(n_estimators), max_depth=int(max_depth),
                                 min_child_weight=min_child_weight, gamma=gamma, learning_rate=learning_rate,
                                 subsample=subsample,
                                 n_jobs=-1)

        best_xgboost = optimize_xgboost(init_points=self.xgboost_init_points, n_iter=self.xgboost_n_iter,
                                        pbounds=params_xgboost, log_dir=self.log_path/"xgboost")
        return best_xgboost

    def _run_lightgbm(self, x, y, **kwargs_model):
        """Run the LightGBM search and log it at self.log_path/"lgbm".

        Params:
            x, y: data handed to `optimize_bayes_param`.
            kwargs_model: the range for hyperparameters that you want to overwrite the default
                        generated from LgbmParamGenerator().matrix_generation()

        Returns:
            Whatever the decorated optimizer returns.
        """
        params_lgbm = LgbmParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_lgbm(num_leaves: int, learning_rate: float, min_child_samples, reg_alpha, reg_lambda, colsample_bytree):
            return LGBMClassifier(
                **{
                    "num_leaves": int(num_leaves),
                    "learning_rate": float(learning_rate),
                    "min_child_samples": int(min_child_samples),
                    "reg_alpha": float(reg_alpha),
                    "reg_lambda": float(reg_lambda),
                    'colsample_bytree': float(colsample_bytree),
                })

        # Use the configured budget (was hard-coded to init_points=5, n_iter=10, which
        # silently ignored lightgbm_init_points / lightgbm_n_iter).
        best_lightgbm = optimize_lgbm(init_points=self.lightgbm_init_points, n_iter=self.lightgbm_n_iter,
                                      pbounds=params_lgbm, log_dir=self.log_path/"lgbm")
        return best_lightgbm

    def fit_predict(self, compile_results = True):
        """Run the random forest and XGBoost searches on self.x / self.y.

        Params:
            compile_results (bool): currently unused; kept so existing callers keep working.

        Returns:
            (best_rf, best_xgboost): the results of `_run_rf` and `_run_xgboost`.
        """
        best_rf = self._run_rf(self.x, self.y, **self.rf_params)
        best_xgboost = self._run_xgboost(self.x, self.y, **self.xgboost_params)
        # The LightGBM search is currently disabled:
        # best_lightgbm = self._run_lightgbm(self.x, self.y, **self.lightgbm_params)

        print("""
        ----------------------------------------------------
        Returned best_rf, best_xgboost
        
        """)
        return best_rf, best_xgboost #, best_lightgbm

    def __call__(self):
        """Shortcut for `fit_predict()` with its default arguments."""
        return self.fit_predict()

    def compile_results(self):
        """Return the logs found under self.log_path, compiled by `_compile_results`."""
        return _compile_results(self.log_path)
        

In [None]:
# Build an iterator on the sample data with the default search settings and log path.
iterator = ModelIterator(x, y)

In [None]:
# Calling the instance runs fit_predict(), which returns (best_rf, best_xgboost).
a,b = iterator()

The best combination of hyperparameters are {'ccp_alpha': 0.08830026526241672, 'max_depth': 100.19386157870402, 'min_samples_split': 40.341341327020295, 'n_estimators': 350.0713846235119}
The best score for the hyperparameters are 0.93
The best combination of hyperparameters are {'gamma': 32.98401068803757, 'learning_rate': 0.04145004574335963, 'max_depth': 6.416662902313407, 'min_child_weight': 0.300129784066568, 'n_estimators': 18.071373415937323, 'subsample': 0.8980155925879995}
The best score for the hyperparameters are 0.9099999999999999

        ----------------------------------------------------
        Returned best_rf, best_xgboost
        
        


In [None]:
# Sanity check: the first returned object is a RandomForestClassifier.
assert a.__class__.__name__ == "RandomForestClassifier"

In [None]:
# Display the logs compiled from iterator.log_path.
iterator.compile_results()

Unnamed: 0,target,params,datetime,model
8,0.92,"{'ccp_alpha': 0.067279368242635, 'max_depth': 91.03474087288205, 'min_samples_split': 26.240855559576847, 'n_estimators': 430.02833821659726}","{'datetime': '2020-10-30 11:36:40', 'elapsed': 6.50518, 'delta': 1.469874}",forest
10,0.92,"{'ccp_alpha': 0.07100718803377201, 'max_depth': 48.415180987244405, 'min_samples_split': 23.245071908137618, 'n_estimators': 222.12516088441248}","{'datetime': '2020-10-30 11:36:42', 'elapsed': 7.872465, 'delta': 0.8809300000000001}",forest
29,0.91,"{'gamma': 23.471406303633138, 'learning_rate': 0.30000000000000004, 'max_depth': 6.496415610335795, 'min_child_weight': 0.6000000000000001, 'n_estimators': 69.8177992230171, 'subsample': 1.0}","{'datetime': '2020-10-30 11:37:30', 'elapsed': 2.38872, 'delta': 0.498231}",xgboost
28,0.91,"{'gamma': 26.684037893019685, 'learning_rate': 0.30000000000000004, 'max_depth': 10.089188228688416, 'min_child_weight': 0.6000000000000001, 'n_estimators': 67.6191739478623, 'subsample': 1.0}","{'datetime': '2020-10-30 11:37:30', 'elapsed': 1.890489, 'delta': 0.33347899999999997}",xgboost
27,0.91,"{'gamma': 25.03612198995891, 'learning_rate': 0.30000000000000004, 'max_depth': 6.6950519570567195, 'min_child_weight': 0.6000000000000001, 'n_estimators': 64.7335622168152, 'subsample': 1.0}","{'datetime': '2020-10-30 11:37:30', 'elapsed': 1.55701, 'delta': 0.385665}",xgboost
22,0.91,"{'gamma': 27.311304164918727, 'learning_rate': 0.17494967328494102, 'max_depth': 3.792886316623781, 'min_child_weight': 0.511493892786127, 'n_estimators': 68.24213228044414, 'subsample': 0.9711859231425151}","{'datetime': '2020-10-30 11:37:29', 'elapsed': 0.570624, 'delta': 0.098658}",xgboost
5,0.91,"{'ccp_alpha': 0.051249200760959006, 'max_depth': 90.2344518151939, 'min_samples_split': 6.47177794834542, 'n_estimators': 161.11245121507758}","{'datetime': '2020-10-30 11:36:37', 'elapsed': 3.643096, 'delta': 0.616495}",forest
20,0.91,"{'gamma': 29.907283188432896, 'learning_rate': 0.22936197296285502, 'max_depth': 4.936189462050605, 'min_child_weight': 0.585635681443024, 'n_estimators': 90.27879421089291, 'subsample': 0.9730594600408781}","{'datetime': '2020-10-30 11:37:28', 'elapsed': 0.37666000000000005, 'delta': 0.144502}",xgboost
16,0.91,"{'gamma': 21.597278566095166, 'learning_rate': 0.155399005398684, 'max_depth': 3.3453028355846213, 'min_child_weight': 0.36001909276979904, 'n_estimators': 10.781942173643085, 'subsample': 0.803301864300732}","{'datetime': '2020-10-30 11:37:28', 'elapsed': 0.026674, 'delta': 0.026674}",xgboost
13,0.9,"{'ccp_alpha': 0.09985531011958401, 'max_depth': 66.98699990325483, 'min_samples_split': 2.0, 'n_estimators': 391.71719157229995}","{'datetime': '2020-10-30 11:36:45', 'elapsed': 10.833482, 'delta': 1.6076329999999999}",forest


In [None]:
# Check that the parameters of the returned RF model match the best RF
# hyperparameters reported by the optimizer above.
a.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.08830026526241672,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 100,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 40,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 350,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}