In [None]:
# default_exp model_iterator

In [None]:
%load_ext autoreload
%autoreload 2

# Iterate over classifiers with Bayesian optimization

In [None]:
# exporti
import random
import numpy as np
import pandas as pd
from fastcore.test import *
from sklearn.ensemble import RandomForestClassifier
from Yikai_helper_funcs import * 
from sklearn.datasets import make_classification
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_multilabel_classification, make_regression
from sklearn.metrics import make_scorer, roc_auc_score
from fastai.data.transforms import get_files
from pathlib import Path

# from fastcore.basics import store_attr  # has bugs currently

from Yikai_helper_funcs.params import *

In [None]:
def get_data(n_samples=100, n_features=45, random_state=None):
    """Synthetic binary classification dataset.

    Params:
        n_samples (int): number of rows to generate.
        n_features (int): number of feature columns to generate.
        random_state (int or None): seed forwarded to `make_classification`;
            `None` keeps the previous, unseeded behaviour.

    Returns:
        tuple: `(data, targets)` exactly as returned by `make_classification`.
    """
    data, targets = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state)
    return data, targets


# Seed the demo data so that re-running the notebook trains on the same samples.
x, y = get_data(random_state=42)

In [None]:
# export 
def _compile_results(root_dir):
    """Compile the optimizer logs stored under `root_dir` into one DataFrame.

    Results are stored in .json files, where each .json file corresponds to one
    call of an optimizer. Every file is read as line-delimited JSON and gets a
    `model` column holding the name of the directory that contains the file.

    Params:
        root_dir (str or Path): The root directory where all the .json files are stored.

    Returns:
        pd.DataFrame: the rows of every log file, sorted by `target` in
        descending order. When no .json file is found, an empty frame with the
        `target` and `model` columns is returned.
    """
    # Scan the directory once instead of once per branch.
    log_files = get_files(root_dir, extensions=".json")
    if len(log_files) == 0:
        # Previously this path crashed with UnboundLocalError.
        return pd.DataFrame(columns=["target", "model"])

    frames = []
    for file in log_files:
        frame = pd.read_json(file, lines=True)
        # The parent directory name identifies the model; `.name` avoids
        # depending on "/" being the path separator.
        frame["model"] = Path(file).parent.name
        frames.append(frame)

    # Concatenate *all* frames at once: the previous loop only ever combined the
    # first file with the current one, dropping every file in between.
    return pd.concat(frames, ignore_index=True).sort_values("target", ascending=False)


In [None]:
# Compile whatever logs already exist from previous runs. The frame must be the
# cell's last top-level expression to be rendered: a bare `df` nested inside an
# `if` block is never displayed.
log_dir = Path('./bayes_opt_logs/')  # './bayes_opt_logs/' works fine as well.
df = _compile_results(log_dir) if log_dir.exists() else None
df

# Iterate models

In [None]:
# export
class ModelIterator:
    """Run Bayesian hyperparameter optimization over several classifiers.

    Each `_run_*` method builds a search space with the matching parameter
    generator, wraps a model factory with `optimize_bayes_param` and logs the
    optimization steps in a sub-directory of `log_path`.

    Params:
        x, y: training data and targets handed to `optimize_bayes_param`.
        rf_params, xgboost_params, lightgbm_params (dict or None): keyword
            arguments forwarded to `RFParamGenerator`, `XgboostParamGenerator`
            and `LgbmParamGenerator` to overwrite their default ranges.
        log_path (str or Path): root directory for the optimization logs.
        *_init_points, *_n_iter (int): `init_points` / `n_iter` passed to the
            optimizer of the corresponding model.
    """

    def __init__(self, x, y, *, rf_params=None, xgboost_params=None,
                 lightgbm_params=None, log_path=Path("./bayes_opt_logs"),
                 rf_init_points=10, rf_n_iter=5,
                 xgboost_init_points=10, xgboost_n_iter=5,
                 lightgbm_init_points=10, lightgbm_n_iter=5):
        self.x = x
        self.y = y
        self.log_path = Path(log_path)

        # Always bind the attributes. They used to be set only when the argument
        # was empty, so passing a real dict left them undefined and
        # `fit_predict` failed with AttributeError. Copy to avoid aliasing the
        # caller's dict.
        self.rf_params = dict(rf_params) if rf_params else {}
        self.xgboost_params = dict(xgboost_params) if xgboost_params else {}
        self.lightgbm_params = dict(lightgbm_params) if lightgbm_params else {}

        self.rf_init_points = rf_init_points
        self.rf_n_iter = rf_n_iter
        self.xgboost_init_points = xgboost_init_points
        self.xgboost_n_iter = xgboost_n_iter
        self.lightgbm_init_points = lightgbm_init_points
        self.lightgbm_n_iter = lightgbm_n_iter

    # TODO: Add try except blocks
    def _run_rf(self, x, y, **kwargs_model):
        """Run the random forest optimization and log it under `self.log_path`.

        Params:
            x, y: data and targets passed to `optimize_bayes_param`.
            kwargs_model: the ranges for hyperparameters that should overwrite
                the defaults generated by `RFParamGenerator().matrix_generation()`.

        Returns:
            Whatever the decorated optimizer returns -- presumably the best
            `RandomForestClassifier` found (TODO confirm against
            `optimize_bayes_param`).
        """
        params_forest = RFParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_forest(n_estimators, min_samples_split, max_depth, ccp_alpha):
            # The optimizer proposes floats; the integer-valued hyperparameters
            # are truncated with int() before building the model.
            return RandomForestClassifier(
                n_estimators=int(n_estimators),
                min_samples_split=int(min_samples_split),
                max_depth=int(max_depth),
                ccp_alpha=float(ccp_alpha),
                n_jobs=-1)

        best_rf = optimize_forest(
            init_points=self.rf_init_points,
            n_iter=self.rf_n_iter,
            pbounds=params_forest,
            log_dir=self.log_path/"forest_test")
        return best_rf

    def _run_xgboost(self, x, y, **kwargs_model):
        """Run the XGBoost optimization and log it under `self.log_path`.

        Params:
            x, y: data and targets passed to `optimize_bayes_param`.
            kwargs_model: the ranges for hyperparameters that should overwrite
                the defaults generated by `XgboostParamGenerator().matrix_generation()`.

        Returns:
            Whatever the decorated optimizer returns -- presumably the best
            `XGBClassifier` found (TODO confirm against `optimize_bayes_param`).
        """
        params_xgboost = XgboostParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_xgboost(n_estimators, max_depth, min_child_weight, gamma, learning_rate, subsample):
            return XGBClassifier(
                n_estimators=int(n_estimators),
                max_depth=int(max_depth),
                min_child_weight=min_child_weight,
                gamma=gamma,
                learning_rate=learning_rate,
                subsample=subsample,
                n_jobs=-1)

        best_xgboost = optimize_xgboost(
            init_points=self.xgboost_init_points,
            n_iter=self.xgboost_n_iter,
            pbounds=params_xgboost,
            log_dir=self.log_path/"xgboost_test")
        return best_xgboost

    def _run_lightgbm(self, x, y, **kwargs_model):
        """Run the LightGBM optimization and log it under `self.log_path`.

        Params:
            x, y: data and targets passed to `optimize_bayes_param`.
            kwargs_model: the ranges for hyperparameters that should overwrite
                the defaults generated by `LgbmParamGenerator().matrix_generation()`.

        Returns:
            Whatever the decorated optimizer returns -- presumably the best
            `LGBMClassifier` found (TODO confirm against `optimize_bayes_param`).
        """
        params_lgbm = LgbmParamGenerator(**kwargs_model).matrix_generation()

        @optimize_bayes_param(X=x, y=y)
        def optimize_lgbm(num_leaves: int, learning_rate: float, min_child_samples, reg_alpha, reg_lambda, colsample_bytree):
            return LGBMClassifier(
                num_leaves=int(num_leaves),
                learning_rate=float(learning_rate),
                min_child_samples=int(min_child_samples),
                reg_alpha=float(reg_alpha),
                reg_lambda=float(reg_lambda),
                colsample_bytree=float(colsample_bytree))

        # Honour the budgets given to the constructor; they used to be ignored
        # in favour of hard-coded init_points=5 / n_iter=10.
        best_lightgbm = optimize_lgbm(
            init_points=self.lightgbm_init_points,
            n_iter=self.lightgbm_n_iter,
            pbounds=params_lgbm,
            log_dir=self.log_path/"lgbm_test")
        return best_lightgbm

    def fit_predict(self, compile_results = True):
        """Optimize the random forest and XGBoost models on `self.x` / `self.y`.

        Params:
            compile_results (bool): currently unused; kept so existing callers
                keep working.

        Returns:
            tuple: `(best_rf, best_xgboost)`.
        """
        best_rf = self._run_rf(self.x, self.y, **self.rf_params)
        best_xgboost = self._run_xgboost(self.x, self.y, **self.xgboost_params)
        # LightGBM is disabled for now. To enable it, add
        # best_lightgbm = self._run_lightgbm(self.x, self.y, **self.lightgbm_params)
        # and extend the returned tuple.

        print("""
        ----------------------------------------------------
        Returned best_rf, best_xgboost
        
        """)
        return best_rf, best_xgboost #, best_lightgbm

    def __call__(self):
        """Shortcut for `fit_predict()`."""
        return self.fit_predict()

    def compile_results(self):
        """Return the logs under `self.log_path` compiled by `_compile_results`."""
        return _compile_results(self.log_path)
        

In [None]:
# Default search spaces and budgets: 10 initial points + 5 iterations per model.
iterator = ModelIterator(x, y)

In [None]:
# Calling the iterator runs `fit_predict`, which returns (best_rf, best_xgboost).
a,b = iterator()

The best combination of hyperparameters are {'ccp_alpha': 0.016125949599018908, 'max_depth': 85.2182034515113, 'min_samples_split': 30.20533749120456, 'n_estimators': 116.38316525005445}
The best score for the hyperparameters are 0.9
The best combination of hyperparameters are {'gamma': 19.44714671178108, 'learning_rate': 0.25218693542334614, 'max_depth': 5.56688841657944, 'min_child_weight': 0.49835854789956996, 'n_estimators': 57.528298183122644, 'subsample': 0.5058840564323542}
The best score for the hyperparameters are 0.9

        ----------------------------------------------------
        Returned best_rf, best_xgboost
        
        


In [None]:
# The first returned model must be the tuned random forest.
assert type(a).__name__ == "RandomForestClassifier"

In [None]:
# Gather the logs written under `iterator.log_path` into a single DataFrame.
iterator.compile_results()

Unnamed: 0,target,params,datetime,model
22,0.9,"{'gamma': 19.44714671178108, 'learning_rate': 0.252186935423346, 'max_depth': 5.56688841657944, 'min_child_weight': 0.498358547899569, 'n_estimators': 57.528298183122644, 'subsample': 0.505884056432354}","{'datetime': '2020-10-30 13:39:24', 'elapsed': 0.570715, 'delta': 0.06661}",xgboost
7,0.9,"{'ccp_alpha': 0.016125949599018002, 'max_depth': 85.2182034515113, 'min_samples_split': 30.20533749120456, 'n_estimators': 116.38316525005445}","{'datetime': '2020-10-30 13:39:14', 'elapsed': 6.618534, 'delta': 0.490751}",forest
15,0.89,"{'gamma': 18.737012674306975, 'learning_rate': 0.014042230087081001, 'max_depth': 7.598830144613695, 'min_child_weight': 0.029469544819655004, 'n_estimators': 17.20045943691692, 'subsample': 0.46396463518934905}","{'datetime': '2020-10-30 13:39:23', 'elapsed': 0.0, 'delta': 0.0}",xgboost
11,0.89,"{'ccp_alpha': 0.07057335618820701, 'max_depth': 94.35201054110658, 'min_samples_split': 19.5976583373162, 'n_estimators': 339.8544431378984}","{'datetime': '2020-10-30 13:39:20', 'elapsed': 11.903299, 'delta': 1.3172220000000001}",forest
21,0.89,"{'gamma': 24.8022125052445, 'learning_rate': 0.257794332839592, 'max_depth': 7.673275585234246, 'min_child_weight': 0.549623218287408, 'n_estimators': 16.218405446276055, 'subsample': 0.690499024759992}","{'datetime': '2020-10-30 13:39:24', 'elapsed': 0.504105, 'delta': 0.030532999999999998}",xgboost
19,0.89,"{'gamma': 21.439396566414764, 'learning_rate': 0.020335018280972002, 'max_depth': 8.919223833338647, 'min_child_weight': 0.34887411942114405, 'n_estimators': 90.41754215173228, 'subsample': 0.810918861932405}","{'datetime': '2020-10-30 13:39:24', 'elapsed': 0.440835, 'delta': 0.135839}",xgboost
1,0.89,"{'ccp_alpha': 0.07443589926857101, 'max_depth': 102.81105604300114, 'min_samples_split': 45.34188427654193, 'n_estimators': 327.32490352685704}","{'datetime': '2020-10-30 13:39:09', 'elapsed': 1.07041, 'delta': 1.07041}",forest
14,0.89,"{'ccp_alpha': 0.014363318727668, 'max_depth': 43.052297264116355, 'min_samples_split': 10.089210650567475, 'n_estimators': 278.34654944365667}","{'datetime': '2020-10-30 13:39:23', 'elapsed': 15.324774, 'delta': 1.079825}",forest
13,0.89,"{'ccp_alpha': 0.024827837855866, 'max_depth': 99.93259289398, 'min_samples_split': 16.52783778237177, 'n_estimators': 309.11254588276654}","{'datetime': '2020-10-30 13:39:22', 'elapsed': 14.244949, 'delta': 1.188328}",forest
12,0.89,"{'ccp_alpha': 0.1, 'max_depth': 65.0302536147975, 'min_samples_split': 7.550138093644308, 'n_estimators': 317.9106478932843}","{'datetime': '2020-10-30 13:39:21', 'elapsed': 13.056621, 'delta': 1.153322}",forest


In [None]:
# Check that the best RF model matches the best forest row in the table above.
# The integer-valued hyperparameters appear truncated because `_run_rf` casts
# n_estimators, min_samples_split and max_depth with int().
a.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.016125949599018908,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 85,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 30,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 116,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}