# Kaggle Competition - Quant & Machine Learning Course
# Tutorial 14: Bayesian Optimization for Hyper-parameter Tuning

Modified based on resource https://www.kaggle.com/somang1418/tuning-hyperparameters-under-10-minutes-lgbm/data?select=train.csv

**Bayesian Optimization** is a probabilistic model based approach for finding the minimum of any function that returns a real-value metric. [(source)](https://towardsdatascience.com/an-introductory-example-of-bayesian-optimization-in-python-with-hyperopt-aae40fff4ff0)<br> It is very effective with real-world applications in high-dimensional parameter-tuning for complex machine learning algorithms. Bayesian optimization utilizes the Bayesian technique of setting a prior over the objective function and
combining it with evidence to get a posterior function.


## Loading Library and Dataset


In [18]:
#basic tools 
import os
import numpy as np
import pandas as pd
import warnings

#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

#graph, plots
import matplotlib.pyplot as plt
import seaborn as sns

#building models
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import time
import sys

#metrics 
from sklearn.metrics import roc_auc_score, roc_curve
import shap
warnings.simplefilter(action='ignore', category=FutureWarning)

By Changing the data type of each column, I reduced memory usages by 75%. By taking the minimum and the maximum of each column, the function assigns which numeric data type is optimal for the column and change the data type. If you want to know more about how it works, I suggest you to read [Eryk's article](https://towardsdatascience.com/make-working-with-large-dataframes-easier-at-least-for-your-memory-6f52b5f4b5c4)! 

In [19]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [20]:
%%time
train= reduce_mem_usage(pd.read_csv("~/Documents/deep-learning/quant_course/data/sales_prediction_train.csv"))
test= reduce_mem_usage(pd.read_csv("~/Documents/deep-learning/quant_course/data/sales_prediction_test.csv"))
print("Shape of train set: ",train.shape)
print("Shape of test set: ",test.shape)

Mem. usage decreased to 78.01 Mb (74.7% reduction)
Mem. usage decreased to 77.82 Mb (74.6% reduction)
Shape of train set:  (200000, 202)
Shape of test set:  (200000, 201)
CPU times: user 40.3 s, sys: 26 s, total: 1min 6s
Wall time: 1min 9s



## Bayesian Optimization with LightGBM


In [21]:
y=train['target']
X=train.drop(['ID_code','target'],axis=1)

In [22]:
X.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.921875,-6.785156,11.90625,5.09375,11.460938,-9.28125,5.117188,18.625,-4.921875,5.746094,...,4.433594,3.964844,3.136719,1.691406,18.515625,-2.398438,7.878906,8.5625,12.78125,-1.091797
1,11.5,-4.148438,13.859375,5.390625,12.359375,7.042969,5.621094,16.53125,3.146484,8.085938,...,7.640625,7.722656,2.583984,10.953125,15.429688,2.033203,8.125,8.789062,18.359375,1.952148
2,8.609375,-2.746094,12.078125,7.894531,10.585938,-9.085938,6.941406,14.617188,-4.917969,5.953125,...,2.90625,9.789062,1.669922,1.685547,21.609375,3.142578,-6.519531,8.265625,14.71875,0.396484
3,11.0625,-2.152344,8.953125,7.195312,12.585938,-1.835938,5.84375,14.921875,-5.859375,8.242188,...,4.464844,4.742188,0.717773,1.421875,23.03125,-1.270508,-2.927734,10.289062,17.96875,-9.0
4,9.835938,-1.483398,12.875,6.636719,12.273438,2.449219,5.941406,19.25,6.265625,7.679688,...,-1.490234,9.523438,-0.150757,9.195312,13.289062,-1.511719,3.925781,9.5,18.0,-8.8125


In [23]:
y.sum()

20098

In [26]:
%%time

def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6,n_estimators=10000, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgb_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample):
        params = {'application':'binary', 'metric':'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)
        
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])
     
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0),
                                            'num_leaves': (24, 80),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 30),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf':(0,100),
                                           'subsample': (0.01, 1.0)}, random_state=200)

    
    #n_iter: How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
    #init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.
    
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_auc=[]
    for model in range(len( lgbBO.res)):
        model_auc.append(lgbBO.res[model]['target'])
    
    # return best parameters
    return lgbBO.res[pd.Series(model_auc).idxmax()]['target'],lgbBO.res[pd.Series(model_auc).idxmax()]['params']


CPU times: user 27 µs, sys: 85 µs, total: 112 µs
Wall time: 126 µs


In [None]:
opt_params = bayes_parameter_opt_lgb(X, y, init_round=5, opt_round=10, n_folds=3, random_seed=6,n_estimators=10000)

Here is my optimal parameter for LightGBM. 

In [25]:
opt_params[1]["num_leaves"] = int(round(opt_params[1]["num_leaves"]))
opt_params[1]['max_depth'] = int(round(opt_params[1]['max_depth']))
opt_params[1]['min_data_in_leaf'] = int(round(opt_params[1]['min_data_in_leaf']))
opt_params[1]['max_bin'] = int(round(opt_params[1]['max_bin']))
opt_params[1]['objective']='binary'
opt_params[1]['metric']='auc'
opt_params[1]['is_unbalance']=True
opt_params[1]['boost_from_average']=False
opt_params=opt_params[1]
opt_params


{'bagging_fraction': 0.8587937033434263,
 'feature_fraction': 0.17035409528632064,
 'learning_rate': 0.20242837470652494,
 'max_bin': 74,
 'max_depth': 25,
 'min_data_in_leaf': 39,
 'min_sum_hessian_in_leaf': 13.471651529525241,
 'num_leaves': 27,
 'subsample': 0.7751907561797581,
 'objective': 'binary',
 'metric': 'auc',
 'is_unbalance': True,
 'boost_from_average': False}