<a href="https://colab.research.google.com/github/ArpanSM/Machine_Learning_Hackathons/blob/master/Cipla_Data_Scientist_Hiring_Challenge_Hyperparameter_Tuning_HyperOpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install hyperopt
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/5a/41/24e14322b9986cf72a8763e0a0a69cc256cf963cf9502c8f0044a62c1ae8/catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2MB)
[K     |████████████████████████████████| 69.2MB 41kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [None]:
import warnings                                 
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# Feature-engineered train / test tables.
# NOTE(review): the /content/drive prefix presumably requires Google Drive to be
# mounted first; no mount step is visible in this notebook.
trainx = pd.read_csv("/content/drive/MyDrive/Projects/CiplaDS_Hiring/fe_train.csv")
testx = pd.read_csv("/content/drive/MyDrive/Projects/CiplaDS_Hiring/fe_test.csv")
pd.set_option('display.max_columns', 500)

# Target vector, then the feature matrix (training table minus the id and target columns).
y = trainx["Loan Sanction Amount (USD)"]
X = trainx.drop(columns=["Customer ID", "Loan Sanction Amount (USD)"])

In [None]:
#import required packages
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import gc
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample
#optional but advised
import warnings
warnings.filterwarnings('ignore')

#GLOBAL HYPEROPT PARAMETERS
NUM_EVALS = 1000 #default number of hyperopt evaluation rounds
N_FOLDS = 5 #number of cross-validation folds on data in each evaluation round

#LIGHTGBM PARAMETERS
LGBM_MAX_LEAVES = 2**11 #maximum number of leaves per tree for LightGBM
LGBM_MAX_DEPTH = 25 #maximum tree depth for LightGBM
EVAL_METRIC_LGBM_REG = 'rmse' #LightGBM regression metric
EVAL_METRIC_LGBM_CLASS = 'auc' #LightGBM classification metric

#XGBOOST PARAMETERS
XGB_MAX_LEAVES = 2**12 #maximum number of leaves when using histogram splitting
XGB_MAX_DEPTH = 25 #maximum tree depth for XGBoost
EVAL_METRIC_XGB_REG = 'rmse' #XGBoost regression metric
EVAL_METRIC_XGB_CLASS = 'auc' #XGBoost classification metric

#CATBOOST PARAMETERS
CB_MAX_DEPTH = 8 #maximum tree depth in CatBoost
OBJECTIVE_CB_REG = 'RMSE' #CatBoost regression objective
OBJECTIVE_CB_CLASS = 'Logloss' #CatBoost classification objective

#OPTIONAL OUTPUT
BEST_SCORE = 0 #not referenced by the tuning code below


def _report_best(best, trials, diagnostic):
    """Print the decoded best parameters and return them (with `trials` if `diagnostic`)."""
    print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
    if diagnostic:
        return (best, trials)
    return best


def _tune_lightgbm(data, labels, num_evals, diagnostic):
    """Search LightGBM regression hyperparameters with TPE, scoring by CV RMSE."""
    print('Running {} rounds of LightGBM parameter optimisation:'.format(num_evals))
    #clear space
    gc.collect()

    #hp.quniform yields floats; these must be cast to int before use
    integer_params = ['max_depth',
                      'num_leaves',
                      'max_bin',
                      'min_data_in_leaf',
                      'min_data_in_bin']

    train = lgb.Dataset(data, labels)

    def objective(space_params):
        #cast integer params from float to int
        for param in integer_params:
            space_params[param] = int(space_params[param])

        #flatten the nested conditional 'boosting' dict into top-level params
        boosting = space_params['boosting']
        if boosting['boosting'] == 'goss':
            #clamp each rate to [0, 0.5] so that 0 <= top_rate + other_rate <= 1
            space_params['top_rate'] = min(max(boosting.get('top_rate'), 0), 0.5)
            space_params['other_rate'] = min(max(boosting.get('other_rate'), 0), 0.5)
        space_params['subsample'] = boosting.get('subsample', 1.0)
        space_params['boosting'] = boosting['boosting']

        #for classification, set stratified=True and metrics=EVAL_METRIC_LGBM_CLASS
        #NOTE(review): num_boost_round is left at the lgb.cv default, which looks
        #no larger than early_stopping_rounds=100 -- confirm early stopping can trigger
        cv_results = lgb.cv(space_params, train, nfold=N_FOLDS, stratified=False,
                            early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_REG, seed=42)

        #loss is the mean CV metric at the last recorded round
        #for classification use: 1 - cv_results['auc-mean'][-1]
        best_loss = cv_results['{}-mean'.format(EVAL_METRIC_LGBM_REG)][-1]
        return {'loss': best_loss, 'status': STATUS_OK}

    #option lists used with hp.choice(); fmin() reports the chosen *index* of each
    boosting_list = [{'boosting': 'gbdt',
                      'subsample': hp.uniform('subsample', 0.5, 1)},
                     {'boosting': 'goss',
                      'subsample': 1.0,
                      'top_rate': hp.uniform('top_rate', 0, 0.5),
                      'other_rate': hp.uniform('other_rate', 0, 0.5)}] #if including 'dart', make sure to set 'n_estimators'
    metric_list = ['MAE', 'RMSE'] #for classification use ['auc']
    objective_list = ['huber', 'gamma', 'fair', 'tweedie'] #for classification use ['binary', 'cross_entropy']

    space = {'boosting': hp.choice('boosting', boosting_list),
             'num_leaves': hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
             'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
             'max_bin': hp.quniform('max_bin', 32, 255, 1),
             'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 256, 1),
             'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
             'min_gain_to_split': hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
             'lambda_l1': hp.uniform('lambda_l1', 0, 5),
             'lambda_l2': hp.uniform('lambda_l2', 0, 5),
             'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
             'metric': hp.choice('metric', metric_list),
             'objective': hp.choice('objective', objective_list),
             'feature_fraction': hp.quniform('feature_fraction', 0.5, 1, 0.01),
             'bagging_fraction': hp.quniform('bagging_fraction', 0.5, 1, 0.01)
             }

    #optional: activate GPU for LightGBM (requires a GPU-enabled build), e.g.
    #space['device'] = 'gpu'; space['gpu_platform_id'] = 0; space['gpu_device_id'] = 0

    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=num_evals,
                trials=trials)

    #translate hp.choice indices back into the actual option values
    best['boosting'] = boosting_list[best['boosting']]['boosting'] #nested dict, index twice
    best['metric'] = metric_list[best['metric']]
    best['objective'] = objective_list[best['objective']]

    #cast floats of integer params to int
    for param in integer_params:
        best[param] = int(best[param])

    return _report_best(best, trials, diagnostic)


def _tune_xgboost(data, labels, num_evals, diagnostic):
    """Search XGBoost regression hyperparameters with TPE, scoring by CV RMSE."""
    print('Running {} rounds of XGBoost parameter optimisation:'.format(num_evals))
    #clear space
    gc.collect()

    integer_params = ['max_depth']

    train = xgb.DMatrix(data, labels)

    def objective(space_params):
        for param in integer_params:
            space_params[param] = int(space_params[param])

        #flatten the nested conditional 'tree_method' dict into top-level params
        method = space_params['tree_method']
        if method['tree_method'] == 'hist':
            space_params['max_bin'] = int(method['max_bin'])
            policy = method['grow_policy']
            if policy['grow_policy'] == 'lossguide':
                space_params['max_leaves'] = int(policy['max_leaves'])
            space_params['grow_policy'] = policy['grow_policy']
        space_params['tree_method'] = method['tree_method']

        #for classification replace EVAL_METRIC_XGB_REG with EVAL_METRIC_XGB_CLASS
        #NOTE(review): num_boost_round is left at the xgb.cv default, which looks
        #smaller than early_stopping_rounds=100 -- confirm early stopping can trigger
        cv_results = xgb.cv(space_params, train, nfold=N_FOLDS, metrics=[EVAL_METRIC_XGB_REG],
                            early_stopping_rounds=100, stratified=False, seed=42)

        #loss is the mean CV test metric at the last recorded round
        #for classification use: 1 - cv_results['test-auc-mean'].iloc[-1]
        best_loss = cv_results['test-{}-mean'.format(EVAL_METRIC_XGB_REG)].iloc[-1]
        return {'loss': best_loss, 'status': STATUS_OK}

    #option lists used with hp.choice(); fmin() reports the chosen *index* of each
    #NOTE(review): XGBoost documents this parameter as 'booster'; the 'boosting'
    #key is kept because callers read it from the result, but it may have no
    #effect on training -- confirm
    boosting_list = ['gbtree', 'gblinear'] #if including 'dart', make sure to set 'n_estimators'
    metric_list = ['MAE', 'RMSE'] #for classification use ['auc']

    #Growth policy is only searched for the 'hist' tree method. This used to be a
    #dict literal with the key 'grow_policy' written twice, so the 'depthwise'
    #alternative was silently dropped; a nested hp.choice makes both reachable.
    grow_policy_list = [{'grow_policy': 'depthwise'},
                        {'grow_policy': 'lossguide',
                         'max_leaves': hp.quniform('max_leaves', 32, XGB_MAX_LEAVES, 1)}]

    tree_method = [{'tree_method': 'exact'},
                   {'tree_method': 'approx'},
                   {'tree_method': 'hist',
                    'max_bin': hp.quniform('max_bin', 2**3, 2**7, 1),
                    'grow_policy': hp.choice('grow_policy', grow_policy_list)}]

    #if using GPU, replace 'exact' with 'gpu_exact' and 'hist' with
    #'gpu_hist' in the nested dictionary above

    objective_list = ['reg:squarederror', 'reg:gamma', 'reg:tweedie'] #for classification use ['reg:logistic', 'binary:logistic']

    space = {'boosting': hp.choice('boosting', boosting_list),
             'tree_method': hp.choice('tree_method', tree_method),
             'max_depth': hp.quniform('max_depth', 2, XGB_MAX_DEPTH, 1),
             'reg_alpha': hp.uniform('reg_alpha', 0, 5),
             'reg_lambda': hp.uniform('reg_lambda', 0, 5),
             'min_child_weight': hp.uniform('min_child_weight', 0, 5),
             'gamma': hp.uniform('gamma', 0, 5),
             'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
             'eval_metric': hp.choice('eval_metric', metric_list),
             'objective': hp.choice('objective', objective_list),
             'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.01),
             'colsample_bynode': hp.quniform('colsample_bynode', 0.1, 1, 0.01),
             'colsample_bylevel': hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
             'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
             'nthread': -1
             }

    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=num_evals,
                trials=trials)

    #translate hp.choice indices back into the actual option values
    best['tree_method'] = tree_method[best['tree_method']]['tree_method']
    best['boosting'] = boosting_list[best['boosting']]
    best['eval_metric'] = metric_list[best['eval_metric']]
    best['objective'] = objective_list[best['objective']]
    #conditional entries only appear when the best trial used tree_method='hist'
    if 'grow_policy' in best:
        best['grow_policy'] = grow_policy_list[best['grow_policy']]['grow_policy']

    #cast floats of integer params to int
    for param in integer_params:
        best[param] = int(best[param])
    if 'max_leaves' in best:
        best['max_leaves'] = int(best['max_leaves'])
    if 'max_bin' in best:
        best['max_bin'] = int(best['max_bin'])

    return _report_best(best, trials, diagnostic)


def _tune_catboost(data, labels, num_evals, diagnostic):
    """Search CatBoost regression hyperparameters with TPE, scoring by CV RMSE."""
    print('Running {} rounds of CatBoost parameter optimisation:'.format(num_evals))
    #clear memory
    gc.collect()

    integer_params = ['depth',
                      #'one_hot_max_size', #for categorical data
                      'min_data_in_leaf',
                      'max_bin']

    train = cb.Pool(data, labels.astype('float32'))

    def objective(space_params):
        #cast integer params from float to int
        for param in integer_params:
            space_params[param] = int(space_params[param])

        #flatten the nested conditional dicts into top-level params
        bootstrap = space_params['bootstrap_type']
        if bootstrap['bootstrap_type'] == 'Bayesian':
            space_params['bagging_temperature'] = bootstrap.get('bagging_temperature')

        policy = space_params['grow_policy']
        #must match the spelling used in grow_policy below ('Lossguide'); the
        #previous 'LossGuide' never matched, so max_leaves was never forwarded
        if policy['grow_policy'] == 'Lossguide':
            space_params['max_leaves'] = int(policy.get('max_leaves'))

        space_params['bootstrap_type'] = bootstrap['bootstrap_type']
        space_params['grow_policy'] = policy['grow_policy']

        #random_strength cannot be < 0
        space_params['random_strength'] = max(space_params['random_strength'], 0)
        #fold_len_multiplier cannot be < 1
        space_params['fold_len_multiplier'] = max(space_params['fold_len_multiplier'], 1)

        #for classification set stratified=True
        cv_results = cb.cv(train, space_params, fold_count=N_FOLDS,
                           early_stopping_rounds=25, stratified=False, partition_random_seed=42)

        #loss is the mean CV test value of the objective at the last recorded round
        #for classification use: cv_results['test-Logloss-mean'].iloc[-1]
        best_loss = cv_results['test-{}-mean'.format(OBJECTIVE_CB_REG)].iloc[-1]
        return {'loss': best_loss, 'status': STATUS_OK}

    #option lists used with hp.choice(); fmin() reports the chosen *index* of each
    bootstrap_type = [{'bootstrap_type': 'Bernoulli'},
                      {'bootstrap_type': 'Bayesian',
                       'bagging_temperature': hp.loguniform('bagging_temperature', np.log(1), np.log(50))}]
    LEB = ['No', 'AnyImprovement'] #'Armijo' is left out (per the original note, not for CPU)
    grow_policy = [{'grow_policy': 'SymmetricTree'},
                   {'grow_policy': 'Depthwise'},
                   {'grow_policy': 'Lossguide',
                    'max_leaves': hp.quniform('max_leaves', 2, 32, 1)}]
    eval_metric_list = ['MAE', 'RMSE', 'Poisson'] #for classification use ['Logloss', 'AUC', 'F1']

    space = {'depth': hp.quniform('depth', 2, CB_MAX_DEPTH, 1),
             'max_bin': hp.quniform('max_bin', 1, 32, 1), #if using CPU just set this to 254
             'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0, 5),
             'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 50, 1),
             'random_strength': hp.loguniform('random_strength', np.log(0.005), np.log(5)),
             #'one_hot_max_size' : hp.quniform('one_hot_max_size', 2, 16, 1), #uncomment if using categorical features
             'bootstrap_type': hp.choice('bootstrap_type', bootstrap_type),
             'learning_rate': hp.uniform('learning_rate', 0.05, 0.25),
             'eval_metric': hp.choice('eval_metric', eval_metric_list),
             'objective': OBJECTIVE_CB_REG,
             'leaf_estimation_backtracking': hp.choice('leaf_estimation_backtracking', LEB),
             'grow_policy': hp.choice('grow_policy', grow_policy),
             'fold_len_multiplier': hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
             'od_type': 'Iter',
             'od_wait': 25,
             'task_type': 'CPU', #set to 'GPU' to train on GPU
             'verbose': 0
             }

    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=num_evals,
                trials=trials)

    #translate hp.choice indices back into the actual option values
    best['bootstrap_type'] = bootstrap_type[best['bootstrap_type']]['bootstrap_type']
    best['grow_policy'] = grow_policy[best['grow_policy']]['grow_policy']
    best['eval_metric'] = eval_metric_list[best['eval_metric']]
    best['leaf_estimation_backtracking'] = LEB[best['leaf_estimation_backtracking']]

    #cast floats of integer params to int
    for param in integer_params:
        best[param] = int(best[param])
    if 'max_leaves' in best:
        best['max_leaves'] = int(best['max_leaves'])

    return _report_best(best, trials, diagnostic)


def quick_hyperopt(data, labels, package='lgbm', num_evals=NUM_EVALS, diagnostic=False):
    """Tune a gradient-boosting library's hyperparameters with hyperopt (TPE).

    Each evaluation runs an N_FOLDS-fold cross-validation of the sampled
    parameters and minimises the mean RMSE of the final boosting round.

    Parameters
    ----------
    data : feature matrix handed to lgb.Dataset / xgb.DMatrix / cb.Pool.
    labels : target values; for 'cb' they must support ``.astype('float32')``.
    package : 'lgbm' (LightGBM), 'xgb' (XGBoost) or 'cb' (CatBoost).
    num_evals : number of hyperopt evaluations (``max_evals`` of ``fmin``).
    diagnostic : if True, also return the hyperopt ``Trials`` object.

    Returns
    -------
    dict of the best parameters, with hp.choice indices decoded to their values
    and integer parameters cast to int; ``(best, trials)`` when ``diagnostic``
    is True. For an unrecognised ``package`` a message is printed and None is
    returned.
    """
    if package == 'lgbm':
        return _tune_lightgbm(data, labels, num_evals, diagnostic)
    if package == 'xgb':
        return _tune_xgboost(data, labels, num_evals, diagnostic)
    if package == 'cb':
        return _tune_catboost(data, labels, num_evals, diagnostic)

    print('Package not recognised. Please use "lgbm" for LightGBM, "xgb" for XGBoost or "cb" for CatBoost.')
    return None

### LightGBM

In [None]:
# 500-round LightGBM search (about two hours in the recorded run below).
lgbm_params = quick_hyperopt(X, y, package='lgbm', num_evals=500)

Running 500 rounds of LightGBM parameter optimisation:
100%|██████████| 500/500 [2:04:02<00:00, 14.88s/it, best loss: 21361.336218131346]
{bagging_fraction: 0.5
boosting: gbdt
feature_fraction: 0.96
lambda_l1: 4.77689014327168
lambda_l2: 4.499321371418711
learning_rate: 0.10786399263172929
max_bin: 46
max_depth: 6
metric: RMSE
min_data_in_bin: 228
min_data_in_leaf: 20
min_gain_to_split: 1.73
num_leaves: 9
objective: tweedie
subsample: 0.8636282309962746}


In [None]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score

# Best LightGBM configuration, copied by hand from the search output printed above
# (the searched 'metric' entry is not passed on).
lgb_best_params = {
    'bagging_fraction': 0.5,
    'boosting': 'gbdt',
    'feature_fraction': 0.96,
    'lambda_l1': 4.77689014327168,
    'lambda_l2': 4.499321371418711,
    'learning_rate': 0.10786399263172929,
    'max_bin': 46,
    'max_depth': 6,
    'min_data_in_bin': 228,
    'min_data_in_leaf': 20,
    'min_gain_to_split': 1.73,
    'num_leaves': 9,
    'objective': 'tweedie',
    'subsample': 0.8636282309962746,
}
lgb_reg = lgb.LGBMRegressor(**lgb_best_params)

# Mean R^2 over 10-fold cross-validation on the training data.
scores = cross_val_score(lgb_reg, X, y, scoring='r2', cv=10)
scores.mean()

0.7937341178939272

In [None]:
# Refit on the full training set before scoring the test set.
lgb_reg.fit(X, y)

# Select exactly the training feature columns, in training order. This drops
# 'Customer ID' and also ignores any prediction column another cell may have
# written into testx.
preds = lgb_reg.predict(testx[X.columns])

# Build the submission as a new frame instead of writing predictions back into
# testx, so the test features stay unchanged and this cell is safe to re-run.
sub = pd.DataFrame({'Customer ID': testx['Customer ID'],
                    'Loan Sanction Amount (USD)': preds})
sub.to_csv("sub2.csv", index=False)

### XGBoost

In [None]:
# 500-round XGBoost search (about 2h40 in the recorded run below).
xgb_params = quick_hyperopt(X, y, package='xgb', num_evals=500)

Running 500 rounds of XGBoost parameter optimisation:
100%|██████████| 500/500 [2:42:47<00:00, 19.53s/it, best loss: 23031.74375]
{boosting: gbtree
colsample_bylevel: 0.66
colsample_bynode: 0.8200000000000001
colsample_bytree: 0.97
eval_metric: MAE
gamma: 1.4127400227564677
learning_rate: 0.19949042730420335
max_depth: 6
min_child_weight: 1.2198659007882582
objective: reg:squarederror
reg_alpha: 1.859401255487128
reg_lambda: 3.2882463739391916
subsample: 1.0
tree_method: exact}


In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# Best XGBoost configuration, copied by hand from the search output printed above.
# Two transcription errors are corrected here: the search reported max_depth=6
# (previously entered as max_bin=6) and tree_method='exact' (previously 'hist').
# 'booster' is XGBoost's documented name for the parameter the search called
# 'boosting'; 'gbtree' is what the search selected.
# NOTE: the score recorded below this cell was produced with the old
# parameters and will differ after re-running.
xg_reg = xgb.XGBRegressor(booster='gbtree',
                          colsample_bylevel=0.66,
                          colsample_bynode=0.82,
                          colsample_bytree=0.97,
                          gamma=1.4127400227564677,
                          learning_rate=0.19949042730420335,
                          max_depth=6,
                          min_child_weight=1.2198659007882582,
                          objective='reg:squarederror',
                          reg_alpha=1.859401255487128,
                          reg_lambda=3.2882463739391916,
                          subsample=1.0,
                          tree_method='exact')

# Mean R^2 over 10-fold cross-validation on the training data.
scores = cross_val_score(xg_reg, X, y, scoring='r2', cv=10)
np.mean(scores)

0.7284611037194745

In [None]:
# Refit on the full training set before scoring the test set.
xg_reg.fit(X, y)

# Select exactly the training feature columns, in training order. This drops
# 'Customer ID' and also ignores any prediction column another cell may have
# written into testx.
preds = xg_reg.predict(testx[X.columns])

# Build the submission as a new frame instead of writing predictions back into
# testx, so the test features stay unchanged and this cell is safe to re-run.
sub = pd.DataFrame({'Customer ID': testx['Customer ID'],
                    'Loan Sanction Amount (USD)': preds})
sub.to_csv("sub3.csv", index=False)

### CatBoost

In [None]:
# Only 10 CatBoost rounds: each one took minutes in the recorded run below.
cb_params = quick_hyperopt(X, y, package='cb', num_evals=10)

Running 10 rounds of CatBoost parameter optimisation:
  0%|          | 0/10 [00:00<?, ?it/s, best loss: ?]Stopped by overfitting detector  (25 iterations wait)
 40%|████      | 4/10 [19:41<29:32, 295.44s/it, best loss: 23735.737704165185]Stopped by overfitting detector  (25 iterations wait)
 80%|████████  | 8/10 [31:00<07:31, 225.61s/it, best loss: 23117.7122553992]Stopped by overfitting detector  (25 iterations wait)
100%|██████████| 10/10 [33:52<00:00, 203.25s/it, best loss: 22340.60945541969]
{bagging_temperature: 1.0533951627285427
bootstrap_type: Bayesian
depth: 4
eval_metric: RMSE
fold_len_multiplier: 1.77261654004014
grow_policy: Depthwise
l2_leaf_reg: 1.848322548272635
leaf_estimation_backtracking: No
learning_rate: 0.22729338132226334
max_bin: 24
min_data_in_leaf: 12
random_strength: 0.02072616892556881}


In [None]:
import catboost as cb
from sklearn.model_selection import cross_val_score

# Best CatBoost configuration, copied by hand from the search output printed above
# (the searched 'eval_metric' entry is not passed on).
# verbose=0 silences the per-iteration training log: across 10 CV folds it
# overflowed the cell output ("truncated to the last 5000 lines").
cb_reg = cb.CatBoostRegressor(bootstrap_type='Bayesian',
                              bagging_temperature=1.0533951627285427,
                              depth=4,
                              grow_policy='Depthwise',
                              fold_len_multiplier=1.77261654004014,
                              l2_leaf_reg=1.848322548272635,
                              leaf_estimation_backtracking='No',
                              learning_rate=0.22729338132226334,
                              max_bin=24,
                              min_data_in_leaf=12,
                              random_strength=0.02072616892556881,
                              verbose=0)

# Mean R^2 over 10-fold cross-validation on the training data.
scores = cross_val_score(cb_reg, X, y, scoring='r2', cv=10)
np.mean(scores)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0:	learn: 41139.9108560	total: 54.5ms	remaining: 54.5s
1:	learn: 35881.7468190	total: 110ms	remaining: 55s
2:	learn: 32160.5814638	total: 159ms	remaining: 53s
3:	learn: 29545.4155855	total: 211ms	remaining: 52.5s
4:	learn: 27521.2597692	total: 272ms	remaining: 54.1s
5:	learn: 25950.9172771	total: 326ms	remaining: 54s
6:	learn: 24883.8371984	total: 379ms	remaining: 53.8s
7:	learn: 24226.6151740	total: 433ms	remaining: 53.7s
8:	learn: 23764.2770595	total: 497ms	remaining: 54.7s
9:	learn: 23374.3064597	total: 554ms	remaining: 54.8s
10:	learn: 23009.7244360	total: 604ms	remaining: 54.3s
11:	learn: 22798.6078918	total: 656ms	remaining: 54s
12:	learn: 22586.7298268	total: 721ms	remaining: 54.7s
13:	learn: 22429.5071191	total: 791ms	remaining: 55.7s
14:	learn: 22301.6018931	total: 835ms	remaining: 54.9s
15:	learn: 22242.3940589	total: 900ms	remaining: 55.3s
16:	learn: 22157.6587602	total: 954ms	remaining: 55.1s
17:	learn: 22126.

0.7647668619127058

In [None]:
# Refit on the full training set before scoring the test set.
cb_reg.fit(X, y)

# Select exactly the training feature columns, in training order. This drops
# 'Customer ID' and also ignores any prediction column another cell may have
# written into testx.
preds = cb_reg.predict(testx[X.columns])

# Build the submission as a new frame instead of writing predictions back into
# testx, so the test features stay unchanged and this cell is safe to re-run.
sub = pd.DataFrame({'Customer ID': testx['Customer ID'],
                    'Loan Sanction Amount (USD)': preds})
sub.to_csv("sub4.csv", index=False)

0:	learn: 40993.2012648	total: 59.9ms	remaining: 59.8s
1:	learn: 35383.9902928	total: 118ms	remaining: 58.9s
2:	learn: 31626.4517859	total: 183ms	remaining: 1m
3:	learn: 28825.0510525	total: 237ms	remaining: 59.1s
4:	learn: 27020.8099994	total: 302ms	remaining: 1m
5:	learn: 25595.5903343	total: 356ms	remaining: 59s
6:	learn: 24607.5123273	total: 417ms	remaining: 59.2s
7:	learn: 24011.3535692	total: 486ms	remaining: 1m
8:	learn: 23486.7429151	total: 542ms	remaining: 59.6s
9:	learn: 23169.2131883	total: 586ms	remaining: 58.1s
10:	learn: 22929.1888044	total: 639ms	remaining: 57.5s
11:	learn: 22760.7262398	total: 704ms	remaining: 58s
12:	learn: 22583.3691249	total: 766ms	remaining: 58.2s
13:	learn: 22467.5925843	total: 826ms	remaining: 58.2s
14:	learn: 22366.9217911	total: 902ms	remaining: 59.2s
15:	learn: 22296.0794841	total: 958ms	remaining: 58.9s
16:	learn: 22224.8223609	total: 1.01s	remaining: 58.6s
17:	learn: 22102.8781833	total: 1.07s	remaining: 58.6s
18:	learn: 22064.0453280	total: 