In [1]:
# !pip install lightgbm xgboost catboost category-encoders sklearn pandas==1.1.5

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# drive_path = 'drive/MyDrive/深度学习/机器学习训练营/final/'

Mounted at /content/drive


In [2]:
import pandas as pd
import missingno as msno
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import lightgbm as lgb

In [3]:
drive_path = 'final/'

## data

In [4]:
train = pd.read_csv(drive_path + 'train_final.csv')
test = pd.read_csv(drive_path + 'test_final.csv')

In [5]:
train_dataset = lgb.Dataset(train.drop(columns='loan_status'), train['loan_status'])
test_dataset = lgb.Dataset(test.drop(columns='loan_status'), test['loan_status'])

## LGBM

In [54]:
import io
import multiprocessing
from contextlib import redirect_stdout
from copy import deepcopy
from dataclasses import dataclass, asdict
import hyperopt.pyll
from hyperopt import fmin, tpe, hp
from hyperopt import space_eval
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

import copy
cpu_count = 4
use_gpu = False
@dataclass
class LGBOpt:
    num_threads: any = hp.choice('num_threads', [cpu_count])
    num_leaves: any = hp.choice('num_leaves', [30,40,50,60,70,80,90,100,110,120,130,140])
    metric: any = hp.choice('metric', ['binary_error'])
    num_round: any = hp.choice('num_rounds', [1500])
    objective: any = hp.choice('objective', ['binary'])
    learning_rate: any = hp.uniform('learning_rate', 0.01, 0.1)
    feature_fraction: any = hp.uniform('feature_fraction', 0.5, 1.0)
    bagging_fraction: any = hp.uniform('bagging_fraction', 0.8, 1.0)
    device_type: any = hp.choice('device_tpye', ['gpu']) if use_gpu else hp.choice('device_type',
                                                                                   ['cpu'])
    boosting: any = hp.choice('boosting', ['gbdt', 'dart', 'goss'])
    extra_trees: any = hp.choice('extra_tress', [False, True])
    drop_rate: any = hp.uniform('drop_rate', 0, 0.2)
    uniform_drop: any = hp.choice('uniform_drop', [True, False])
    lambda_l1: any = hp.uniform('lambda_l1', 0, 10)  # TODO: Check range
    lambda_l2: any = hp.uniform('lambda_l2', 0, 10)  # TODO: Check range
    min_gain_to_split: any = hp.uniform('min_gain_to_split', 0, 1)  # TODO: Check range
    min_data_in_bin = hp.choice('min_data_in_bin', [3, 5, 10, 15, 20, 50])

    @staticmethod
    def get_common_params():
        return {'num_thread': 4, 'num_leaves': 12, 'metric': 'binary', 'objective': 'binary',
                'num_round': 1000, 'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}

In [121]:
class FitterBase(object):
    def __init__(self, label, metric, max_eval=100, opt=None):
        self.label = label
        self.metric = metric
        self.opt_params = dict()
        self.max_eval = max_eval
        self.opt = opt

    def get_loss(self, y, y_pred):
        if self.metric == 'error':
            return 1 - accuracy_score(y, y_pred)
        elif self.metric == 'precision':
            return 1 - precision_score(y, y_pred)
        elif self.metric == 'recall':
            return 1 - recall_score(y, y_pred)
        elif self.metric == 'macro_f1':
            return 1 - f1_score(y, y_pred, average='macro')
        elif self.metric == 'micro_f1':
            return 1 - f1_score(y, y_pred, average='micro')
        elif self.metric == 'auc':  # TODO: Add a warning checking if y_predict is all [0, 1], it should be probability
            return 1 - roc_auc_score(y, y_pred)
        else:
            raise Exception("Not implemented yet.")

class LGBFitter(FitterBase):
    def __init__(self, label='label', metric='error', opt: LGBOpt = None, max_eval=100):
        super(LGBFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = LGBOpt()
        self.best_round = None
        self.clf = None

    def train(self, train_df, eval_df, params=None, use_best_eval=True):
        self.best_round = None
        dtrain = lgb.Dataset(train_df.drop(columns=[self.label]), train_df[self.label])
        deval = lgb.Dataset(eval_df.drop(columns=[self.label]), eval_df[self.label])
        evallist = [dtrain, deval]
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)

        num_round = use_params.pop('num_round')
        if use_best_eval:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            min_error = np.inf
            min_index = 0
            for idx in range(len(output) - 1):
                if len(output[idx].split("\t")) == 3:
                    temp = float(output[idx].split("\t")[2].split(":")[1])
                    if min_error > temp:
                        min_error = temp
                        min_index = int(output[idx].split("\t")[0][1:-1])
            print("The minimum is attained in round %d" % (min_index + 1))
            self.best_round = min_index + 1
            return output
        else:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            self.best_round = num_round
            return output

    def search(self, train_df, eval_df, use_best_eval=True):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params, use_best_eval)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           num_iteration=self.best_round) > 0.5).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data, use_best_eval=True):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.loc[train_id]
                eval_df = data.loc[eval_id]
                self.train(train_df, eval_df, params, use_best_eval)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
                else:
                    y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                               num_iteration=self.best_round) > 0.5).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None, drop_test_y=True, use_best_eval=True):
        acc_result = list()
        train_pred = np.empty(train_data.shape[0])
        test_pred = np.empty(test_data.shape[0])
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data

        models = list()
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.loc[train_id]
            eval_df = train_data.loc[eval_id]
            self.train(train_df, eval_df, params, use_best_eval)
            models.append(copy.deepcopy(self.clf))
            train_pred[eval_id] = self.clf.predict(eval_df.drop(columns=self.label), num_iteration=self.best_round)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           num_iteration=self.best_round) > 0.5).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            test_pred += self.clf.predict(dtest, num_iteration=self.best_round)
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result, models

In [56]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)

### baseline

In [9]:
fitter = LGBFitter(label='loan_status')

fitter.search_k_fold(kfold, train)
para = space_eval(asdict(fitter.opt), fitter.opt_params)

The minimum is attained in round 737                   
The minimum is attained in round 120                   
The minimum is attained in round 118                   
The minimum is attained in round 164                   
The minimum is attained in round 185                   
The minimum is attained in round 525                                              
The minimum is attained in round 146                                              
The minimum is attained in round 49                                               
The minimum is attained in round 536                                              
The minimum is attained in round 140                                              
The minimum is attained in round 118                                                  
The minimum is attained in round 46                                                   
The minimum is attained in round 42                                                   
The minimum is attained in round 497        

The minimum is attained in round 155                                        
The minimum is attained in round 426                                        
The minimum is attained in round 208                                        
The minimum is attained in round 975                                        
The minimum is attained in round 226                                        
The minimum is attained in round 417                                                    
The minimum is attained in round 768                                                    
The minimum is attained in round 341                                                    
The minimum is attained in round 705                                                    
The minimum is attained in round 299                                                    
The minimum is attained in round 198                                                    
The minimum is attained in round 312                                             

The minimum is attained in round 910                                                 
The minimum is attained in round 996                                                 
The minimum is attained in round 898                                                 
The minimum is attained in round 859                                                 
The minimum is attained in round 912                                                 
The minimum is attained in round 289                                                   
The minimum is attained in round 388                                                   
The minimum is attained in round 607                                                   
The minimum is attained in round 830                                                   
The minimum is attained in round 140                                                   
The minimum is attained in round 938                                                   
The minimum is attained in round 158      

The minimum is attained in round 80                                      
The minimum is attained in round 51                                      
The minimum is attained in round 279                                     
The minimum is attained in round 247                                     
The minimum is attained in round 758                                        
The minimum is attained in round 566                                        
The minimum is attained in round 73                                         
The minimum is attained in round 290                                        
The minimum is attained in round 116                                        
The minimum is attained in round 883                                        
The minimum is attained in round 122                                       
The minimum is attained in round 607                                       
The minimum is attained in round 356                                       
The minimum is

The minimum is attained in round 383                                     
The minimum is attained in round 793                                     
The minimum is attained in round 372                                     
The minimum is attained in round 815                                     
The minimum is attained in round 171                                     
The minimum is attained in round 240                                     
The minimum is attained in round 72                                      
The minimum is attained in round 421                                     
The minimum is attained in round 271                                     
The minimum is attained in round 241                                     
The minimum is attained in round 202                                     
The minimum is attained in round 99                                      
The minimum is attained in round 663                                     
The minimum is attained in round 277  

In [161]:
fitter = LGBFitter(label='loan_status')
output = fitter.train_k_fold(kfold, train, test, params=para, use_best_eval=True)
test_acc = accuracy_score(test['loan_status'], output[1].round())
print(f'accuracy_score: {test_acc}')

The minimum is attained in round 232
Finished loading model, total used 1000 iterations
The minimum is attained in round 252
Finished loading model, total used 1000 iterations
The minimum is attained in round 105
Finished loading model, total used 1000 iterations
The minimum is attained in round 823
Finished loading model, total used 1000 iterations
The minimum is attained in round 163
Finished loading model, total used 1000 iterations
accuracy_score: 0.91108


In [57]:
train_new = train.copy(deep=True)
test_new = test.copy(deep=True)

In [58]:
# train_new = train_new.drop(columns=['discrete_term_1_one_hot','discrete_home_ownership_1_one_hot',
#                                     'discrete_purpose_1_one_hot','continuous_pub_rec','continuous_dti_joint'])
# test_new = test_new.drop(columns=['discrete_term_1_one_hot','discrete_home_ownership_1_one_hot',
#                                   'discrete_purpose_1_one_hot','continuous_pub_rec','continuous_dti_joint'])

train_new['diff_funded_amt'] = train_new['continuous_funded_amnt'] - train_new['continuous_funded_amnt_inv']
test_new['diff_funded_amt'] = test_new['continuous_funded_amnt'] - test_new['continuous_funded_amnt_inv']

train_new['division_continuous_mths_since'] = train_new['continuous_mths_since_last_record'] / train_new['continuous_mths_since_last_major_derog']
test_new['division_continuous_mths_since'] = test_new['continuous_mths_since_last_record'] / test_new['continuous_mths_since_last_major_derog']

train_new['rate_inst_inc'] = train_new['continuous_installment']*12 / train_new['continuous_annual_inc']
test_new['rate_inst_inc'] = test_new['continuous_installment']*12 / test_new['continuous_annual_inc']

In [59]:
fitter_new = LGBFitter(label='loan_status')

fitter_new.search_k_fold(kfold, train_new)
para_new = space_eval(asdict(fitter_new.opt), fitter_new.opt_params)
print(para_new)

The minimum is attained in round 231                   
The minimum is attained in round 194                   
The minimum is attained in round 141                   
The minimum is attained in round 365                   
The minimum is attained in round 483                   
The minimum is attained in round 131                                                
The minimum is attained in round 852                                                
The minimum is attained in round 254                                                
The minimum is attained in round 117                                                
The minimum is attained in round 160                                                
The minimum is attained in round 1247                                               
The minimum is attained in round 106                                              
The minimum is attained in round 381                                              
The minimum is attained in round 1200       

The minimum is attained in round 142                                                 
The minimum is attained in round 661                                                 
The minimum is attained in round 282                                                 
The minimum is attained in round 370                                                 
The minimum is attained in round 109                                                  
The minimum is attained in round 107                                                  
The minimum is attained in round 705                                                  
The minimum is attained in round 1097                                                 
The minimum is attained in round 339                                                  
The minimum is attained in round 109                                                  
The minimum is attained in round 80                                                  
The minimum is attained in round 217            

The minimum is attained in round 50                                                
The minimum is attained in round 285                                               
The minimum is attained in round 924                                               
The minimum is attained in round 158                                               
The minimum is attained in round 116                                               
The minimum is attained in round 78                                                
The minimum is attained in round 96                                                
The minimum is attained in round 377                                               
The minimum is attained in round 302                                               
The minimum is attained in round 189                                               
The minimum is attained in round 1224                                              
The minimum is attained in round 411                                        

The minimum is attained in round 192                                                    
The minimum is attained in round 133                                                    
The minimum is attained in round 166                                                    
The minimum is attained in round 413                                                    
The minimum is attained in round 107                                                    
The minimum is attained in round 105                                                    
The minimum is attained in round 67                                                     
The minimum is attained in round 396                                                    
The minimum is attained in round 283                                                    
The minimum is attained in round 72                                                     
The minimum is attained in round 50                                                     
The minimum is attain

The minimum is attained in round 505                                                 
The minimum is attained in round 213                                                 
The minimum is attained in round 916                                                 
The minimum is attained in round 1347                                                
The minimum is attained in round 1254                                                
The minimum is attained in round 103                                                 
The minimum is attained in round 132                                                 
The minimum is attained in round 218                                                 
The minimum is attained in round 517                                                 
The minimum is attained in round 1239                                                
The minimum is attained in round 1171                                                
The minimum is attained in round 650                  

The minimum is attained in round 900                                                 
The minimum is attained in round 105                                                 
The minimum is attained in round 193                                                 
The minimum is attained in round 273                                                 
The minimum is attained in round 391                                                 
The minimum is attained in round 265                                                 
The minimum is attained in round 987                                                 
The minimum is attained in round 935                                                 
The minimum is attained in round 1220                                                
The minimum is attained in round 1445                                                
The minimum is attained in round 940                                                 
The minimum is attained in round 156                  

In [159]:
para_new = {'bagging_fraction': 0.9460815583946198, 'boosting': 'goss', 'device_type': 'cpu', 'drop_rate': 0.19544834548867776, 'extra_trees': True, 'feature_fraction': 0.9808059037360214, 'lambda_l1': 4.582193079266174, 'lambda_l2': 0.8068402557589727, 'learning_rate': 0.0769855835499045, 'metric': 'binary_error', 'min_gain_to_split': 0.9676266908986928, 'num_leaves': 50, 'num_round': 2000, 'num_threads': 4, 'objective': 'binary', 'uniform_drop': True}

In [160]:
fitter = LGBFitter(label='loan_status')
result = fitter.train_k_fold(kfold, train_new, test_new, params=para_new, use_best_eval=True)
test_acc = accuracy_score(test_new['loan_status'], result[1].round())
print(f'accuracy_score: {test_acc}')

The minimum is attained in round 94
Finished loading model, total used 1990 iterations
The minimum is attained in round 147
Finished loading model, total used 1975 iterations
The minimum is attained in round 384
Finished loading model, total used 1973 iterations
The minimum is attained in round 1521
Finished loading model, total used 1981 iterations
The minimum is attained in round 237
Finished loading model, total used 1984 iterations
accuracy_score: 0.91822


## xgboost

In [88]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [72]:
dtrain = xgb.DMatrix(train.drop(columns='loan_status'), label=train['loan_status'])
dtest = xgb.DMatrix(test.drop(columns='loan_status'), label=test['loan_status'])

In [167]:
param = {'max_depth': 10, 'eta': 0.02, 'gamma': 1, 'objective': 'binary:logistic', 'eval_metric': ["error"],'nthread':4,}

num_round = 300
evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	eval-error:0.09322	train-error:0.06544
[1]	eval-error:0.09206	train-error:0.06484
[2]	eval-error:0.09132	train-error:0.06346
[3]	eval-error:0.09100	train-error:0.06328
[4]	eval-error:0.08976	train-error:0.06324
[5]	eval-error:0.09022	train-error:0.06284
[6]	eval-error:0.08970	train-error:0.06258
[7]	eval-error:0.08896	train-error:0.06266
[8]	eval-error:0.08918	train-error:0.06108
[9]	eval-error:0.08878	train-error:0.06154
[10]	eval-error:0.08878	train-error:0.06066
[11]	eval-error:0.08850	train-error:0.06096
[12]	eval-error:0.08842	train-error:0.06076
[13]	eval-error:0.08816	train-error:0.06062
[14]	eval-error:0.08822	train-error:0.06050
[15]	eval-error:0.08830	train-error:0.06048
[16]	eval-error:0.08822	train-error:0.06012
[17]	eval-error:0.08798	train-error:0.05988
[18]	eval-error:0.08800	train-error:0.05944
[19]	eval-error:0.08786	train-error:0.05926
[20]	eval-error:0.08748	train-error:0.05904
[21]	eval-error:0.08738	train-error:0.05896
[22]	eval-error:0.08734	train-error:0.0587

[185]	eval-error:0.08466	train-error:0.04784
[186]	eval-error:0.08470	train-error:0.04778
[187]	eval-error:0.08462	train-error:0.04778
[188]	eval-error:0.08466	train-error:0.04768
[189]	eval-error:0.08462	train-error:0.04764
[190]	eval-error:0.08458	train-error:0.04760
[191]	eval-error:0.08460	train-error:0.04758
[192]	eval-error:0.08462	train-error:0.04754
[193]	eval-error:0.08458	train-error:0.04748
[194]	eval-error:0.08460	train-error:0.04734
[195]	eval-error:0.08458	train-error:0.04728
[196]	eval-error:0.08460	train-error:0.04728
[197]	eval-error:0.08458	train-error:0.04732
[198]	eval-error:0.08466	train-error:0.04720
[199]	eval-error:0.08456	train-error:0.04714
[200]	eval-error:0.08460	train-error:0.04710
[201]	eval-error:0.08462	train-error:0.04708
[202]	eval-error:0.08460	train-error:0.04700
[203]	eval-error:0.08452	train-error:0.04702
[204]	eval-error:0.08458	train-error:0.04704
[205]	eval-error:0.08466	train-error:0.04700
[206]	eval-error:0.08470	train-error:0.04694
[207]	eval

In [166]:
ypred = bst.predict(dtest)
print('acc:', accuracy_score(test['loan_status'], ypred.round()))

acc: 0.91574


In [155]:
cpu_count = 4
use_gpu = False
@dataclass
class XGBOpt:
    nthread: any = hp.choice('nthread', [cpu_count])
    max_depth: any = hp.choice('max_depth', range(3,12,2))
    eval_metric: any = hp.choice('eval_metric', ['error'])
    num_round: any = hp.choice('num_round', [800])
    objective: any = hp.choice('objective', ['binary:logistic'])
    eta: any = hp.uniform('eta', 0.04, 0.8)
    gamma: any = hp.uniform('gamma', 0.3, 0.9)
        
    booster: any = hp.choice('booster', ['gbtree', 'dart', 'gblinear'])
    sampling_method: any = hp.choice('sampling_method', ['uniform', 'gradient_based'])
#     reg_lambda: any = hp.uniform('reg_lambda', 0, 10)
#     reg_alpha: any = hp.uniform('reg_alpha', 0, 10)
    tree_method: any = hp.choice('tree_method', ['exact', 'approx', 'hist'])

In [156]:
class FitterBase(object):
    def __init__(self, label, metric, max_eval=100, opt=None):
        self.label = label
        self.eval_metric = metric
        self.opt_params = dict()
        self.max_eval = max_eval
        self.opt = opt

    def get_loss(self, y, y_pred):
        if self.eval_metric == 'error':
            return 1 - accuracy_score(y, y_pred)
        elif self.eval_metric == 'precision':
            return 1 - precision_score(y, y_pred)
        elif self.eval_metric == 'recall':
            return 1 - recall_score(y, y_pred)
        elif self.eval_metric == 'macro_f1':
            return 1 - f1_score(y, y_pred, average='macro')
        elif self.eval_metric == 'micro_f1':
            return 1 - f1_score(y, y_pred, average='micro')
        elif self.eval_metric == 'auc':  # TODO: Add a warning checking if y_predict is all [0, 1], it should be probability
            return 1 - roc_auc_score(y, y_pred)
        else:
            raise Exception("Not implemented yet.")

class XGBFitter(FitterBase):
    def __init__(self, label='label', eval_metric='error', opt: XGBOpt = None, max_eval=100):
        super(XGBFitter, self).__init__(label, eval_metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = XGBOpt()
        self.best_round = None
        self.clf = None

    def train(self, train_df, eval_df, params=None, use_best_eval=True):
        self.best_round = None
        dtrain = xgb.DMatrix(train_df.drop(columns=[self.label]), train_df[self.label])
        deval = xgb.DMatrix(eval_df.drop(columns=[self.label]), eval_df[self.label])
        evallist = [(deval, 'eval'), (dtrain, 'train')]
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)

        num_round = use_params.pop('num_round')
        if use_best_eval:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = xgb.train(use_params, dtrain, num_round, evallist)
                output = buf.getvalue().split("\n")
            min_error = np.inf
            min_index = 0
            for idx in range(len(output) - 1):
                if len(output[idx].split("\t")) == 3:
                    temp = float(output[idx].split("\t")[2].split(":")[1])
                    if min_error > temp:
                        min_error = temp
                        min_index = int(output[idx].split("\t")[0][1:-1])
            print("The minimum is attained in round %d" % (min_index + 1))
            self.best_round = min_index + 1
            return output
        else:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, evallist)
                output = buf.getvalue().split("\n")
            self.best_round = num_round
            return output

    def search(self, train_df, eval_df, use_best_eval=True):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params, use_best_eval)
            if self.eval_metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), early_stopping_rounds=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           early_stopping_rounds=self.best_round) > 0.5).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data, use_best_eval=True):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.loc[train_id]
                eval_df = data.loc[eval_id]
                dtrain = xgb.DMatrix(train_df.drop(columns=[self.label]), train_df[self.label])
                deval = xgb.DMatrix(eval_df.drop(columns=[self.label]), eval_df[self.label])
                self.train(train_df, eval_df, params, use_best_eval)
                if self.eval_metric == 'auc':
                    y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
                else:
                    y_pred = (self.clf.predict(deval) > 0.5).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)
        self.opt_params = fmin(train_impl_nfold, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None, drop_test_y=True, use_best_eval=True):
        acc_result = list()
        train_pred = np.empty(train_data.shape[0])
        test_pred = np.empty(test_data.shape[0])
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data

        models = list()
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.loc[train_id]
            eval_df = train_data.loc[eval_id]
            self.train(train_df, eval_df, params, use_best_eval)
            models.append(copy.deepcopy(self.clf))
            train_pred[eval_id] = self.clf.predict(eval_df.drop(columns=self.label), early_stopping_rounds=self.best_round)
            if self.eval_metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), early_stopping_rounds=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           early_stopping_rounds=self.best_round) > 0.5).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            test_pred += self.clf.predict(dtest, early_stopping_rounds=self.best_round)
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result, models

In [172]:
fitter = XGBFitter(label='loan_status')
result = fitter.train_k_fold(kfold, train, test, params=param, use_best_eval=True)
test_acc = accuracy_score(test_new['loan_status'], result[1].round())
print(f'accuracy_score: {test_acc}')

KeyError: 'num_round'

## randomForest

In [205]:
class RFOpt:
    num_threads: any = hp.choice('num_threads', [cpu_count])
    num_leaves: any = hp.choice('num_leaves', [30,40,50,60,70,80,90,100,110,120,130,140])
    metric: any = hp.choice('metric', ['binary_error'])
    num_round: any = hp.choice('num_rounds', [1500])
    objective: any = hp.choice('objective', ['binary'])
    learning_rate: any = hp.uniform('learning_rate', 0.01, 0.1)
    feature_fraction: any = hp.uniform('feature_fraction', 0.5, 1.0)
    bagging_fraction: any = hp.uniform('bagging_fraction', 0.8, 1.0)
    device_type: any = hp.choice('device_tpye', ['gpu']) if use_gpu else hp.choice('device_type',
                                                                                   ['cpu'])
    boosting: any = hp.choice('boosting', ['rf'])
    extra_trees: any = hp.choice('extra_tress', [False, True])
    drop_rate: any = hp.uniform('drop_rate', 0, 0.2)
    uniform_drop: any = hp.choice('uniform_drop', [True, False])
    lambda_l1: any = hp.uniform('lambda_l1', 0, 10)  # TODO: Check range
    lambda_l2: any = hp.uniform('lambda_l2', 0, 10)  # TODO: Check range
    min_gain_to_split: any = hp.uniform('min_gain_to_split', 0, 1)  # TODO: Check range
    min_data_in_bin = hp.choice('min_data_in_bin', [3, 5, 10, 15, 20, 50])

    @staticmethod
    def get_common_params():
        return {'num_thread': 4, 'num_leaves': 12, 'metric': 'binary', 'objective': 'binary',
                'num_round': 1000, 'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}

In [208]:
class FitterBase(object):
    def __init__(self, label, metric, max_eval=100, opt=None):
        self.label = label
        self.metric = metric
        self.opt_params = dict()
        self.max_eval = max_eval
        self.opt = opt

    def get_loss(self, y, y_pred):
        if self.metric == 'error':
            return 1 - accuracy_score(y, y_pred)
        elif self.metric == 'precision':
            return 1 - precision_score(y, y_pred)
        elif self.metric == 'recall':
            return 1 - recall_score(y, y_pred)
        elif self.metric == 'macro_f1':
            return 1 - f1_score(y, y_pred, average='macro')
        elif self.metric == 'micro_f1':
            return 1 - f1_score(y, y_pred, average='micro')
        elif self.metric == 'auc':  # TODO: Add a warning checking if y_predict is all [0, 1], it should be probability
            return 1 - roc_auc_score(y, y_pred)
        else:
            raise Exception("Not implemented yet.")

class RFFitter(FitterBase):
    def __init__(self, label='label', metric='error', opt: LGBOpt = None, max_eval=100):
        super(RFFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = LGBOpt()
        self.best_round = None
        self.clf = None

    def train(self, train_df, eval_df, params=None, use_best_eval=True):
        self.best_round = None
        dtrain = lgb.Dataset(train_df.drop(columns=[self.label]), train_df[self.label])
        deval = lgb.Dataset(eval_df.drop(columns=[self.label]), eval_df[self.label])
        evallist = [dtrain, deval]
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)

        num_round = use_params.pop('num_round')
        if use_best_eval:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            min_error = np.inf
            min_index = 0
            for idx in range(len(output) - 1):
                if len(output[idx].split("\t")) == 3:
                    temp = float(output[idx].split("\t")[2].split(":")[1])
                    if min_error > temp:
                        min_error = temp
                        min_index = int(output[idx].split("\t")[0][1:-1])
            print("The minimum is attained in round %d" % (min_index + 1))
            self.best_round = min_index + 1
            return output
        else:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            self.best_round = num_round
            return output

    def search(self, train_df, eval_df, use_best_eval=True):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params, use_best_eval)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           num_iteration=self.best_round) > 0.5).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data, use_best_eval=True):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.loc[train_id]
                eval_df = data.loc[eval_id]
                self.train(train_df, eval_df, params, use_best_eval)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
                else:
                    y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                               num_iteration=self.best_round) > 0.5).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None, drop_test_y=True, use_best_eval=True):
        acc_result = list()
        train_pred = np.empty(train_data.shape[0])
        test_pred = np.empty(test_data.shape[0])
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data

        models = list()
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.loc[train_id]
            eval_df = train_data.loc[eval_id]
            self.train(train_df, eval_df, params, use_best_eval)
            models.append(copy.deepcopy(self.clf))
            train_pred[eval_id] = self.clf.predict(eval_df.drop(columns=self.label), num_iteration=self.best_round)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           num_iteration=self.best_round) > 0.5).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            test_pred += self.clf.predict(dtest, num_iteration=self.best_round)
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result, models

In [None]:
fitter_new ==RFFitter(label='loan_status')

fitter_new.search_k_fold(kfold, train_new)
para_new = space_eval(asdict(fitter_new.opt), fitter_new.opt_params)
print(para_new)

The minimum is attained in round 118                   
The minimum is attained in round 68                    
The minimum is attained in round 241                   
The minimum is attained in round 965                   
The minimum is attained in round 1017                  
The minimum is attained in round 83                                   
The minimum is attained in round 101                                  
The minimum is attained in round 122                                  
The minimum is attained in round 248                                  
The minimum is attained in round 94                                   
The minimum is attained in round 1484                                   
The minimum is attained in round 544                                    
The minimum is attained in round 974                                    
The minimum is attained in round 1319                                   
The minimum is attained in round 750                                    


The minimum is attained in round 389                                   
The minimum is attained in round 400                                   
The minimum is attained in round 924                                   
The minimum is attained in round 124                                   
The minimum is attained in round 255                                   
The minimum is attained in round 1059                                  
The minimum is attained in round 1084                                  
The minimum is attained in round 988                                   
The minimum is attained in round 136                                   
The minimum is attained in round 172                                   
The minimum is attained in round 321                                   
The minimum is attained in round 626                                   
The minimum is attained in round 185                                   
The minimum is attained in round 317                            

The minimum is attained in round 281                                        
The minimum is attained in round 276                                        
The minimum is attained in round 77                                         
The minimum is attained in round 60                                        
The minimum is attained in round 23                                        
The minimum is attained in round 171                                       
The minimum is attained in round 170                                       
The minimum is attained in round 293                                       
The minimum is attained in round 318                                       
The minimum is attained in round 495                                       
The minimum is attained in round 301                                       
The minimum is attained in round 282                                       
The minimum is attained in round 124                                       
The minim

## stacking

In [None]:
para_lgbm = {'bagging_fraction': 0.9460815583946198, 'boosting': 'goss', 'device_type': 'cpu', 'drop_rate': 0.19544834548867776, 'extra_trees': True, 'feature_fraction': 0.9808059037360214, 'lambda_l1': 4.582193079266174, 'lambda_l2': 0.8068402557589727, 'learning_rate': 0.0769855835499045, 'metric': 'binary_error', 'min_gain_to_split': 0.9676266908986928, 'num_leaves': 50, 'num_round': 2000, 'num_threads': 4, 'objective': 'binary', 'uniform_drop': True}
para_xgb = {'max_depth': 10, 'eta': 0.02, 'gamma': 1, 'objective': 'binary:logistic', 'eval_metric': ["error"],'nthread':4,}
para_rf = {}

num_round_list = [1000, 500, 1000]

In [None]:
def stacking_train_k_fold(k_fold, train_data, test_data, drop_test_y=True, use_best_eval=True):
        acc_result = list()
        train_pred = np.empty(train_data.shape[0])
        test_pred = np.empty(test_data.shape[0])
        if drop_test_y:
            dtest = test_data.drop(columns=['loan_status'])
        else:
            dtest = test_data

        models = list()
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.loc[train_id]
            eval_df = train_data.loc[eval_id]
            
            
            
            model_lgbm = lgb.train(para_lgbm, train_dataset, num_boost_round=num_round_list[0], valid_sets=[train_df, eval_df])
            model_xgb = xgb.train(param, train_df, num_round_list[1], (eval_df, 'eval'), (train_df, 'train'))
            model_rf = lgb.train(para_rf, train_dataset, num_boost_round=num_round_list[2], valid_sets=[train_df, eval_df])
            
            y_pred_lgbm = model_lgbm.predict(eval_df.drop(columns=['loan_status']))
            y_pred_xgb = model_xgb.predict(eval_df.drop(columns=['loan_status']))
            y_pred_rf = model_rf.predict(eval_df.drop(columns=['loan_status']))
            
            y_pred = int((y_pred_lgbm + y_pred_xgb + y_pred_rf) / 3 > 0.5)
            
            acc_result.append(get_loss(eval_df['loan_status'], y_pred))
            
            test_pred += (model_lgbm.predict(dtest) + model_xgb.predict(dtest) + model_rf(dtest))/3
        test_pred /= k_fold.n_splits
        return test_pred

In [None]:
output = stacking_train_k_fold(kfold, train, test, use_best_eval=True)
test_acc = accuracy_score(test['loan_status'], output.round())
print(f'accuracy_score: {test_acc}')