In [1]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
wd = '/Users/ewenwang/Documents/practice_data'
os.chdir(wd)

file = 'loan_stats.csv'

In [4]:
import xgboost as xgb



In [7]:
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [5]:
def prepare_data():
    train = pd.read_csv(file)
    categorical_columns = train.select_dtypes(include=['object']).columns

    for column in tqdm(categorical_columns):
        le = LabelEncoder()
        train[column] = le.fit_transform(train[column])

    y = train['loan_status']

    X = train.drop(['loan_status'], 1)
    xgtrain = xgb.DMatrix(X, label=y)

    return xgtrain

In [8]:
xgtrain = prepare_data()

0it [00:00, ?it/s]


In [11]:
num_rounds = 3000
random_state = 2018
num_iter = 25
init_points = 5
params = {
    'eta': 0.01,
    'silent': 1,
    'eval_metric': 'mae',
    'verbose_eval': True,
    'seed': random_state
}

In [10]:
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):

    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)


    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
             seed=random_state,
             callbacks=[xgb.callback.early_stop(50)])

    return -cv_result['test-mae-mean'].values[-1]

In [12]:
from sklearn.metrics import mean_absolute_error
from bayes_opt import BayesianOptimization

In [14]:
num_rounds = 3000
random_state = 2018
num_iter = 25
init_points = 5
params = {
    'eta': 0.01,
    'silent': 1,
    'eval_metric': 'auc',
    'verbose_eval': True,
    'seed': random_state
}

In [23]:
def xgb_evaluate(max_depth,
                 colsample_bytree,
                 subsample):
    
    params['max_depth'] = int(max_depth)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)   
    params['subsample'] = max(min(subsample, 1), 0)

    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
             seed=random_state,
             callbacks=[xgb.callback.early_stop(20)])

    return cv_result['test-auc-mean'].values[-1]

In [24]:
xgbBO = BayesianOptimization(xgb_evaluate, {'max_depth': (1, 15),
                                            'colsample_bytree': (0.1, 1),
                                            'subsample': (0.5, 1)})

xgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   subsample | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
Stopping. Best iteration:
[30]	train-auc:0.744275+0.00456637	test-auc:0.691921+0.00783145

    1 | 01m51s | [35m   0.69192[0m | [32m            0.1729[0m | [32m     9.2601[0m | [32m     0.7253[0m | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
Stopping. Best iteration:
[1148]	train-auc:0.732238+0.000501479	test-auc:0.700981+0.00636184

    2 | 08m58s | [35m   0.70098[0m | [32m            0.8014[0m | [32m     2.4474[0m | [32m     0.9944[0m | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-au

Stopping. Best iteration:
[96]	train-auc:0.838303+0.00662455	test-auc:0.692696+0.00714847

   25 | 05m21s |    0.69270 |             1.0000 |     11.3893 |      0.5000 | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
Stopping. Best iteration:
[439]	train-auc:0.713603+0.000725138	test-auc:0.696322+0.00697872

   26 | 04m28s |    0.69632 |             0.1000 |      2.0228 |      0.5000 | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
Stopping. Best iteration:
[71]	train-auc:0.858022+0.00676897	test-auc:0.690697+0.00788605

   27 | 05m13s |    0.69070 |             0.1000 |     13.7561 |      0.5000 | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
Stopping. Best iteration:
[865]	train-auc:0.759442+0.000966364