In [1]:
import os
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from bayes_opt import BayesianOptimization
import warnings
warnings.filterwarnings('ignore')

In [2]:
wd = '/Users/ewenwang/Documents/practice_data'
os.chdir(wd)

file = 'credit_score.csv'
dataset = pd.read_csv(file)

In [3]:
dataset.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [18]:
from sklearn.model_selection import train_test_split
dtrain, dtest = train_test_split(dataset, test_size=0.33, random_state=2018)

target = 'SeriousDlqin2yrs'
predictors = [x for x in dataset.columns if x not in [target]]

y = dtrain[target]
X = dtrain[predictors]

Dtrain = lgb.Dataset(X, label = y)

In [68]:
def bayesOpt(train, target, predictors, results=True, seed=2018):
    """ Hyper parameter tuning with Bayesian optimization.

    Employes the Bayesian optimization to find the optimal hyper parameters and 
    return an optimized classifier.

    Args:
        train: A training set of your machine learning project.
        target: The target variablet; limited to binary.
        predictors: The predictors.
        results: Whether print the progress out; default with True.
        seed: The random state.
        
    Returns:
        None.
    """
    print('optimizing...')
    
    y = dtrain[target]
    X = dtrain[predictors]

    Dtrain = lgb.Dataset(X, label = y)

    def lgb_evaluate(max_depth, colsample_bytree, subsample):
    
        params = {
            'eta': 0.01,
            'silent': 1,
            'num_boost_round':3000,
            'early_stopping_round':20,
            'n_fold':5,
            'verbose_eval': True,
            'seed': seed
        }

        params['max_depth'] = int(max_depth)
        params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)   
        params['subsample'] = max(min(subsample, 1), 0)

        cv_result = lgb.cv(params, Dtrain, metrics='auc')
        return cv_result['auc-mean'][-1]

    lgbBO = BayesianOptimization(lgb_evaluate, {'max_depth': (1, 20),
                                                'colsample_bytree': (0.1, 1),
                                                'subsample': (0.1, 1)})
    lgbBO.maximize(init_points=5, n_iter=25)

    if results:
        print('\nbest score:', '{:.6f}'.format(lgbBO.res['max']['max_val']),
              '\nbest parameters:', str({key: '{:.2f}'.format(value) for key, value in lgbBO.res['max']['max_params'].items()}))

    return None

In [7]:
import lightgbm as lgb

In [53]:
def lgb_evaluate(max_depth, colsample_bytree, subsample):
    
    params = {
        'eta': 0.01,
        'silent': 1,
        'num_boost_round':3000,
        'early_stopping_round':20,
        'nfold':5,
        'seed': 2018
    }
    
    params['max_depth'] = int(max_depth)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)   
    params['subsample'] = max(min(subsample, 1), 0)
    
    cv_result = lgb.cv(params, Dtrain, metrics='auc')
    return cv_result['auc-mean'][-1]

In [54]:
lgbBO = BayesianOptimization(lgb_evaluate, {'max_depth': (1, 20),
                                            'colsample_bytree': (0.1, 1),
                                            'subsample': (0.1, 1)})
lgbBO.maximize(init_points=5, n_iter=25)

[31mInitialization[0m
[94m------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   subsample | 
    1 | 00m03s | [35m   0.86461[0m | [32m            0.3215[0m | [32m    19.6647[0m | [32m     0.1316[0m | 
    2 | 00m05s | [35m   0.86489[0m | [32m            0.1849[0m | [32m     2.1540[0m | [32m     0.5680[0m | 
    3 | 00m05s |    0.86489 |             0.8014 |      2.3137 |      0.7659 | 
    4 | 00m02s | [35m   0.86555[0m | [32m            0.1448[0m | [32m     3.5075[0m | [32m     0.5262[0m | 
    5 | 00m03s |    0.86462 |             0.1192 |     16.0070 |      0.2090 | 
[31mBayesian Optimization[0m
[94m------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   subsample | 
    6 | 00m13s |    0.86461 |             0.1102 |     19.9747 |      0.9924 | 
    7 | 00m13s |    0

In [56]:
lgbBO.res['max']

{'max_params': {'colsample_bytree': 0.14478551565784675,
  'max_depth': 3.5075184925185616,
  'subsample': 0.52621530408354555},
 'max_val': 0.86554964969257586}

In [64]:
print('\nbest score:', '{:.6f}'.format(lgbBO.res['max']['max_val']),
      '\nbest parameters:', str({key: '{:.2f}'.format(value) for key, value in lgbBO.res['max']['max_params'].items()}))


best score: 0.865550 
best parameters: {'max_depth': '3.51', 'colsample_bytree': '0.14', 'subsample': '0.53'}


In [63]:
lgbBO.res['max']['max_params']

{'colsample_bytree': 0.14478551565784675,
 'max_depth': 3.5075184925185616,
 'subsample': 0.52621530408354555}

In [69]:
bayesOpt(dtrain, target=target, predictors=predictors)

optimizing...
[31mInitialization[0m
[94m------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   subsample | 
    1 | 00m02s | [35m   0.86555[0m | [32m            0.5061[0m | [32m     3.7085[0m | [32m     0.2833[0m | 
    2 | 00m03s |    0.86504 |             0.4472 |      9.8492 |      0.3578 | 
    3 | 00m03s |    0.86459 |             0.9775 |     17.5730 |      0.7476 | 
    4 | 00m07s |    0.86246 |             0.7680 |      1.2237 |      0.4353 | 
    5 | 00m05s |    0.86489 |             0.1100 |      2.4297 |      0.7048 | 
[31mBayesian Optimization[0m
[94m------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   subsample | 
    6 | 00m14s |    0.86461 |             0.1000 |     20.0000 |      0.1000 | 
    7 | 00m13s |    0.86468 |             0.9940 |      5.8144 |      0.9987 | 

In [60]:
import gossipcat as gc

In [61]:
SA_clf = gc.simAnneal(dtrain, target=target, predictors=predictors)

simulating...

INFO: Number of possible iterations given cooling schedule: 160



[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.6min finished


2 T: 10.0, score: 0.8629, std: 0.0064, params: {'max_depth': '4.00', 'subsample': '0.30', 'colsample_bytree': '0.30'}


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.8min finished


3 T: 10.0, score: 0.8552, std: 0.0062, params: {'max_depth': '10.00', 'subsample': '0.30', 'colsample_bytree': '0.30'}


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.0min finished


4 T: 10.0, score: 0.8537, std: 0.0076, params: {'max_depth': '10.00', 'subsample': '0.30', 'colsample_bytree': '0.50'}


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.7min finished


5 T: 10.0, score: 0.8568, std: 0.0068, params: {'max_depth': '5.00', 'subsample': '0.30', 'colsample_bytree': '0.50'}


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.0min finished


6 T: 10.0, score: 0.8530, std: 0.0072, params: {'max_depth': '8.00', 'subsample': '0.30', 'colsample_bytree': '0.50'}

best score: 0.863770 
best parameters: {'max_depth': '1.00', 'subsample': '0.30', 'colsample_bytree': '0.30'}
