### Bayesian Optimization을 이용하여 application과 previous로 만들어진 집합의 하이퍼 파라미터 튜닝

#### 라이브러리 및 데이터 세트 로딩. 이전 application 데이터의 FE 함수 복사

In [1]:
import numpy as np # 1.18.5
import pandas as pd # 0.25.1
import gc
import time
import matplotlib.pyplot as plt # 3.2.2
import seaborn as sns # 0.10.1
import os
%matplotlib inline

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

In [2]:
def get_dataset():
    app_train = pd.read_csv('application_train.csv')
    app_test = pd.read_csv('application_test.csv')
    apps = pd.concat([app_train, app_test])
    prev = pd.read_csv('previous_application.csv')

    return apps, prev

apps, prev = get_dataset()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  apps = pd.concat([app_train, app_test])


#### 이전 application 데이터의 feature engineering 함수 복사

In [3]:
def get_apps_processed(apps):
    
    # EXT_SOURCE_X FEATURE 가공
    apps['APPS_EXT_SOURCE_MEAN'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    apps['APPS_EXT_SOURCE_STD'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    apps['APPS_EXT_SOURCE_STD'] = apps['APPS_EXT_SOURCE_STD'].fillna(apps['APPS_EXT_SOURCE_STD'].mean())
    
    # AMT_CREDIT 비율로 Feature 가공
    apps['APPS_ANNUITY_CREDIT_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_CREDIT']
    apps['APPS_GOODS_CREDIT_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_CREDIT']
    
    # AMT_INCOME_TOTAL 비율로 Feature 가공
    apps['APPS_ANNUITY_INCOME_RATIO'] = apps['AMT_ANNUITY']/apps['AMT_INCOME_TOTAL']
    apps['APPS_CREDIT_INCOME_RATIO'] = apps['AMT_CREDIT']/apps['AMT_INCOME_TOTAL']
    apps['APPS_GOODS_INCOME_RATIO'] = apps['AMT_GOODS_PRICE']/apps['AMT_INCOME_TOTAL']
    apps['APPS_CNT_FAM_INCOME_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['CNT_FAM_MEMBERS']
    
    # DAYS_BIRTH, DAYS_EMPLOYED 비율로 Feature 가공
    apps['APPS_EMPLOYED_BIRTH_RATIO'] = apps['DAYS_EMPLOYED']/apps['DAYS_BIRTH']
    apps['APPS_INCOME_EMPLOYED_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_EMPLOYED']
    apps['APPS_INCOME_BIRTH_RATIO'] = apps['AMT_INCOME_TOTAL']/apps['DAYS_BIRTH']
    apps['APPS_CAR_BIRTH_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_BIRTH']
    apps['APPS_CAR_EMPLOYED_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_EMPLOYED']
    
    return apps

#### previous 데이터 가공후 인코딩 및 최종 데이터 집합 생성하는 함수 선언

In [4]:
from sklearn.model_selection import train_test_split # 0.23.1
from lightgbm import LGBMClassifier # 3.1.1

def get_prev_processed(prev):
    # 대출 신청 금액과 실제 대출액/대출 상품금액 차이 및 비율
    prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
    prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
    prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT']/prev['AMT_APPLICATION']
    # prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY']/prev['AMT_APPLICATION']
    prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE']/prev['AMT_APPLICATION']
    
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # 첫번째 만기일과 마지막 만기일까지의 기간
    prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
    # 매월 납부 금액과 납부 횟수 곱해서 전체 납부 금액 구함. 
    all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
    # 전체 납부 금액 대비 AMT_CREDIT 비율을 구하고 여기에 다시 납부횟수로 나누어서 이자율 계산. 
    prev['PREV_INTERESTS_RATE'] = (all_pay/prev['AMT_CREDIT'] - 1)/prev['CNT_PAYMENT']
        
    return prev
    
    
def get_prev_amt_agg(prev):
    # 새롭게 생성된 대출 신청액 대비 다른 금액 차이 및 비율로 aggregation 수행. 
    agg_dict = {
         # 기존 컬럼. 
        'SK_ID_CURR':['count'],
        'AMT_CREDIT':['mean', 'max', 'sum'],
        'AMT_ANNUITY':['mean', 'max', 'sum'], 
        'AMT_APPLICATION':['mean', 'max', 'sum'],
        'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
        'AMT_GOODS_PRICE':['mean', 'max', 'sum'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
        # 가공 컬럼
        'PREV_CREDIT_DIFF':['mean', 'max', 'sum'], 
        'PREV_CREDIT_APPL_RATIO':['mean', 'max'],
        'PREV_GOODS_DIFF':['mean', 'max', 'sum'],
        'PREV_GOODS_APPL_RATIO':['mean', 'max'],
        'PREV_DAYS_LAST_DUE_DIFF':['mean', 'max', 'sum'],
        'PREV_INTERESTS_RATE':['mean', 'max']
    }

    prev_group = prev.groupby('SK_ID_CURR')
    prev_amt_agg = prev_group.agg(agg_dict)

    # multi index 컬럼을 '_'로 연결하여 컬럼명 변경
    prev_amt_agg.columns = ["PREV_"+ "_".join(x).upper() for x in prev_amt_agg.columns.ravel()]
    
    return prev_amt_agg

def get_prev_refused_appr_agg(prev):
    # 원래 groupby 컬럼 + 세부 기준 컬럼으로 groupby 수행. 세분화된 레벨로 aggregation 수행 한 뒤에 unstack()으로 컬럼레벨로 변형. 
    prev_refused_appr_group = prev[prev['NAME_CONTRACT_STATUS'].isin(['Approved', 'Refused'])].groupby([ 'SK_ID_CURR', 'NAME_CONTRACT_STATUS'])
    prev_refused_appr_agg = prev_refused_appr_group['SK_ID_CURR'].count().unstack()
    # 컬럼명 변경. 
    prev_refused_appr_agg.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT' ]
    # NaN값은 모두 0으로 변경. 
    prev_refused_appr_agg = prev_refused_appr_agg.fillna(0)
    
    return prev_refused_appr_agg

    

def get_prev_agg(prev):
    prev = get_prev_processed(prev)
    prev_amt_agg = get_prev_amt_agg(prev)
    prev_refused_appr_agg = get_prev_refused_appr_agg(prev)
    
    # prev_amt_agg와 조인. 
    prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on='SK_ID_CURR', how='left')
    # SK_ID_CURR별 과거 대출건수 대비 APPROVED_COUNT 및 REFUSED_COUNT 비율 생성. 
    prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
    prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT']/prev_agg['PREV_SK_ID_CURR_COUNT']
    # 'PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT' 컬럼 drop 
    prev_agg = prev_agg.drop(['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT'], axis=1)
    
    return prev_agg

def get_apps_all_with_prev_agg(apps, prev):
    apps_all =  get_apps_processed(apps)
    prev_agg = get_prev_agg(prev)
    print('prev_agg shape:', prev_agg.shape)
    print('apps_all before merge shape:', apps_all.shape)
    apps_all = apps_all.merge(prev_agg, on='SK_ID_CURR', how='left')
    print('apps_all after merge with prev_agg shape:', apps_all.shape)
    
    return apps_all

def get_apps_all_encoded(apps_all):
    object_columns = apps_all.dtypes[apps_all.dtypes == 'object'].index.tolist()
    for column in object_columns:
        apps_all[column] = pd.factorize(apps_all[column])[0]
    
    return apps_all

def get_apps_all_train_test(apps_all):
    apps_all_train = apps_all[~apps_all['TARGET'].isnull()]
    apps_all_test = apps_all[apps_all['TARGET'].isnull()]

    apps_all_test = apps_all_test.drop('TARGET', axis=1)
    
    return apps_all_train, apps_all_test
    
def train_apps_all(apps_all_train):
    ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    target_app = apps_all_train['TARGET']

    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
    print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)
    clf = LGBMClassifier(
                nthread=4,
                n_estimators=2000,
                learning_rate=0.01,
                num_leaves=32,
                colsample_bytree=0.8,
                subsample=0.8,
                max_depth=8,
                reg_alpha=0.04,
                reg_lambda=0.07,
                min_child_weight=40,
                silent=-1,
                verbose=-1,
                )

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 100)
    
    return clf

##### 최종 집합 생성 및 인코딩, 학습/테스트 데이터 분리, 학습/검증 피처와 타겟 데이터 분리

In [5]:
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_all_train['TARGET']
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)


prev_agg shape: (338857, 41)
apps_all before merge shape: (356255, 135)
apps_all after merge with prev_agg shape: (356255, 176)


#### Bayesian Optimization 

In [None]:
# bayesian optimization 패키지 설치
!pip install bayesian-optimization

In [6]:
from bayes_opt import BayesianOptimization # 1.2.0
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

##### 함수의 입력값 search 범위(하이퍼 파라미터 별 입력 범위) 를 설정

In [8]:
bayesian_params = {
    'max_depth': (6, 16),
    'num_leaves': (24, 64),
    'min_child_samples': (10, 200),
    'min_child_weight': (1, 50),
    'subsample': (0.5, 1), # 0.5 이하로 가도 크게 좋아지는게 없음
    'colsample_bytree': (0.5, 1),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50)
}

##### 최대 값을 구할 함수 선언. 
* iteration 시 마다 hyperparameter를 입력받아 classifier 학습하고 roc_auc_score값을 반환 

In [10]:
  def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample,
                  colsample_bytree, max_bin, reg_lambda, reg_alpha):
        
        params = {
            'n_estimators': 500, # 1000이면 시간이 너무 오래 걸린다.
            'learning_rate': 0.02,
            'max_depth': int(round(max_depth)), # max_depth와 num_leaves는 실수형으로 받아지기에 정수형으로 바꿔줘야한다.
            'num_leaves': int(round(num_leaves)), 
            'min_child_samples': int(round(min_child_samples)), # min_child_samples와 min_child_weight는 실수형으로 받아지기에 정수형으로 바꿔줘야한다.
            'min_child_weight': int(round(min_child_weight)), 
            'subsample': max(min(subsample, 1), 0), # 위에서 필터링이 되긴했지만 다시 한번 제약해주는것이 좋다. 
            'colsample_bytree': max(min(colsample_bytree, 1), 0),
            'max_bin':  max(int(round(max_bin)),10),
            'reg_lambda': max(reg_lambda,0),
            'reg_alpha': max(reg_alpha, 0)
        }
        
        # print(params)
        
        lgb_model = LGBMClassifier(**params)
        lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
                      early_stopping_rounds= 100)
        valid_proba = lgb_model.predict_proba(valid_x)[:, 1]
        roc_auc = roc_auc_score(valid_y, valid_proba)
        
        return roc_auc

##### BayesianOptimization 객체 생성 후 함수 반환값이 최대가 되는 입력값 search를 위한 iteration 수행

In [11]:
# BayesianOptimization객체를 수행할 함수와 search할 parameter 범위를 설정하여 생성. 
lgbBO = BayesianOptimization(f=lgb_roc_eval, pbounds=bayesian_params, random_state=0)
# 함수 반환값이 최대가 되는 입력값 유추를 위한 iteration 수행. (24m 3s)
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.769375	training's binary_logloss: 0.246057	valid_1's auc: 0.755179	valid_1's binary_logloss: 0.248986
[200]	training's auc: 0.787681	training's binary_logloss: 0.238342	valid_1's auc: 0.766691	valid_1's binary_logloss: 0.244129
[300]	training's auc: 0.799234	training's binary_logloss: 0.233876	valid_1's auc: 0.77183	valid_1's binary_logloss: 0.242237
[400]	training's auc: 0.808068	training's binary_logloss: 0.230569	valid_1's auc: 0.774252	valid_1's binary_logloss: 0.241339
[500]	training's auc: 0.816074	training's binary_logloss: 0.227608	valid_1's auc: 0.775833	valid_1's binary_logloss: 0.240803
Did not meet early stopping. Best iterati

[300]	training's auc: 0.810285	training's binary_logloss: 0.230168	valid_1's auc: 0.77343	valid_1's binary_logloss: 0.241651
[400]	training's auc: 0.821917	training's binary_logloss: 0.225749	valid_1's auc: 0.775774	valid_1's binary_logloss: 0.240811
[500]	training's auc: 0.832404	training's binary_logloss: 0.221754	valid_1's auc: 0.777248	valid_1's binary_logloss: 0.240267
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.832404	training's binary_logloss: 0.221754	valid_1's auc: 0.777248	valid_1's binary_logloss: 0.240267
| [0m 8       [0m | [0m 0.7772  [0m | [0m 0.5334  [0m | [0m 449.0   [0m | [0m 12.21   [0m | [0m 174.8   [0m | [0m 29.78   [0m | [0m 62.2    [0m | [0m 9.646   [0m | [0m 8.556   [0m | [0m 0.716   [0m |
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.778361	training's binary_logloss: 0.244066	valid_1's auc: 0.759265	valid_1's binary_logloss: 0.24797
[200]	training's auc: 0.796634	training'

| [95m 15      [0m | [95m 0.7774  [0m | [95m 0.7871  [0m | [95m 405.1   [0m | [95m 10.03   [0m | [95m 165.2   [0m | [95m 21.33   [0m | [95m 61.87   [0m | [95m 2.02    [0m | [95m 2.977   [0m | [95m 0.9291  [0m |
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782211	training's binary_logloss: 0.24224	valid_1's auc: 0.759948	valid_1's binary_logloss: 0.247453
[200]	training's auc: 0.805055	training's binary_logloss: 0.232465	valid_1's auc: 0.770092	valid_1's binary_logloss: 0.242782
[300]	training's auc: 0.822634	training's binary_logloss: 0.225782	valid_1's auc: 0.774303	valid_1's binary_logloss: 0.241222
[400]	training's auc: 0.837137	training's binary_logloss: 0.220207	valid_1's auc: 0.776537	valid_1's binary_logloss: 0.240432
[500]	training's auc: 0.849431	training's binary_logloss: 0.215529	valid_1's auc: 0.777267	valid_1's binary_logloss: 0.240164
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.8494

[400]	training's auc: 0.828929	training's binary_logloss: 0.222738	valid_1's auc: 0.775752	valid_1's binary_logloss: 0.24056
[500]	training's auc: 0.840077	training's binary_logloss: 0.218474	valid_1's auc: 0.776472	valid_1's binary_logloss: 0.240302
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.840077	training's binary_logloss: 0.218474	valid_1's auc: 0.776472	valid_1's binary_logloss: 0.240302
| [0m 23      [0m | [0m 0.7765  [0m | [0m 0.9556  [0m | [0m 395.9   [0m | [0m 12.36   [0m | [0m 166.4   [0m | [0m 43.55   [0m | [0m 57.36   [0m | [0m 2.284   [0m | [0m 3.768   [0m | [0m 0.9566  [0m |
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782723	training's binary_logloss: 0.242071	valid_1's auc: 0.760665	valid_1's binary_logloss: 0.247344
[200]	training's auc: 0.805964	training's binary_logloss: 0.232194	valid_1's auc: 0.770315	valid_1's binary_logloss: 0.242684
[300]	training's auc: 0.823837	training

##### Iteration 수행 결과 출력

In [12]:
# BayesianOptimization객체의 res는 iteration 수행 시마다 모든 함수 반환결과와 그때의 파라미터 결과값을 가지고 있음. 
lgbBO.res

[{'target': 0.7758329290960616,
  'params': {'colsample_bytree': 0.7744067519636624,
   'max_bin': 360.44278952248555,
   'max_depth': 12.027633760716439,
   'min_child_samples': 113.52780476941041,
   'min_child_weight': 21.75908516760633,
   'num_leaves': 49.835764522666246,
   'reg_alpha': 21.884984691022,
   'reg_lambda': 8.917838234820016,
   'subsample': 0.9818313802505146}},
 {'target': 0.7756769340648957,
  'params': {'colsample_bytree': 0.6917207594128889,
   'max_bin': 397.94526866050563,
   'max_depth': 11.288949197529044,
   'min_child_samples': 117.92846660784714,
   'min_child_weight': 46.35423527634039,
   'num_leaves': 26.841442327915477,
   'reg_alpha': 4.36559369208002,
   'reg_lambda': 0.20316375600581688,
   'subsample': 0.916309922773969}},
 {'target': 0.776976163031267,
  'params': {'colsample_bytree': 0.8890783754749252,
   'max_bin': 436.30595264094137,
   'max_depth': 15.78618342232764,
   'min_child_samples': 161.8401272011775,
   'min_child_weight': 23.612488

##### Iteration 결과 Dictionary에서 최대 target값을 가지는 index 추출하고 그때의 parameter 값을 추출.  

In [13]:
# dictionary에 있는 target값을 모두 추출
target_list = []
for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmax(np.array(target_list)))

[0.7758329290960616, 0.7756769340648957, 0.776976163031267, 0.7749205409939958, 0.7741227529929269, 0.7773664953021925, 0.7739399036759036, 0.7772482274769529, 0.7759758734462505, 0.7762406739235685, 0.7764952479787675, 0.7771535306175148, 0.7769328889682356, 0.7744011918766316, 0.7774422118078091, 0.7772670569283046, 0.777961302008304, 0.7771975861964955, 0.7767660924971176, 0.7756663645382131, 0.777062749280059, 0.7768597060356665, 0.7764720217059765, 0.7774493312274952, 0.7768854976504461, 0.7724379618621265, 0.7719190876904308, 0.7734656076822688, 0.7753891042822209, 0.7777188541578983]
maximum target index: 16


In [14]:
# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

{'target': 0.777961302008304, 'params': {'colsample_bytree': 0.5824262661493007, 'max_bin': 371.1746026096842, 'max_depth': 15.055455354136251, 'min_child_samples': 168.3108071725216, 'min_child_weight': 23.886438476304335, 'num_leaves': 63.790243708508754, 'reg_alpha': 3.6327436106737037, 'reg_lambda': 0.3227467203381007, 'subsample': 0.6383581316575138}}


##### 최적화된 하이퍼 파라미터를 기반으로 재 테스트 

In [15]:
def train_apps_all(apps_all_train):
    ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    target_app = apps_all_train['TARGET']

    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
    print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)
    clf = LGBMClassifier(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                max_depth = 15,
                num_leaves=64,
                colsample_bytree=0.582,
                subsample=0.638,
                max_bin=371,
                reg_alpha=3.633,
                reg_lambda=0.323,
                min_child_weight=24,
                min_child_samples=168,
                silent=-1,
                verbose=-1,
                )

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 100)
    
    return clf

In [16]:
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
clf = train_apps_all(apps_all_train)

prev_agg shape: (338857, 41)
apps_all before merge shape: (356255, 135)
apps_all after merge with prev_agg shape: (356255, 176)
train shape: (215257, 174) valid shape: (92254, 174)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.783906	training's binary_logloss: 0.242191	valid_1's auc: 0.761466	valid_1's binary_logloss: 0.24739
[200]	training's auc: 0.804487	training's binary_logloss: 0.23254	valid_1's auc: 0.770483	valid_1's binary_logloss: 0.242633
[300]	training's auc: 0.821403	training's binary_logloss: 0.225993	valid_1's auc: 0.774525	valid_1's binary_logloss: 0.241079
[400]	training's auc: 0.836004	training's binary_logloss: 0.220459	valid_1's auc: 0.776612	valid_1's binary_logloss: 0.240321
[500]	training's auc: 0.848499	training's binary_logloss: 0.215588	valid_1's auc: 0.778139	valid_1's binary_logloss: 0.239818
[600]	training's auc: 0.859436	training's binary_logloss: 0.211219	valid_1's auc: 0.77868	valid_1's binary_logloss: 0.23963
[700]

In [17]:
preds = clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis=1))[:, 1 ]
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('prev_baseline_tuning_01.csv', index=False)

| 차수 | Private Score | Public Score | 설명 |
|:---------:|:----------:|:----------:|:----------:|
| 1차 | *0.74088* | *0.74448* | application 데이터 세트 기본 preprocessing |
| 2차 | *0.75458* | *0.75882* | application 데이터 세트 Feature Engineering |
| 3차 | *0.76396* | *0.77579* | previous 데이터 세트 Feature Engineering |
| 4차 | *0.76420* | *0.77583* | previous 데이터 세트 Feature Engineering 함수화 |
| 5차 | *0.76710* | *0.77630* | Bayesian Optimization을 이용한 하이퍼 파라미터 튜닝 |

Public Score는 test 셋의 20%, Private Score는 test 셋의 80%로 설정, 그래서 보다 정확하게 튜닝을 하려면 train 셋을 cross validation으로 하는 것이 맞다. 하지만 이것이 점수를 무조건 올릴 수 있다라고 할 순 없다.(오히려 떨어질 수 있음)

##### cross validation 으로 hyper parameter 재 tuning
강의에서 6시간이 걸린다고 해서 일단은 돌리지 않고 코드만 가져옴.  
강의에서 돌렸을 때 Private Score 는 *0.76355*, Public Score 는 *0.77499* 가 나와서 오히려 떨어짐.

In [None]:
bayesian_params = {
    'max_depth': (6, 16), 
    'num_leaves': (24, 64), 
    'min_data_in_leaf': (10, 200), # min_child_samples
    'min_child_weight':(1, 50),
    'bagging_fraction':(0.5, 1.0), # subsample
    'feature_fraction': (0.5, 1.0), # colsample_bytree
    'max_bin':(10, 500),
    'lambda_l2':(0.001, 10), # reg_lambda
    'lambda_l1': (0.01, 50) # reg_alpha
}

In [None]:
import lightgbm as lgb

train_data = lgb.Dataset(data=ftr_app, label=target_app, free_raw_data=False)
def lgb_roc_eval_cv(max_depth, num_leaves, min_data_in_leaf, min_child_weight, bagging_fraction, 
                 feature_fraction,  max_bin, lambda_l2, lambda_l1):   
    params = {
        "num_iterations":500, "learning_rate":0.02,
        'early_stopping_rounds':100, 'metric':'auc',
        'max_depth': int(round(max_depth)), #  호출 시 실수형 값이 들어오므로 실수형 하이퍼 파라미터는 정수형으로 변경 
        'num_leaves': int(round(num_leaves)), 
        'min_data_in_leaf': int(round(min_data_in_leaf)),
        'min_child_weight': int(round(min_child_weight)),
        'bagging_fraction': max(min(bagging_fraction, 1), 0), 
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'lambda_l2': max(lambda_l2,0),
        'lambda_l1': max(lambda_l1, 0)
    }
    # 파이썬 lightgbm의 cv 메소드를 사용. 
    # cross_val_score() 는 early_stopping_rounds 가 없다.
    cv_result = lgb.cv(params, train_data, nfold=3, seed=0,  verbose_eval =100,  early_stopping_rounds=50, metrics=['auc'])
    return max(cv_result['auc-mean'])   

In [None]:
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

In [None]:
def train_apps_all(apps_all_train):
    ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    target_app = apps_all_train['TARGET']

    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=2020)
    print('train shape:', train_x.shape, 'valid shape:', valid_x.shape)
    clf = LGBMClassifier(
                nthread=4,
                n_estimators=1000,
                learning_rate=0.02,
                max_depth = 10,
                num_leaves=60,
                colsample_bytree=0.511,
                subsample=0.785,
                max_bin=208,
                reg_alpha=7.009,
                reg_lambda=6.579,
                min_child_weight=40,
                min_child_samples=91,
                silent=-1,
                verbose=-1,
                )

    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 100)
    
    return clf

In [None]:
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
clf = train_apps_all(apps_all_train)

In [None]:
preds = clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis=1))[:, 1 ]
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('prev_baseline_tuning_02.csv', index=False)

> 캐글 Advanced 머신러닝 실전 박치기 / 인프런