# Hyper Parameter Tuning 

### 0. Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import lightgbm as lgb

from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

pd.set_option('display.max_rows', 100)

- load apps04

In [2]:
# Load the pre-built feature table. read_pickle opens the path itself, so no
# separate file handle is needed.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that you created yourself or otherwise trust.
apps = pd.read_pickle('apps04.pkl')

In [3]:
# Preview the first rows of the loaded table (object columns are still raw strings here).
apps.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CARD_BALANCE_LIMIT_RATIO_MIN,CARD_BALANCE_LIMIT_RATIO_MAX,CARD_DRAWING_LIMIT_RATIO_MIN,CARD_DRAWING_LIMIT_RATIO_MAX,CARD_IS_DPD_MEAN,CARD_IS_DPD_SUM,CARD_IS_DPD_UNDER_120_MEAN,CARD_IS_DPD_UNDER_120_SUM,CARD_IS_DPD_OVER_120_MEAN,CARD_IS_DPD_OVER_120_SUM
0,100002,1.0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,,,,,,,,,,
1,100003,0.0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,,,,,,,,,,
2,100004,0.0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,,,,,,,,,,
3,100006,0.0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0.0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,,,,,,,,,,


### 1. Encoding

In [4]:
def encoding(df):
    """Label-encode every object-dtype column of ``df`` as integer codes.

    Each object column is replaced by the codes from ``pd.factorize``:
    distinct values are numbered 0, 1, 2, ... in order of first appearance,
    and missing values receive the sentinel code -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain object-dtype columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the object columns encoded; all other columns are
        carried over unchanged. The input frame is not modified.
    """
    object_columns = df.columns[df.dtypes == 'object'].tolist()

    # Work on a copy so the caller's frame is left untouched and the cell that
    # calls this function can be re-run safely.
    encoded = df.copy()
    for col in object_columns:
        encoded[col] = pd.factorize(df[col])[0]
    return encoded

# Integer-encode every object-dtype column of `apps` and rebind the name to the result.
apps = encoding(apps)


In [5]:
# Preview again to check that the former object columns now hold integer codes.
apps.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CARD_BALANCE_LIMIT_RATIO_MIN,CARD_BALANCE_LIMIT_RATIO_MAX,CARD_DRAWING_LIMIT_RATIO_MIN,CARD_DRAWING_LIMIT_RATIO_MAX,CARD_IS_DPD_MEAN,CARD_IS_DPD_SUM,CARD_IS_DPD_UNDER_120_MEAN,CARD_IS_DPD_UNDER_120_SUM,CARD_IS_DPD_OVER_120_MEAN,CARD_IS_DPD_OVER_120_SUM
0,100002,1.0,0,0,0,0,0,202500.0,406597.5,24700.5,...,,,,,,,,,,
1,100003,0.0,0,1,0,1,0,270000.0,1293502.5,35698.5,...,,,,,,,,,,
2,100004,0.0,1,0,1,0,0,67500.0,135000.0,6750.0,...,,,,,,,,,,
3,100006,0.0,0,1,0,0,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0.0,0,0,0,0,0,121500.0,513000.0,21865.5,...,,,,,,,,,,


### 2. Split Data

#### Split test/train data

In [6]:
# Rows with a TARGET value form the labelled training set; rows where TARGET
# is missing are the ones that get predicted later.
has_target = apps['TARGET'].notna()

app_train = apps.loc[has_target]
# TARGET is entirely missing in the test rows, so the column is dropped there.
app_test = apps.loc[~has_target].drop(columns=['TARGET'])

#### Split the training data into train / validation sets

In [7]:
# Features are every column except the row ID and the label.
ftr_app_train = app_train.drop(columns=['SK_ID_CURR', 'TARGET'])
tgt_app_train = app_train['TARGET']

# A fixed random_state gives the same hold-out split on every
# Restart & Run All; stratify keeps the TARGET class ratio equal in the
# train and validation parts.
train_x, valid_x, train_y, valid_y = train_test_split(
    ftr_app_train,
    tgt_app_train,
    test_size=0.2,
    random_state=42,
    stratify=tgt_app_train,
)


### 3. Hyper-Parameter Tuning Model

#### Set Hyper-Parameter range 

In [8]:
# Search space for the tuning step: one (lower, upper) bound per
# hyper-parameter. The keys match the argument names of lgb_roc_eval.
bayesian_params = dict(
    max_depth=(6, 16),
    num_leaves=(24, 64),
    min_child_samples=(10, 200),
    min_child_weight=(1, 50),
    subsample=(0.5, 1),
    colsample_bytree=(0.5, 1),
    max_bin=(10, 500),
    reg_lambda=(0.001, 10),
    reg_alpha=(0.01, 50),
)

#### Define objective function

In [9]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective function: train one LightGBM model and return its validation ROC AUC.

    The arguments arrive as floats sampled from the search bounds, so the
    integer-valued hyper-parameters are rounded and the others are clipped to
    a valid range before the model is built.

    Reads the module-level ``train_x``, ``train_y``, ``valid_x`` and
    ``valid_y`` for fitting and scoring.

    Returns
    -------
    float
        ROC AUC of the fitted model on ``(valid_x, valid_y)``.
    """
    params = {
        'n_estimators': 500, 'learning_rate': 0.02, 'n_jobs': -1,
        'max_depth': int(round(max_depth)), 'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)), 'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0), 'colsample_bytree': max(min(colsample_bytree, 1), 0),
        # LightGBM only applies row subsampling (bagging) when the bagging
        # frequency is non-zero; with the default of 0 the sampled `subsample`
        # value would have no effect on the model at all.
        'subsample_freq': 1,
        'max_bin': max(int(round(max_bin)), 10), 'reg_lambda': max(reg_lambda, 0), 'reg_alpha': max(reg_alpha, 0),
        'verbose': -1
    }

    lgb_model = LGBMClassifier(**params)
    # Fresh callbacks for every evaluation: stop after 100 rounds without
    # improvement and log every 100 rounds.
    callback = [lgb.early_stopping(100), lgb.log_evaluation(100)]

    lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', callbacks=callback)

    # Score on the hold-out set using the positive-class probability.
    valid_proba = lgb_model.predict_proba(valid_x)[:, 1]
    roc_auc = roc_auc_score(valid_y, valid_proba)

    return roc_auc

#### Find best hyper parameter 
    - I recommend running this code in Google Colab or a Kaggle Notebook, because the search is computationally expensive.


In [32]:
# lgb_opt = BayesianOptimization(f=lgb_roc_eval, pbounds= bayesian_params) 
# lgb_opt.maximize(init_points=10, n_iter=25)

# print('Best parameters: ', lgb_opt.max['params'])

#### Test with the best hyper parameter

In [10]:
print(f'train_x.shape, valid_x.shape: {train_x.shape, valid_x.shape}')

# Hyper-parameters hard-coded from an earlier tuning run.
# NOTE(review): LightGBM ignores `subsample` while `subsample_freq` is 0 (the
# default), so subsample=0.682 currently has no effect. It is left as-is so the
# model matches the configuration these values were tuned with - confirm before
# enabling bagging.
lgbm_clf = LGBMClassifier(
    n_jobs=-1,
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=13,
    num_leaves=57,
    colsample_bytree=0.638,
    subsample=0.682,
    max_bin=435,
    reg_alpha=0.936,
    reg_lambda=4.533,
    min_child_weight=25,
    min_child_samples=166,
    verbosity=-1
)

# Fixed seed so the fold assignment is identical on every run.
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# Loop-invariant: build the test feature matrix once. Selecting the training
# feature columns by name guarantees the test matrix matches what the model was
# fitted on, even if extra columns were added to app_test by a later cell.
test_features = app_test[ftr_app_train.columns]
test_preds = np.zeros(app_test.shape[0])

for fold_n, (train_index, valid_index) in enumerate(folds.split(ftr_app_train)):
    print(f'###### Fold {fold_n} ######')

    # Fold-local names: the hold-out split in train_x / valid_x / train_y /
    # valid_y is read by lgb_roc_eval and must not be overwritten here.
    fold_train_x = ftr_app_train.iloc[train_index]
    fold_valid_x = ftr_app_train.iloc[valid_index]
    fold_train_y = tgt_app_train.iloc[train_index]
    fold_valid_y = tgt_app_train.iloc[valid_index]

    # Fresh callbacks per fold so no early-stopping state is carried over from
    # the previous fit.
    fold_callbacks = [lgb.early_stopping(100), lgb.log_evaluation(100)]

    lgbm_clf.fit(fold_train_x, fold_train_y, eval_set=[(fold_valid_x, fold_valid_y)], eval_metric='auc', callbacks=fold_callbacks)

    # Average the positive-class probability over the folds.
    test_preds += lgbm_clf.predict_proba(test_features)[:, 1] / folds.n_splits


train_x.shape, valid_x.shape: ((246008, 306), (61503, 306))
###### Fold 0 ######
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.76303	valid_0's binary_logloss: 0.248173
[200]	valid_0's auc: 0.776578	valid_0's binary_logloss: 0.242385
[300]	valid_0's auc: 0.782756	valid_0's binary_logloss: 0.240092
[400]	valid_0's auc: 0.785955	valid_0's binary_logloss: 0.238935
[500]	valid_0's auc: 0.787779	valid_0's binary_logloss: 0.238274
[600]	valid_0's auc: 0.788701	valid_0's binary_logloss: 0.237941
[700]	valid_0's auc: 0.789345	valid_0's binary_logloss: 0.237735
[800]	valid_0's auc: 0.78975	valid_0's binary_logloss: 0.237613
[900]	valid_0's auc: 0.789917	valid_0's binary_logloss: 0.237558
[1000]	valid_0's auc: 0.790129	valid_0's binary_logloss: 0.237496
Did not meet early stopping. Best iteration is:
[997]	valid_0's auc: 0.790157	valid_0's binary_logloss: 0.23749
###### Fold 1 ######
Training until validation scores don't improve for 100 rounds
[100]	valid_0

In [11]:
# Build the submission as a separate frame instead of adding TARGET to
# app_test in place: app_test keeps its feature-only layout, so this cell and
# the training cell can both be re-run safely.
submission = app_test[['SK_ID_CURR']].assign(TARGET=test_preds)
submission.to_csv('lgbm_baseline.csv', index=False)

### 4. Submit to Kaggle

In [12]:
# Upload lgbm_baseline.csv to the home-credit-default-risk competition via the Kaggle CLI.
# NOTE(review): "Message" looks like a placeholder submission message - consider describing the model/run.
!kaggle competitions submit -c home-credit-default-risk -f lgbm_baseline.csv -m "Message"

100%|██████████████████████████████████████| 1.26M/1.26M [00:01<00:00, 1.14MB/s]
Successfully submitted to Home Credit Default Risk

## Public score: 0.79397, Private score: 0.78778