https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import LabelEncoder
import gc

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import model_selection, preprocessing, metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# --- Run configuration ---
TARGET = 'target'          # name of the label column

# Budget handed to the Bayesian optimiser's maximize() call
init_round = 15            # used as init_points
opt_round = 15             # used as n_iter

n_folds = 10               # used as nfold in lgb.cv
random_state = 0           # shared seed

# Not referenced by the cells visible in this notebook.
n_estimators = 10000
learning_rate = 0.01

In [77]:
# Read both splits, then stack them (train rows first) so the preprocessing
# cells below are applied to train and test together.
train, test = (pd.read_csv(path) for path in ('fm_train.csv', 'fm_test.csv'))
both = pd.concat([train, test])

In [78]:
# Display the (rows, columns) shape of the stacked frame.
both.shape

(1000000, 81)

In [79]:
def get_num_list(df):
    """Return the names in `df` that contain the substring 'ord' or 'num'.

    Names are obtained by iterating `df` directly and are returned in the
    order they are encountered.
    """
    return [col for col in df if 'ord' in col or 'num' in col]
# Columns of the stacked frame whose name contains 'ord' or 'num'.
num_list = get_num_list(both)

In [80]:
# Trim the tail of the numeric list so those columns end up in cat_names instead.
# NOTE(review): the slice removes six names, whereas this cell's previous
# comment said "last 5" - confirm which count is intended.
num_list = num_list[:-6]
# Every column that is neither numeric nor 'Unnamed: 0' is treated as categorical.
cat_names = [
    name for name in both
    if name != 'Unnamed: 0' and name not in num_list
]
#cat_names

In [81]:
#num_list

In [82]:
# Cast every column in cat_names to str and every column in num_list to float32.
# NOTE(review): after the str cast, missing values are presumably ordinary
# strings, so a later fillna on these columns may have nothing to fill - confirm.
both[cat_names] = both[cat_names].astype(str)
both[num_list] = both[num_list].astype(np.float32)
# The 'target' column is cast to str explicitly as well.
both['target'] = both['target'].astype(str)

In [83]:
# Drop the first column by position.
# NOTE(review): presumably this is the 'Unnamed: 0' column that cat_names
# excludes - confirm. Re-running this cell drops one more column each time.
both = both.iloc[:, 1:]

In [84]:
# Columns removed from the stacked frame before encoding and modelling.
drop_list = ['N_MOST_COMMON(ordinal.ord_4)[1]', 'N_MOST_COMMON(ordinal.ord_4)[2]', 
                  'N_MOST_COMMON(ordinal.ord_5)[1]', 'N_MOST_COMMON(ordinal.ord_5)[2]',
                  'N_MOST_COMMON(ordinal.ord_2)[1]', 'N_MOST_COMMON(ordinal.ord_2)[2]',
                  'N_MOST_COMMON(ordinal.ord_0)[1]', 'N_MOST_COMMON(ordinal.ord_0)[2]',
                  'N_MOST_COMMON(ordinal.ord_1)[1]', 'N_MOST_COMMON(ordinal.ord_1)[2]',
                  'N_MOST_COMMON(ordinal.ord_3)[1]', 'N_MOST_COMMON(ordinal.ord_3)[2]',
                  'SKEW(numeric.ord_5)','SKEW(numeric.ord_2)','SKEW(numeric.ord_3)',
                  'SKEW(numeric.ord_4)','SKEW(numeric.ord_0)','SKEW(numeric.ord_1)']
# Use the explicit `columns=` keyword: passing the axis positionally to
# DataFrame.drop is deprecated and rejected by recent pandas releases.
both = both.drop(columns=drop_list)

In [85]:
# Keep only the categorical names that are still columns of the frame.
new_cat_names = [name for name in cat_names if name in both.columns]

In [86]:
def label_encoder(input_df, encoder_dict=None):
    """Label-encode the columns listed in the module-level ``new_cat_names``.

    Each listed column has its missing values filled with the string 'NULL'
    and is then replaced by integer codes.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    encoder_dict : dict, optional
        Mapping of column name -> already fitted ``LabelEncoder``. A column
        found here is encoded with that encoder's ``transform``; every other
        column gets a newly fitted encoder. When omitted, all columns are
        fitted from scratch. (Previously this argument was overwritten and
        therefore ignored.)

    Returns
    -------
    tuple
        ``(input_df, encoders)`` where ``encoders`` is a new dict mapping
        column name -> the encoder used for it.
    """
    # Copy so the caller's dict is never mutated.
    encoders = dict(encoder_dict) if encoder_dict else {}
    for feat in new_cat_names:
        values = input_df[feat].fillna('NULL')
        encoder = encoders.get(feat)
        if encoder is None:
            encoder = LabelEncoder()
            input_df[feat] = encoder.fit_transform(values)
            encoders[feat] = encoder
        else:
            # Reuse the supplied encoder; per scikit-learn, labels it was not
            # fitted on make transform raise ValueError.
            input_df[feat] = encoder.transform(values)
    return input_df, encoders

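# Ideally the encoders would be fit on one split and then applied to the other; this prototype encodes the combined frame in one pass.
# The returned encoder dict could be reused for that, mapping never-seen values to an "unknown" label: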
# https://stackoverflow.com/questions/21057621/sklearn-labelencoder-with-never-seen-before-values

both, encoder_dict = label_encoder(both)
# 'target' is the label, so take it out of the categorical feature list.
# The membership check keeps this cell re-runnable: a bare remove() raises
# ValueError once the name is already gone.
if 'target' in new_cat_names:
    new_cat_names.remove('target')

In [87]:
# Split the stacked frame back apart by position: the first 600000 rows go to
# train and every row after them to test.
# NOTE(review): 600000 is presumably the row count of the original train file - confirm.
train, test = both.iloc[:600000], both.iloc[600000:]
# The stacked frame is not needed any more; drop the name and run a collection.
del both
gc.collect()

263

In [88]:
# Features and label for the training rows.
X, y = train.drop(columns='target'), train['target']

# Same separation for the test rows.
X_test, y_test = test.drop(columns='target'), test['target']

In [40]:
# Wrap both splits as LightGBM datasets, marking new_cat_names as categorical.
# X / y / X_test / y_test are intentionally kept alive: X_test is needed again
# for prediction further down.
train_data = lgb.Dataset(
    X,
    label=y,
    categorical_feature=new_cat_names,
    free_raw_data=False,
)

test_data = lgb.Dataset(
    X_test,
    label=y_test,
    categorical_feature=new_cat_names,
    free_raw_data=False,
)

44

In [16]:
def lgb_eval(num_leaves, learning_rate, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    """Objective for the Bayesian search: best mean cross-validated AUC.

    The arguments are the sampled hyper-parameter values. ``num_leaves`` and
    ``max_depth`` are truncated to int, the two fractions are clamped to
    [0, 1] and the two lambdas to >= 0 before being handed to ``lgb.cv``.

    Reads the module-level ``train_data``, ``n_folds`` and ``random_state``.

    NOTE(review): ``bagging_fraction`` is set without ``bagging_freq``;
    LightGBM presumably ignores it in that case - confirm against the
    parameter documentation.
    """
    params = {
        'objective': 'binary',
        'num_iterations': 15000,
        'early_stopping_round': 100,
        'metric': 'auc',
        # int() truncates, e.g. 8.99 -> 8
        'num_leaves': int(num_leaves),
        'learning_rate': learning_rate,
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'bagging_fraction': max(min(bagging_fraction, 1), 0),
        'max_depth': int(max_depth),
        'lambda_l1': max(lambda_l1, 0),
        'lambda_l2': max(lambda_l2, 0),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
    }
    cv_result = lgb.cv(
        params,
        train_data,
        nfold=n_folds,
        seed=random_state,
        stratified=True,
        verbose_eval=200,
    )
    # Score of this setting = highest mean AUC over the recorded rounds.
    return max(cv_result['auc-mean'])

In [17]:
# Optimiser over lgb_eval; each entry gives the (lower, upper) bound searched
# for the lgb_eval argument of the same name.
lgbBO = BayesianOptimization(
    lgb_eval,
    dict(
        num_leaves=(24, 300),
        learning_rate=(0.01, 0.05),
        feature_fraction=(0.1, 0.9),
        bagging_fraction=(0.8, 1),
        max_depth=(5, 8.99),
        lambda_l1=(0, 5),
        lambda_l2=(0, 3),
        min_split_gain=(0.001, 0.1),
        min_child_weight=(5, 50),
    ),
    random_state=random_state,
)

In [18]:
# Run the search: init_round random probes followed by opt_round guided steps.
lgbBO.maximize(init_points=init_round, n_iter=opt_round)
# maximize() returns None; the best observation is exposed on the optimiser
# as `.max` ({'target': ..., 'params': {...}}).
opt_params = lgbBO.max['params']

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's auc: 0.77448 + 0.00297989
[400]	cv_agg's auc: 0.778386 + 0.00296255
| [0m 1       [0m | [0m 0.7784  [0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 0.02695 [0m | [0m 7.577   [0m | [0m 24.69   [0m | [0m 0.08929 [0m | [0m 290.0   [0m |
[200]	cv_agg's auc: 0.777666 + 0.00286688
[400]	cv_agg's auc: 0.779448 + 0.00304883
| [95m 2       [0m | [95m 0.7796  [0m | [95m 0.8767  [0m | [95m 0.7334  [0m | [95m 2.644   [0m | [95m 1.704   [0m | [95m 0.04702 [0m | [95m 5.283   [0m | [95m 8.921   [0m | [95m 0.003002[0m | [95m 253.8   [0m |
[200]	cv_agg's auc: 0.774727 + 0.00275753
[400]	cv_agg's auc: 0.776703 + 0.00299011
| [0m 3       [0m | [0m

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_sp... | num_le... |
|  16       |  0.7856   |  0.9174   |  0.1062   |  0.07204  |  0.006567 |  0.01995  |  5.142    |  6.584    |  0.09796  |  297.4    |

In [68]:
# Hold out part of the training rows for validation.
from sklearn.model_selection import train_test_split

# test_size=0.1 -> 90% of the rows for fitting, 10% for validation
train_df, val_df = train_test_split(train, test_size=0.1, random_state=random_state)

# Labels
train_y, val_y = train_df['target'], val_df['target']

# Features: every column except the label
train_x = train_df.drop(columns=['target'])
val_x = val_df.drop(columns=['target'])

In [69]:
# LightGBM datasets for the fit / validation splits.
# Note: this rebinds `train_data`, which lgb_eval reads as a global.
train_data = lgb.Dataset(
    train_x,
    label=train_y,
    categorical_feature=new_cat_names,
    free_raw_data=False,
)
val_data = lgb.Dataset(
    val_x,
    label=val_y,
    categorical_feature=new_cat_names,
    free_raw_data=False,
)

In [70]:
# Parameters for the final model; the tuned values are the ones printed for
# iteration 16 of the search output above, with the two integer parameters
# truncated the same way lgb_eval does.
params = {
    'objective': 'binary',
    'num_iterations': 15000,
    'early_stopping_round': 100,
    'metric': 'auc',
    'num_leaves': int(297.4),
    'learning_rate': 0.01995,
    'feature_fraction': 0.1062,
    'bagging_fraction': 0.9174,
    'max_depth': int(5.142),
    'lambda_l1': 0.07204,
    'lambda_l2': 0.006567,
    'min_split_gain': 0.09796,
    'min_child_weight': 6.584,
    'seed': random_state,
}

In [71]:
# Fit the final booster on train_data, reporting AUC on val_data every 200 rounds.
# NOTE(review): `params` already carries 'num_iterations' and
# 'early_stopping_round'; which of those and the keyword arguments below takes
# effect should be confirmed for the installed LightGBM version.
lgbm = lgb.train(
    params,
    train_data,
    num_boost_round=40000,
    valid_sets=[val_data],
    early_stopping_rounds=100,
    verbose_eval=200,
)

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.759493
[400]	valid_0's auc: 0.772221
[600]	valid_0's auc: 0.777401
[800]	valid_0's auc: 0.78143
[1000]	valid_0's auc: 0.783015
[1200]	valid_0's auc: 0.783901
[1400]	valid_0's auc: 0.784419
[1600]	valid_0's auc: 0.784867
[1800]	valid_0's auc: 0.785288
[2000]	valid_0's auc: 0.785434
[2200]	valid_0's auc: 0.785606
[2400]	valid_0's auc: 0.785703
[2600]	valid_0's auc: 0.785842
[2800]	valid_0's auc: 0.785897
[3000]	valid_0's auc: 0.785935
[3200]	valid_0's auc: 0.786034
[3400]	valid_0's auc: 0.786074
[3600]	valid_0's auc: 0.786147
Early stopping, best iteration is:
[3666]	valid_0's auc: 0.786164


In [89]:
# Predict on the test features using the booster's best iteration.
y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)

In [91]:
# Write the trained booster to model.txt; the call's return value is echoed as the cell output.
lgbm.save_model('model.txt')

<lightgbm.basic.Booster at 0x7f53f48981d0>

In [92]:
# Load the submission template and fill in the predictions.
sub = pd.read_csv('sample_submission.csv')
# Item assignment always writes a real column; attribute-style assignment
# (sub.target = ...) would only set a Python attribute if the column were missing.
sub['target'] = y_pred

In [96]:
# Write the submission frame to submit.csv without the index column.
sub.to_csv('submit.csv', index=False)