https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import LabelEncoder
import gc

In [2]:
TARGET = 'target'
init_round=15
opt_round= 15
n_folds=10
random_state = 0 
n_estimators = 10000
learning_rate=0.01

In [3]:
train = pd.read_csv('X_train_te.csv')
test = pd.read_csv('X_test_te.csv')

In [4]:
train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_3_enc,nom_4_enc,nom_5_enc,nom_6_enc,nom_7_enc,nom_8_enc,nom_9_enc,day_enc,month_enc,target
0,0.0,0.0,0.0,0.0,0.0,2,4,3,5,0,...,0.219536,0.206991,0.207101,0.182796,0.096974,0.130056,0.146341,0.198798,0.145765,0
1,1.0,1.0,0.0,0.0,1.0,2,3,0,-1,3,...,0.186715,0.179422,0.19863,0.214689,0.128219,0.1942,0.142506,0.212862,0.209179,0
2,0.0,1.0,0.0,0.0,0.0,2,-1,3,0,0,...,0.158121,0.206991,0.178694,0.159091,0.264662,0.176863,0.186347,0.163422,0.21281,0
3,-1.0,0.0,0.0,0.0,0.0,2,0,3,3,3,...,0.178768,0.179422,0.208511,0.199571,0.151746,0.225241,0.286885,0.163799,0.145765,0
4,0.0,-1.0,0.0,1.0,0.0,2,5,3,2,-1,...,0.202284,0.183874,0.154303,0.149485,0.196776,0.191346,0.245455,0.163422,0.225295,0


In [5]:
X = train.drop(['target'], axis = 1)
categorical_features = [col for c, col in enumerate(X.columns) \
                        if not ( np.issubdtype(X.dtypes[c], np.number )  )  ]
y = train['target']
print( len(categorical_features), X.shape, y.shape, y.mean()  )
for f in categorical_features:
    X[f] = X[f].astype('category')

0 (600000, 92) (600000,) 0.187205


In [6]:
train_data = lgb.Dataset(data=X, label=y, categorical_feature = X.columns.tolist(), free_raw_data=False)

In [7]:
def lgb_eval(num_leaves, learning_rate, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    params = {'objective':'binary','num_iterations':15000, 'early_stopping_round':100, 'metric':'auc'}#
    params["num_leaves"] = round(int(num_leaves))
    params["learning_rate"] = learning_rate
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = round(int(max_depth))
    params['lambda_l1'] = max(lambda_l1, 0)
    params['lambda_l2'] = max(lambda_l2, 0)
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_state, stratified=True, verbose_eval =200)
    return max(cv_result['auc-mean'])

In [8]:
lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 300),
                                        'learning_rate':(0.01, 0.05),
                                        'feature_fraction': (0.1, 0.9),
                                        'bagging_fraction': (0.8, 1),
                                        'max_depth': (5, 8.99),
                                        'lambda_l1': (0, 5),
                                        'lambda_l2': (0, 3),
                                        'min_split_gain': (0.001, 0.1),
                                        'min_child_weight': (5, 50)}, random_state=random_state)

In [9]:
opt_params = lgbBO.maximize(init_points=init_round, n_iter=opt_round)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's auc: 0.773882 + 0.00241158
[400]	cv_agg's auc: 0.777654 + 0.00243271
| [0m 1       [0m | [0m 0.7777  [0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 0.02695 [0m | [0m 7.577   [0m | [0m 24.69   [0m | [0m 0.08929 [0m | [0m 290.0   [0m |
[200]	cv_agg's auc: 0.777325 + 0.00232291
[400]	cv_agg's auc: 0.779218 + 0.0024149
| [95m 2       [0m | [95m 0.7794  [0m | [95m 0.8767  [0m | [95m 0.7334  [0m | [95m 2.644   [0m | [95m 1.704   [0m | [95m 0.04702 [0m | [95m 5.283   [0m | [95m 8.921   [0m | [95m 0.003002[0m | [95m 253.8   [0m |
[200]	cv_agg's auc: 0.774544 + 0.00242368
[400]	cv_agg's auc: 0.776379 + 0.00237355
| [0m 3       [0m | [0m

KeyboardInterrupt: 

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_sp... | num_le... |

|  19       |  0.7846   |  0.9539   |  0.1041   |  3.114    |  0.08302  |  0.01104  |  5.327    |  46.5     |  0.0375   |  26.51    |

In [11]:
# for making train - valid sets
from sklearn.model_selection import train_test_split

#Split in 80% train and 20% test set
train_df, val_df = train_test_split(train, test_size = 0.1, random_state=random_state)

#Define 'y' labels
train_y = train_df.target
val_y = val_df.target

#Define 'x' sets
train_x = train_df.drop(['target'], axis = 1)
val_x = val_df.drop(['target'], axis = 1)

categorical_features = [col for c, col in enumerate(train_x.columns) \
                        if not ( np.issubdtype(train_x.dtypes[c], np.number )  )  ]

for f in categorical_features:
    train_x[f] = train_x[f].astype('category')
    val_x[f] = val_x[f].astype('category')

In [12]:
train_data = lgb.Dataset(data=train_x, label=train_y, categorical_feature = categorical_features, free_raw_data=False)
val_data = lgb.Dataset(data=val_x, label=val_y, categorical_feature = categorical_features, free_raw_data=False)

In [13]:
params = {'objective':'binary',
        'num_iterations':15000, 
        'early_stopping_round':100, 
        'metric':'auc',
        'num_leaves': round(int(26.51)),
        'learning_rate':0.01104,
        'feature_fraction': 0.1041,
        'bagging_fraction': 0.9539,
        'max_depth': round(int(5.327)),
        'lambda_l1': 3.114,
        'lambda_l2': 0.08302,
        'min_split_gain': 0.0375,
        'min_child_weight': 46.5,
        'seed':random_state}

In [14]:
lgbm = lgb.train(params,
                 train_data,
                 num_boost_round=40000,
                 valid_sets=val_data,
                 early_stopping_rounds=100,
                 verbose_eval=200,
                 )

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.757991
[400]	valid_0's auc: 0.769884
[600]	valid_0's auc: 0.777457
[800]	valid_0's auc: 0.781298
[1000]	valid_0's auc: 0.783578
[1200]	valid_0's auc: 0.785217
[1400]	valid_0's auc: 0.786268
[1600]	valid_0's auc: 0.787129
[1800]	valid_0's auc: 0.78752
[2000]	valid_0's auc: 0.787832
[2200]	valid_0's auc: 0.787959
[2400]	valid_0's auc: 0.788069
[2600]	valid_0's auc: 0.788161
[2800]	valid_0's auc: 0.788217
[3000]	valid_0's auc: 0.788232
Early stopping, best iteration is:
[3032]	valid_0's auc: 0.788238


In [16]:
X_test = test

for f in categorical_features:
    X_test[f] = X_test[f].astype('category')

In [17]:
y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)

In [18]:
lgbm.save_model('model.txt')

<lightgbm.basic.Booster at 0x7f229fe0bc50>

In [19]:
sub = pd.read_csv('sample_submission.csv')
sub.target = y_pred

In [20]:
sub.to_csv('submit.csv', index=False)