In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
from bayes_opt import BayesianOptimization
import gc

In [2]:
# --- Run configuration ---------------------------------------------------
TARGET = 'target'        # presumably the label column name; not referenced by the cells shown here

# Bayesian-optimisation budget (passed to lgbBO.maximize)
init_round = 15          # init_points
opt_round = 15           # n_iter

# Cross-validation and reproducibility
n_folds = 10             # nfold for lgb.cv
random_state = 0         # seed shared by the optimiser, the CV and the train/validation split

# NOTE(review): the two values below are not read by any cell visible in this
# notebook (the LightGBM params hard-code their own) - confirm before relying on them.
n_estimators = 10000
learning_rate = 0.01

In [3]:
# Load the train and test tables; both paths are relative to the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_emb.csv', 'test_emb.csv'))

In [4]:
# Separate the label column from the feature matrix.
y = train['target']
X = train.drop(columns=['target'])

In [5]:
# LightGBM dataset over the full training table. lgb_eval reads this global
# when it runs cross-validation.
train_data = lgb.Dataset(
    data=X,
    label=y,
    free_raw_data=True,
)

In [6]:
def lgb_eval(num_leaves, learning_rate, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    """Objective for the Bayesian optimiser: best mean CV AUC for one hyper-parameter draw.

    ``num_leaves`` and ``max_depth`` are truncated to integers, the two
    sampling fractions are clamped to [0, 1] and the L1/L2 penalties are
    floored at 0. The remaining arguments are handed to LightGBM unchanged.

    Reads the module-level ``train_data``, ``n_folds`` and ``random_state``.

    Returns:
        The maximum value of the ``'auc-mean'`` series returned by ``lgb.cv``.
    """
    def _clamp_unit(value):
        # Keep a fraction inside [0, 1].
        return max(min(value, 1), 0)

    # NOTE(review): LightGBM documents that bagging_fraction only takes effect
    # when bagging_freq is non-zero, and no bagging_freq is set here - confirm
    # whether tuning bagging_fraction is intended to be a no-op.
    params = {
        'objective': 'binary',
        'num_iterations': 15000,
        'early_stopping_round': 100,
        'metric': 'auc',
        'num_leaves': int(num_leaves),      # truncation, not rounding
        'learning_rate': learning_rate,
        'feature_fraction': _clamp_unit(feature_fraction),
        'bagging_fraction': _clamp_unit(bagging_fraction),
        'max_depth': int(max_depth),        # truncation, not rounding
        'lambda_l1': max(lambda_l1, 0),
        'lambda_l2': max(lambda_l2, 0),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
    }

    cv_result = lgb.cv(
        params,
        train_data,
        nfold=n_folds,
        seed=random_state,
        stratified=True,
        verbose_eval=200,
    )
    return max(cv_result['auc-mean'])

In [7]:
# Search space: one (lower, upper) interval per lgb_eval argument.
search_space = {
    'num_leaves': (24, 300),
    'learning_rate': (0.01, 0.05),
    'feature_fraction': (0.1, 0.9),
    'bagging_fraction': (0.8, 1),
    # lgb_eval truncates max_depth to an int, so an upper bound of 8.99 caps the depth at 8.
    'max_depth': (5, 8.99),
    'lambda_l1': (0, 5),
    'lambda_l2': (0, 3),
    'min_split_gain': (0.001, 0.1),
    'min_child_weight': (5, 50),
}

lgbBO = BayesianOptimization(lgb_eval, search_space, random_state=random_state)

In [9]:
# Run the search: `init_round` initial probes followed by `opt_round` guided steps.
lgbBO.maximize(init_points=init_round, n_iter=opt_round)

# maximize() returns None, so assigning its result left opt_params empty.
# The best trial is exposed on the optimiser as
# lgbBO.max == {'target': <best score>, 'params': {<name>: <value>, ...}}.
opt_params = lgbBO.max['params']

In [10]:
# |   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_sp... | num_le... |
# -------------------------------------------------------------------------------------------------------------------------------------

# |  18       |  0.7947   |  0.8875   |  0.1632   |  4.591    |  1.681    |  0.01483  |  5.235    |  49.96    |  0.08949  |  24.13    |

In [4]:
# for making train - valid sets
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation (test_size=0.1); the other 90% are used for training.
train_df, val_df = train_test_split(train, test_size=0.1, random_state=random_state)

# Features and labels for each partition
train_x, train_y = train_df.drop(columns=['target']), train_df['target']
val_x, val_y = val_df.drop(columns=['target']), val_df['target']

In [5]:
# LightGBM datasets for the two partitions.
# NOTE: this rebinds the global `train_data`; from here on lgb_eval (which reads
# that global) would cross-validate on the training split only.
train_data = lgb.Dataset(
    data=train_x,
    label=train_y,
    free_raw_data=True,
)
val_data = lgb.Dataset(
    data=val_x,
    label=val_y,
    free_raw_data=True,
)

In [6]:
# Final model configuration. The tuned values match trial 18 of the recorded
# optimisation log (CV AUC 0.7947).
# NOTE(review): LightGBM documents that bagging_fraction only takes effect when
# bagging_freq is non-zero; none is set here - confirm whether that is intended.
params = {
    # task and stopping rule
    'objective': 'binary',
    'num_iterations': 15000,
    'early_stopping_round': 100,
    'metric': 'auc',
    # tuned values; fractional leaves/depth are truncated to integers
    'num_leaves': int(24.13),
    'learning_rate': 0.01483,
    'feature_fraction': 0.1632,
    'bagging_fraction': 0.8875,
    'max_depth': int(5.235),
    'lambda_l1': 4.591,
    'lambda_l2': 1.681,
    'min_split_gain': 0.08949,
    'min_child_weight': 49.96,
    # reproducibility
    'seed': random_state,
}

In [None]:
# Cross-validate the chosen configuration on the training split, logging every 200 rounds.
lgb.cv(
    params,
    train_data,
    nfold=n_folds,
    seed=random_state,
    stratified=True,
    verbose_eval=200,
)

In [25]:
# Fit the final booster on the training split, monitoring AUC on the hold-out set.
# The round budget (num_iterations=15000) and the early-stopping patience
# (early_stopping_round=100) are taken from `params`. LightGBM gives those entries
# precedence over the num_boost_round / early_stopping_rounds arguments, so the
# previous num_boost_round=40000 was never in effect and is not repeated here.
lgbm = lgb.train(
    params,
    train_data,
    valid_sets=[val_data],
    verbose_eval=200,
)
# 0.785229  (score noted by the author; presumably hold-out AUC from an earlier run)

Training until validation scores don't improve for 500 rounds.
[200]	valid_0's auc: 0.766192
[400]	valid_0's auc: 0.778527
[600]	valid_0's auc: 0.782127
[800]	valid_0's auc: 0.783636
[1000]	valid_0's auc: 0.784357
[1200]	valid_0's auc: 0.784769
[1400]	valid_0's auc: 0.784916
[1600]	valid_0's auc: 0.785091
[1800]	valid_0's auc: 0.785215
[2000]	valid_0's auc: 0.785193
[2200]	valid_0's auc: 0.785245
[2400]	valid_0's auc: 0.785282
[2600]	valid_0's auc: 0.785294
[2800]	valid_0's auc: 0.785292
[3000]	valid_0's auc: 0.785283
[3200]	valid_0's auc: 0.785264
Early stopping, best iteration is:
[2887]	valid_0's auc: 0.78532


In [20]:
# Use the test table as loaded; nothing is dropped or reordered here.
# NOTE(review): assumes test_emb.csv holds exactly the training feature columns - TODO confirm.
X_test = test

In [21]:
# Predict using only the trees up to the booster's best (early-stopped) iteration.
y_pred = lgbm.predict(
    X_test,
    num_iteration=lgbm.best_iteration,
)

In [22]:
# Fill the submission template with the model's predictions.
sub = pd.read_csv('sample_submission.csv')
# Item assignment always writes a real 'target' column. Attribute assignment
# (`sub.target = ...`) only does so when the column already exists; otherwise it
# sets a plain Python attribute and the predictions never reach the CSV.
sub['target'] = y_pred

In [23]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub.to_csv('submit.csv', index=False)