**Inspired by: [1]**

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the competition's training and test tables from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [3]:
# Label vector used for training.
y = train['target']

# Columns that are never model features: the label itself and the row identifier.
cols_id_and_target = ['target', 'ID_code']

# Variables found insignificant by a logit regression analysis.
# Exclusion is currently switched off (empty list); the candidate list is kept for reference:
#cols_insignificant_due_to_logit = ['var_7', 'var_10', 'var_17', 'var_27', 'var_30', 'var_38', 'var_39', 'var_41', 'var_96', 'var_98', 'var_100', 'var_103', 'var_117', 'var_124', 'var_126', 'var_136', 'var_158', 'var_160', 'var_161', 'var_183', 'var_185']
cols_insignificant_due_to_logit = []

# Variables found insignificant by a Kolmogorov-Smirnov analysis [2].
# Exclusion is currently switched off (empty list); the candidate list is kept for reference:
#cols_insignificant_due_to_KS = ['var_9', 'var_14', 'var_29', 'var_46', 'var_61', 'var_73', 'var_79', 'var_129', 'var_184']
cols_insignificant_due_to_KS = []

# Feature columns excluded from both the training and the test matrix.
cols_dropped_features = cols_insignificant_due_to_logit + cols_insignificant_due_to_KS

# Training features: everything except id/target and the excluded variables.
X = train.drop(columns = cols_id_and_target + cols_dropped_features)

# Test features: same exclusions; only 'ID_code' is dropped as a non-feature column here.
test_X = test.drop(columns = ['ID_code'] + cols_dropped_features)

In [4]:
# Choose the data the model is fitted on.
# Set USE_SAMPLE to True to fit on a small random subset for quick experiments;
# leave it False to fit on the full training set (the behaviour of the final run).
# Previously the subset was always drawn and then immediately overwritten by the
# full data, which wasted work and left misleading intermediate state.
USE_SAMPLE = False
SAMPLE_SIZE = 2000

if USE_SAMPLE:
    # Fixed random_state keeps the subset reproducible between runs.
    sample_train = train.sample(n = SAMPLE_SIZE, random_state = 1)
    sample_y = sample_train['target']
    sample_X = sample_train.drop(cols_id_and_target + cols_insignificant_due_to_logit + cols_insignificant_due_to_KS, axis = 1)
else:
    # use all data instead of sample
    sample_y = y
    sample_X = X

In [5]:
# LightGBM training parameters, passed unchanged to lgb.train().
param = dict(
    # Row subsampling: bagging_fraction of the rows, applied with bagging_freq = 5.
    bagging_freq = 5,
    bagging_fraction = 0.331,
    # Passed as the string 'false', exactly as in the original configuration.
    boost_from_average = 'false',
    boost = 'gbdt',
    # Column subsampling fraction.
    feature_fraction = 0.0405,
    learning_rate = 0.0083,
    # -1 leaves depth unbounded; tree size is limited through num_leaves instead.
    max_depth = -1,
    metric = 'auc',
    min_data_in_leaf = 80,
    min_sum_hessian_in_leaf = 10.0,
    num_leaves = 13,
    num_threads = 8,
    tree_learner = 'serial',
    objective = 'binary',
    verbosity = 1,
)

In [6]:
# LightGBM API: [3]
# Disabled experiment: the code below is wrapped in a string literal, so this cell
# trains nothing and only echoes the string as its output.
# When enabled, it runs 10-fold stratified cross-validation with early stopping:
# each fold trains on the remaining folds, writes its out-of-fold predictions into
# pred_for_train, and adds its 1/n_splits share of the test prediction to
# pred_for_test; the last line prints the out-of-fold ROC AUC.
# The notebook instead trains once on all data with a fixed number of rounds,
# without k-folds (see the cells that follow).
# NOTE(review): before re-enabling, confirm the calls still match the installed
# library versions -- e.g. StratifiedKFold(shuffle = False, random_state = 1) and
# the verbose_eval / early_stopping_rounds arguments of lgb.train are presumably
# rejected by recent scikit-learn / LightGBM releases.
'''pred_for_train = np.zeros(len(sample_X))
pred_for_test = np.zeros(len(test_X))
kfold = StratifiedKFold(n_splits = 10, shuffle = False, random_state = 1)
for n, (train_i, val_i) in enumerate(kfold.split(sample_X, sample_y)):
    print("Fold №{}".format(n))
    train_data = lgb.Dataset(sample_X.iloc[train_i], label = sample_y.iloc[train_i])
    val_data = lgb.Dataset(sample_X.iloc[val_i], label = sample_y.iloc[val_i])
    trained_lgb = lgb.train(param, train_data, 30000, valid_sets = [train_data, val_data], verbose_eval = 1000, early_stopping_rounds = 4000)
    pred_for_train[val_i] = trained_lgb.predict(sample_X.iloc[val_i], num_iteration = trained_lgb.best_iteration)
    pred_for_test += trained_lgb.predict(test_X, num_iteration = trained_lgb.best_iteration) / kfold.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(sample_y, pred_for_train)))
'''

'pred_for_train = np.zeros(len(sample_X))\npred_for_test = np.zeros(len(test_X))\nkfold = StratifiedKFold(n_splits = 10, shuffle = False, random_state = 1)\nfor n, (train_i, val_i) in enumerate(kfold.split(sample_X, sample_y)):\n    print("Fold №{}".format(n))\n    train_data = lgb.Dataset(sample_X.iloc[train_i], label = sample_y.iloc[train_i])\n    val_data = lgb.Dataset(sample_X.iloc[val_i], label = sample_y.iloc[val_i])\n    trained_lgb = lgb.train(param, train_data, 30000, valid_sets = [train_data, val_data], verbose_eval = 1000, early_stopping_rounds = 4000)\n    pred_for_train[val_i] = trained_lgb.predict(sample_X.iloc[val_i], num_iteration = trained_lgb.best_iteration)\n    pred_for_test += trained_lgb.predict(test_X, num_iteration = trained_lgb.best_iteration) / kfold.n_splits\n\nprint("CV score: {:<8.5f}".format(roc_auc_score(sample_y, pred_for_train)))\n'

In [7]:
# Fit a single booster on the selected training data, with no validation set.
# The number of boosting rounds is taken from the LightGBM v1 model, as an average.
train_data = lgb.Dataset(data = sample_X, label = sample_y)
trained_lgb = lgb.train(
    params = param,
    train_set = train_data,
    num_boost_round = 10218,
    verbose_eval = 100,
)

In [8]:
# Persist the trained booster next to the submission file.
# NOTE(review): despite the .csv extension this is presumably LightGBM's own
# model dump rather than tabular data -- confirm before renaming or parsing it.
model_path = 'log-for-LigthGBM-v2-all-vars.csv'
trained_lgb.save_model(model_path)

<lightgbm.basic.Booster at 0x7f6b0c96a048>

In [9]:
# Score the test set using every boosting round the trained booster holds.
# The round count is read from the booster instead of repeating the literal used
# at training time, so changing the number of rounds in the training cell can no
# longer silently truncate the prediction.
pred_for_test = trained_lgb.predict(test_X, num_iteration = trained_lgb.current_iteration())

In [10]:
# Build the submission table: the test row identifiers paired with the predicted
# scores, then write it without the index column.
output = test[['ID_code']].assign(target = pred_for_test)
output.to_csv('submission.csv', index = False)

**References:**

[1] https://www.kaggle.com/gpreda/santander-fast-compact-solution/log

[2] https://www.kaggle.com/youhanlee/yh-eda-i-want-to-see-all

[3] https://lightgbm.readthedocs.io/en/latest/Python-API.html#training-api