In [1]:
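## LIGHT GBM ##
# "LightGBM is a gradient boosting framework that uses tree based learning algorithms"
# This notebook trains a K-fold LightGBM classifier on the features with non-zero importance
# and writes the fold-averaged test predictions to a submission CSV.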

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import gc
gc.enable()
import lightgbm as lgb
import utilities

In [3]:
# Load the joined train / test tables through the project's `utilities.read_csv` helper
trainPath = '../../datatmp/data/trainjoincollnoisy.csv'
testPath = '../../datatmp/data/testjoincollnoisy.csv'
train = utilities.read_csv(trainPath)
test = utilities.read_csv(testPath)

In [5]:
# Keep the label as its own integer Series, then remove it from the feature frame (in place).
# Note: this cell is not re-runnable on the same kernel, since 'TARGET' is gone after the first run.
target = train['TARGET'].astype(int)
del train['TARGET']

In [6]:
# Remember the test ids for the submission file before the id column is removed
testIds = test['SK_ID_CURR'].astype(int)

# The id is not a feature: remove it from both frames (in place)
del train['SK_ID_CURR']
del test['SK_ID_CURR']

In [7]:
# The LightGBM version in use raised an error on special characters in column names,
# so every non-alphanumeric character of a column name is replaced with "_".
def sanitizeName(name):
    """Return str(name) with each non-alphanumeric character replaced by '_'."""
    return "".join(ch if ch.isalnum() else "_" for ch in str(name))

train.columns = [sanitizeName(col) for col in train.columns]
test.columns = [sanitizeName(col) for col in test.columns]

In [8]:
# Feature importances table, read with the project's `utilities.read_csv` helper
importancesPath = '../../datatmp/data/importances.csv'
importances = utilities.read_csv(importancesPath)

In [9]:
# Keep every feature whose importance in row 0 of `importances` is strictly positive.
# Names get the same non-alphanumeric -> "_" replacement that was applied to the
# train/test columns, so they can be used to index those frames.
firstRow = importances.loc[0]
impFeats = [
    "".join(ch if ch.isalnum() else "_" for ch in str(feat))
    for feat in importances
    if firstRow[feat] > 0
]

In [11]:
# Final train / test frames: only the selected features, in `impFeats` order
train = train.loc[:, impFeats]
test = test.loc[:, impFeats]

In [12]:
from sklearn.model_selection import KFold
def trainNpredLgb(train, test, y, nFolds, stratified=False, seed=42):
    """Train one LightGBM classifier per CV fold and average their test predictions.

    For every fold a fresh ``LGBMClassifier`` is fitted on the training part,
    evaluated (AUC, with early stopping) on the held-out part, and used to
    score ``test``. The positive-class probabilities of all folds are averaged
    with equal weight.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features; fold rows are selected positionally with ``.iloc``.
    test : pandas.DataFrame
        Rows to score; expected to carry the same feature columns as ``train``.
    y : pandas.Series
        Target, positionally aligned with ``train`` (also sliced with ``.iloc``).
    nFolds : int
        Number of cross-validation folds.
    stratified : bool, default False
        If True, split with ``StratifiedKFold`` so each fold keeps the class
        ratio of ``y`` (useful for an unbalanced target). The default keeps the
        plain ``KFold`` split.
    seed : int, default 42
        ``random_state`` passed to the fold splitter.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``test.shape[0]`` holding the fold-averaged
        probability of the positive class.
    """
    # Function-scope imports keep the function self-contained
    import lightgbm as lgb
    from sklearn.model_selection import KFold, StratifiedKFold

    # Hyperparameters are the same for every fold, so they are built once.
    # LightGBM parameters found by Bayesian optimization (from https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code)
    # NOTE(review): LightGBM documents that bagging (`subsample`) is only applied when
    # `subsample_freq` > 0; it is not set here, so `subsample` is presumably inert - confirm.
    lgbParams = dict(
        objective='binary',
        boosting_type='gbdt',
        nthread=4,
        n_estimators=5000,
        learning_rate=0.03,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1,
    )

    # Shuffled folds with a fixed seed so the split is reproducible
    splitterCls = StratifiedKFold if stratified else KFold
    folds = splitterCls(n_splits=nFolds, shuffle=True, random_state=seed)

    # Accumulator for the averaged test predictions
    testPreds = np.zeros(test.shape[0])
    for tIds, vIds in folds.split(train, y):
        trainX, trainY = train.iloc[tIds], y.iloc[tIds]
        valX, valY = train.iloc[vIds], y.iloc[vIds]

        clf = lgb.LGBMClassifier(**lgbParams)
        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords (and the
        # `silent` constructor argument) look like pre-4.0 LightGBM API; newer releases
        # expect callbacks instead - confirm against the pinned version.
        clf.fit(
            trainX,
            trainY,
            eval_set=[(trainX, trainY), (valX, valY)],
            eval_metric='auc',
            verbose=100,
            early_stopping_rounds=200
        )

        # Score test at the best iteration; each fold contributes 1 / n_splits
        foldProba = clf.predict_proba(test, num_iteration=clf.best_iteration_)[:, 1]
        testPreds += foldProba / folds.n_splits

        # Release the fold slices before the next iteration to limit peak memory
        del trainX, trainY, valX, valY
        gc.collect()
    return testPreds

In [13]:
# Train model on final train, test
%time predictions = trainNpredLgb(train, test, target, 5)

Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.785123	training's binary_logloss: 0.24009	valid_1's auc: 0.766725	valid_1's binary_logloss: 0.244508
[200]	training's auc: 0.807882	training's binary_logloss: 0.23075	valid_1's auc: 0.778798	valid_1's binary_logloss: 0.23974
[300]	training's auc: 0.821821	training's binary_logloss: 0.224989	valid_1's auc: 0.783112	valid_1's binary_logloss: 0.238003
[400]	training's auc: 0.833072	training's binary_logloss: 0.220401	valid_1's auc: 0.785588	valid_1's binary_logloss: 0.237039
[500]	training's auc: 0.842505	training's binary_logloss: 0.216501	valid_1's auc: 0.787064	valid_1's binary_logloss: 0.23647
[600]	training's auc: 0.850428	training's binary_logloss: 0.213215	valid_1's auc: 0.787689	valid_1's binary_logloss: 0.236201
[700]	training's auc: 0.857905	training's binary_logloss: 0.210023	valid_1's auc: 0.788533	valid_1's binary_logloss: 0.235904
[800]	training's auc: 0.864809	training's binary_logloss: 0.

In [14]:
# Submission table: one row per test id with its averaged predicted probability
subdf = pd.DataFrame({'SK_ID_CURR': testIds}).assign(TARGET=predictions)

In [17]:
# Write the submission to disk without the DataFrame index
submissionPath = '../../datatmp/data/submissionlgb.csv'
subdf.to_csv(submissionPath, index=False)