Reference: https://www.kaggle.com/eliotbarr/stacking-test-sklearn-xgboost-catboost-lightgbm

# Stacking Test-Sklearn, XGBoost, CatBoost, LightGBM

- Stacking starter based on Faron's script from the Allstate Claims Severity competition:
https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867

- Preprocessing from ogrellier  
https://www.kaggle.com/ogrellier/good-fun-with-ligthgbm

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

In [2]:
NFOLDS = 3
SEED = 0
NROWS = None

In [3]:
# Load the three input tables. NROWS (set in the configuration cell) caps the
# number of rows read from each file; None reads the files in full, so a small
# value can be used for a quick end-to-end smoke run.
data = pd.read_csv('data/application_train.csv', nrows=NROWS)
test = pd.read_csv('data/application_test.csv', nrows=NROWS)
prev = pd.read_csv('data/previous_application.csv', nrows=NROWS)

In [4]:
data.shape, test.shape, prev.shape

((307511, 122), (48744, 121), (1670214, 37))

In [5]:
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [7]:
prev.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [8]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

In [9]:
categorical_feats

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

In [10]:
# Integer-encode every object-typed column. The encoding is learned on the
# training frame and then applied to the test frame so both share the same codes.
for f_ in categorical_feats:
    # pd.factorize returns (codes, uniques); `indexer` is the Index of unique values.
    data[f_], indexer = pd.factorize(data[f_])
    # Look up each test value in the training uniques; get_indexer yields -1
    # for values that were not seen in the training column.
    test[f_] = indexer.get_indexer(test[f_])

In [11]:
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,0,0,0,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,1,0,1,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,0,1,0,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,1,0,0,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,0,0,0,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,0,1,0,0,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,0,0,0,0,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,0,0,1,0,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,0,1,0,0,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,0,0,1,1,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


Categorical features are factorized

In [13]:
gc.enable()

In [14]:
# Detach the label column from the feature frame in one step: pop returns the
# column and removes it from `data`.
y_train = data.pop('TARGET')

In [15]:
data.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,1,0,1,0,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,1,0,0,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
4,100007,0,0,0,0,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


TARGET feature is deleted

In [16]:
# Collect the object-typed columns of the previous-applications table.
prev_cat_features = []
for column in prev.columns:
    if prev[column].dtype == 'object':
        prev_cat_features.append(column)

# Replace each of those columns with its integer codes; the table of unique
# values returned alongside the codes is not needed here.
for f_ in prev_cat_features:
    codes, _ = pd.factorize(prev[f_])
    prev[f_] = codes

In [17]:
avg_prev = prev.groupby('SK_ID_CURR').mean()
avg_prev.head()

Unnamed: 0_level_0,SK_ID_PREV,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,1369693.0,0.0,3951.0,24835.5,23787.0,2520.0,24835.5,4.0,13.0,0.0,...,0.0,8.0,2.0,0.0,365243.0,-1709.0,-1499.0,-1619.0,-1612.0,0.0
100002,1038818.0,0.0,9251.775,179055.0,179055.0,0.0,179055.0,0.0,9.0,0.0,...,8.0,24.0,3.0,8.0,365243.0,-565.0,125.0,-25.0,-17.0,0.0
100003,2281150.0,0.333333,56553.99,435436.5,484191.0,3442.5,435436.5,3.0,14.666667,0.0,...,2.666667,10.0,1.0,6.666667,365243.0,-1274.333333,-1004.333333,-1054.333333,-1047.333333,0.666667
100004,1564014.0,0.0,5357.25,24282.0,20106.0,4860.0,24282.0,4.0,5.0,0.0,...,0.0,4.0,0.0,10.0,365243.0,-784.0,-694.0,-724.0,-714.0,0.0
100005,2176837.0,0.5,4813.2,22308.75,20076.75,4464.0,44617.5,2.5,10.5,0.0,...,0.5,12.0,3.0,2.5,365243.0,-706.0,-376.0,-466.0,-460.0,0.0


In [18]:
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
cnt_prev.head()

Unnamed: 0_level_0,SK_ID_PREV
SK_ID_CURR,Unnamed: 1_level_1
100001,1
100002,1
100003,3
100004,1
100005,2


In [19]:
prev[prev['SK_ID_CURR'] == 100003]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
575941,1810518,100003,1,98356.995,900000.0,1035882.0,,900000.0,4,12,...,1,12.0,3,1,365243.0,-716.0,-386.0,-536.0,-527.0,1.0
1021650,2636178,100003,0,64567.665,337500.0,348637.5,0.0,337500.0,5,17,...,5,6.0,0,12,365243.0,-797.0,-647.0,-647.0,-639.0,0.0
1223745,2396755,100003,0,6737.31,68809.5,68053.5,6885.0,68809.5,0,15,...,2,12.0,0,7,365243.0,-2310.0,-1980.0,-1980.0,-1976.0,1.0


In [20]:
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']
avg_prev.head()

Unnamed: 0_level_0,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,...,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,nb_app
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0.0,3951.0,24835.5,23787.0,2520.0,24835.5,4.0,13.0,0.0,1.0,...,8.0,2.0,0.0,365243.0,-1709.0,-1499.0,-1619.0,-1612.0,0.0,1
100002,0.0,9251.775,179055.0,179055.0,0.0,179055.0,0.0,9.0,0.0,1.0,...,24.0,3.0,8.0,365243.0,-565.0,125.0,-25.0,-17.0,0.0,1
100003,0.333333,56553.99,435436.5,484191.0,3442.5,435436.5,3.0,14.666667,0.0,1.0,...,10.0,1.0,6.666667,365243.0,-1274.333333,-1004.333333,-1054.333333,-1047.333333,0.666667,3
100004,0.0,5357.25,24282.0,20106.0,4860.0,24282.0,4.0,5.0,0.0,1.0,...,4.0,0.0,10.0,365243.0,-784.0,-694.0,-724.0,-714.0,0.0,1
100005,0.5,4813.2,22308.75,20076.75,4464.0,44617.5,2.5,10.5,0.0,1.0,...,12.0,3.0,2.5,365243.0,-706.0,-376.0,-466.0,-460.0,0.0,2


In [21]:
# Turn the SK_ID_CURR index of the aggregated previous-application features
# back into a column so it can serve as the join key.
prev_features = avg_prev.reset_index()

# Left-join so every current application is kept, then replace every missing
# value (including rows with no previous application) with 0.
x_train = data.merge(right=prev_features, how='left', on='SK_ID_CURR').fillna(0)
x_test = test.merge(right=prev_features, how='left', on='SK_ID_CURR').fillna(0)

In [22]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

In [23]:
x_train = x_train[features]
x_test = x_test[features]

In [24]:
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

In [25]:
class SklearnWrapper(object):
    """Uniform train/predict adapter around a scikit-learn style classifier.

    Parameters
    ----------
    clf : callable
        Classifier class (or factory) called as ``clf(**params)``.
    seed : int
        Value passed to the classifier as ``random_state``.
    params : dict or None
        Keyword arguments for the classifier. The dict is copied, so the
        caller's object is left untouched; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: avoids mutating the caller's dict and makes the
        # params=None default usable instead of raising TypeError.
        params = dict(params) if params else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on the given features and labels."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the second column of ``predict_proba`` (positive-class score)."""
        return self.clf.predict_proba(x)[:, 1]

In [26]:
class CatboostWrapper(object):
    """Uniform train/predict adapter around a CatBoost style classifier.

    Parameters
    ----------
    clf : callable
        Classifier class (or factory) called as ``clf(**params)``.
    seed : int
        Value passed to the classifier as ``random_seed``.
    params : dict or None
        Keyword arguments for the classifier. The dict is copied, so the
        caller's object is left untouched; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: avoids mutating the caller's dict and makes the
        # params=None default usable instead of raising TypeError.
        params = dict(params) if params else {}
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on the given features and labels."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the second column of ``predict_proba`` (positive-class score)."""
        return self.clf.predict_proba(x)[:, 1]

In [27]:
class LightGBMWrapper(object):
    """Uniform train/predict adapter around a LightGBM style classifier.

    Parameters
    ----------
    clf : callable
        Classifier class (or factory) called as ``clf(**params)``.
    seed : int
        Value passed to the classifier as both ``feature_fraction_seed`` and
        ``bagging_seed``.
    params : dict or None
        Keyword arguments for the classifier. The dict is copied, so the
        caller's object is left untouched; ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: avoids mutating the caller's dict and makes the
        # params=None default usable instead of raising TypeError.
        params = dict(params) if params else {}
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier on the given features and labels."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the second column of ``predict_proba`` (positive-class score)."""
        return self.clf.predict_proba(x)[:, 1]

In [28]:
class XgbWrapper(object):
    """Uniform train/predict adapter around the native ``xgb.train`` API.

    Parameters
    ----------
    seed : int
        Stored in the booster parameters under ``'seed'``.
    params : dict or None
        Booster parameters. An optional ``'nrounds'`` entry sets the number of
        boosting rounds (default 250) and is removed before the remaining
        entries are handed to ``xgb.train``. The dict is copied, so the
        caller's object keeps its ``'nrounds'`` entry and can be reused.
    """

    def __init__(self, seed=0, params=None):
        # Copy first: popping 'nrounds' from the caller's dict would make a
        # second wrapper built from the same dict fall back to 250 rounds.
        self.param = dict(params) if params else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [29]:
ntrain = x_train.shape[0]
ntest = x_test.shape[0]

def get_oof(clf):
    """Compute out-of-fold predictions for one base model.

    For every fold of the module-level ``kf`` splitter the model is trained on
    the training part of ``x_train``/``y_train``, then used to predict the
    held-out part and the whole of ``x_test``.

    Parameters
    ----------
    clf : object
        Wrapper exposing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` as column vectors of shape ``(ntrain, 1)``
        and ``(ntest, 1)``. ``oof_train`` holds each row's prediction from the
        fold in which it was held out; ``oof_test`` is the mean of the
        per-fold test predictions.
    """
    oof_train = np.zeros((ntrain, ))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        # kf.split yields positional row numbers, so select with .iloc.
        # Label-based .loc only matched by accident of the default RangeIndex.
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the per-fold predictions on the test set.
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [30]:
np.empty((NFOLDS, ntest))

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
et_params = {
    'n_jobs': -1,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': -1,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators':200,
    'learning_rate':0.1,
    'num_leaves':123,
    'colsample_bytree':0.8,
    'subsample':0.9,
    'max_depth':15,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':2    
}

In [32]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf=CatBoostClassifier, seed=SEED, params=catboost_params)
lg = LightGBMWrapper(clf=LGBMClassifier, seed=SEED, params=lightgbm_params)

In [33]:
# Out-of-fold train predictions and fold-averaged test predictions for each
# base model.
# NOTE(review): the LightGBM wrapper `lg` is constructed in the previous cell
# but is not passed to get_oof here, so it does not take part in the stack.
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

  if getattr(data, 'base', None) is not None and \


0:	total: 188ms	remaining: 37.4s
1:	total: 271ms	remaining: 26.8s
2:	total: 374ms	remaining: 24.6s
3:	total: 478ms	remaining: 23.4s
4:	total: 623ms	remaining: 24.3s
5:	total: 710ms	remaining: 23s
6:	total: 789ms	remaining: 21.8s
7:	total: 914ms	remaining: 21.9s
8:	total: 1s	remaining: 21.3s
9:	total: 1.11s	remaining: 21s
10:	total: 1.21s	remaining: 20.8s
11:	total: 1.32s	remaining: 20.7s
12:	total: 1.42s	remaining: 20.5s
13:	total: 1.51s	remaining: 20.1s
14:	total: 1.63s	remaining: 20.1s
15:	total: 1.71s	remaining: 19.6s
16:	total: 1.81s	remaining: 19.5s
17:	total: 1.9s	remaining: 19.2s
18:	total: 2.02s	remaining: 19.2s
19:	total: 2.12s	remaining: 19.1s
20:	total: 2.21s	remaining: 18.9s
21:	total: 2.29s	remaining: 18.5s
22:	total: 2.38s	remaining: 18.3s
23:	total: 2.47s	remaining: 18.1s
24:	total: 2.56s	remaining: 17.9s
25:	total: 2.64s	remaining: 17.7s
26:	total: 2.74s	remaining: 17.6s
27:	total: 2.84s	remaining: 17.4s
28:	total: 2.94s	remaining: 17.4s
29:	total: 3.03s	remaining: 17.2

41:	total: 3.9s	remaining: 14.7s
42:	total: 3.98s	remaining: 14.6s
43:	total: 4.07s	remaining: 14.4s
44:	total: 4.16s	remaining: 14.3s
45:	total: 4.25s	remaining: 14.2s
46:	total: 4.33s	remaining: 14.1s
47:	total: 4.42s	remaining: 14s
48:	total: 4.5s	remaining: 13.9s
49:	total: 4.6s	remaining: 13.8s
50:	total: 4.68s	remaining: 13.7s
51:	total: 4.77s	remaining: 13.6s
52:	total: 4.87s	remaining: 13.5s
53:	total: 4.95s	remaining: 13.4s
54:	total: 5.03s	remaining: 13.3s
55:	total: 5.13s	remaining: 13.2s
56:	total: 5.21s	remaining: 13.1s
57:	total: 5.3s	remaining: 13s
58:	total: 5.39s	remaining: 12.9s
59:	total: 5.46s	remaining: 12.7s
60:	total: 5.55s	remaining: 12.7s
61:	total: 5.65s	remaining: 12.6s
62:	total: 5.73s	remaining: 12.5s
63:	total: 5.82s	remaining: 12.4s
64:	total: 5.91s	remaining: 12.3s
65:	total: 6.01s	remaining: 12.2s
66:	total: 6.1s	remaining: 12.1s
67:	total: 6.18s	remaining: 12s
68:	total: 6.28s	remaining: 11.9s
69:	total: 6.36s	remaining: 11.8s
70:	total: 6.45s	remainin

84:	total: 7.71s	remaining: 10.4s
85:	total: 7.78s	remaining: 10.3s
86:	total: 7.86s	remaining: 10.2s
87:	total: 7.96s	remaining: 10.1s
88:	total: 8.05s	remaining: 10s
89:	total: 8.13s	remaining: 9.94s
90:	total: 8.22s	remaining: 9.85s
91:	total: 8.31s	remaining: 9.75s
92:	total: 8.41s	remaining: 9.68s
93:	total: 8.51s	remaining: 9.59s
94:	total: 8.59s	remaining: 9.49s
95:	total: 8.68s	remaining: 9.41s
96:	total: 8.76s	remaining: 9.31s
97:	total: 8.86s	remaining: 9.22s
98:	total: 8.95s	remaining: 9.13s
99:	total: 9.03s	remaining: 9.03s
100:	total: 9.12s	remaining: 8.94s
101:	total: 9.21s	remaining: 8.85s
102:	total: 9.29s	remaining: 8.75s
103:	total: 9.39s	remaining: 8.67s
104:	total: 9.47s	remaining: 8.57s
105:	total: 9.56s	remaining: 8.48s
106:	total: 9.65s	remaining: 8.39s
107:	total: 9.74s	remaining: 8.3s
108:	total: 9.82s	remaining: 8.2s
109:	total: 9.92s	remaining: 8.12s
110:	total: 10s	remaining: 8.02s
111:	total: 10.1s	remaining: 7.93s
112:	total: 10.2s	remaining: 7.85s
113:	to

In [34]:
# Root-mean-squared error of each base model's out-of-fold predictions against
# the training labels. The CatBoost line was previously mislabelled "RF-CV".
for label, oof_pred in (
    ("XG", xg_oof_train),
    ("ET", et_oof_train),
    ("RF", rf_oof_train),
    ("CB", cb_oof_train),
):
    print("{}-CV: {}".format(label, sqrt(mean_squared_error(y_train, oof_pred))))

XG-CV: 0.25969546149690725
ET-CV: 0.26296502347137407
RF-CV: 0.26298562130066466
RF-CV: 0.33069084316813807


In [35]:
# Build the second-level feature matrices: one column per base model, holding
# that model's out-of-fold (train) or fold-averaged (test) predictions.
# NOTE(review): this rebinds x_train / x_test from DataFrames to NumPy arrays,
# so get_oof (which uses DataFrame indexing on x_train) cannot be re-run after
# this cell without first re-running the merge cell.
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

In [36]:
print("x_train shape & x_test shape: {}, {}".format(x_train.shape, x_test.shape))

(307511, 4), (48744, 4)


In [37]:
logistic_regression = LogisticRegression()
# Stacking
logistic_regression.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]

In [39]:
test[['SK_ID_CURR', 'TARGET']].to_csv('stacking.csv', index=False, float_format='%.8f')