In [12]:
import os
import gc
import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFE
import lightgbm as lgb



In [8]:
train_stacked = pd.read_csv('../oofs/kain-train-features-v0.1.2.csv', index_col=0)
test_stacked = pd.read_csv('../oofs/kain-test-features-v0.1.2.csv', index_col=0)

In [9]:
train = pd.read_csv('../../data/application_train.csv.zip', nrows=None)
n_train = train.shape[0]
y = train['TARGET']
gc.collect()

144

In [10]:
clf = lgb.LGBMClassifier(boosting_type='goss', learning_rate=0.03, objective='binary',
                         num_leaves=16, subsample=0.8, nthread=4,
                         max_depth=4, class_weight={0:1, 1:3}, metric='auc',
                         colsample_bytree=0.35, reg_alpha=0, reg_lambda=0)

In [13]:
selector = RFE(clf, 45, step=2)
selector = selector.fit(train_stacked, y)

In [14]:
selected_features = [i for i, y in enumerate(selector.ranking_) if y == 1]

In [15]:
selected_features

[9,
 11,
 12,
 13,
 16,
 23,
 24,
 25,
 26,
 27,
 34,
 38,
 39,
 40,
 41,
 43,
 44,
 48,
 52,
 53,
 56,
 57,
 61,
 62,
 64,
 66,
 67,
 69,
 72,
 76,
 81,
 83,
 84,
 86,
 87,
 105,
 106,
 110,
 127,
 129,
 134,
 145,
 146,
 149,
 150]

In [16]:
train_features = pd.DataFrame(train_stacked.iloc[:, selected_features].values, columns=
              ['y_' + str(i) for i in selected_features])
test_features = pd.DataFrame(test_stacked.iloc[:, selected_features].values, columns=['y_' + str(i) for i in selected_features] )

In [17]:
test_features.head()

Unnamed: 0,y_9,y_11,y_12,y_13,y_16,y_23,y_24,y_25,y_26,y_27,...,y_105,y_106,y_110,y_127,y_129,y_134,y_145,y_146,y_149,y_150
0,0.027859,0.034761,0.026316,0.027343,0.026407,0.029955,0.031383,0.029491,0.028048,0.030111,...,0.307194,0.092173,0.205684,0.0407,0.079934,0.007814,0.085077,0.059713,0.112115,0.059259
1,0.112407,0.117645,0.12317,0.122424,0.111397,0.122372,0.12197,0.108754,0.106518,0.114218,...,0.735932,0.294385,0.467233,0.136157,0.253435,0.095759,0.236243,0.293807,0.22096,0.127498
2,0.025666,0.021324,0.021003,0.025299,0.023309,0.031361,0.029568,0.02843,0.029473,0.033136,...,0.376777,0.061269,0.088654,0.034251,0.073055,0.0278,0.042244,0.063244,0.016997,0.037535
3,0.020858,0.027794,0.023603,0.024004,0.026581,0.033679,0.030641,0.032629,0.029063,0.03248,...,0.392371,0.105019,0.127739,0.044724,0.077136,0.026577,0.080953,0.120677,0.043664,0.035744
4,0.114807,0.118162,0.109459,0.111463,0.109929,0.105033,0.111265,0.113932,0.114818,0.10625,...,0.806458,0.284324,0.499732,0.126538,0.284348,0.107383,0.332027,0.250013,0.136448,0.118943


In [18]:
aucs = []
test_set = []
oof_preds = np.zeros(train.shape[0])

kf = KFold(n_splits=5, random_state=1002, shuffle=True)
kf.get_n_splits(train_features)

n_bagged = 6

for train_index, test_index in kf.split(train_features):
    print("TRAIN: ", train_index, "TEST: ", test_index)
    
    X = train_features
    y_ = y.values
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_[train_index], y_[test_index]
    
    
    oof_baggs = np.zeros([n_bagged , x_test.shape[0]])
    preds_baggs = np.zeros([n_bagged , test_features.shape[0]])
    
    for _it in range(1, n_bagged):
        
        
        dtest = test_features
        
        dtrain = lgb.Dataset(data=x_train, 
                                 label=y_train, 
                                 free_raw_data=False, silent=True)
        dvalid = lgb.Dataset(data=x_test,
                                 label=y_test, 
                                 free_raw_data=False, silent=True)


        params = {
                'objective': 'binary',
                'boosting_type': 'goss',
                'nthread': 4,
                'learning_rate': 0.03   
                'num_leaves': 2 ** 4,
                'colsample_bytree': 0.35,
                'subsample': 0.95,
                'max_depth': 4,
                'reg_alpha': 0,
                'reg_lambda': 0,
                'seed': _it,
                'scale_pos_weight': 3,
                'verbose': -1,
                'metric': 'auc'
            }
        
        
        
        model = lgb.train(
                params=params,
                train_set=dtrain,
                num_boost_round=10000,
                valid_sets=[dtrain, dvalid],
                early_stopping_rounds=100,
                verbose_eval=False
            ) 
        print(_it,' ' ,'Fold AUC :', roc_auc_score(y_test, model.predict(x_test)))
        oof_baggs[_it, :] = model.predict(x_test)
        preds_baggs[_it, :] = model.predict(dtest)
        
    val_preds = pd.DataFrame(oof_baggs).T
    test_preds = pd.DataFrame(preds_baggs).T
    
    oof_preds[test_index] = val_preds.rank(axis=0, method='min').mul(val_preds.shape[1] * [1 / val_preds.shape[1]]).sum(1) / val_preds.shape[0]

    print('Fold AUC :', roc_auc_score(y_test, val_preds.rank(axis=0, method='min').mul(val_preds.shape[1] * [1 / val_preds.shape[1]]).sum(1) / val_preds.shape[0]))
    aucs.append(roc_auc_score(y_test, val_preds.rank(axis=0, method='min').mul(val_preds.shape[1] * [1 / val_preds.shape[1]]).sum(1) / val_preds.shape[0]))
   

    test_set.append(test_preds.rank(axis=0, method='min').mul(test_preds.shape[1] * [1 / test_preds.shape[1]]).sum(1) / test_preds.shape[0])
    gc.collect()
    

print('AVERAGED AUC :', np.mean(aucs))

TRAIN:  [     0      1      2 ... 307505 307506 307508] TEST:  [     9     16     25 ... 307507 307509 307510]
1   Fold AUC : 0.8032532917979529
2   Fold AUC : 0.8031139063057514
3   Fold AUC : 0.8030156154306244
4   Fold AUC : 0.803177568336961
5   Fold AUC : 0.8032759847959174
Fold AUC : 0.8033467369313215
TRAIN:  [     0      2      3 ... 307508 307509 307510] TEST:  [     1      8     17 ... 307497 307503 307504]
1   Fold AUC : 0.8086677173810604
2   Fold AUC : 0.8085551858579718
3   Fold AUC : 0.8084422617684013
4   Fold AUC : 0.8082541323852884
5   Fold AUC : 0.8084206147882113
Fold AUC : 0.8086535921907593
TRAIN:  [     0      1      4 ... 307507 307509 307510] TEST:  [     2      3      6 ... 307494 307501 307508]
1   Fold AUC : 0.8069431543203686
2   Fold AUC : 0.8066535704118117
3   Fold AUC : 0.8069527604493103
4   Fold AUC : 0.8069806436947128
5   Fold AUC : 0.8068374299029122
Fold AUC : 0.8070061881959467
TRAIN:  [     0      1      2 ... 307508 307509 307510] TEST:  [    

In [19]:
preds = pd.DataFrame(test_set).T

In [21]:
y_hat = preds.rank(axis=0, method='min').mul(preds.shape[1] * [1 / preds.shape[1]]).sum(1) / preds.shape[0] 
    


sampl_sub = pd.read_csv('../../data/sample_submission.csv')


sampl_sub['TARGET'] = y_hat.values

sampl_sub.to_csv("lightgbm-stack-submission.csv", index=False)


sampl_sub.head()



Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.387863
1,100005,0.803968
2,100013,0.369838
3,100028,0.383772
4,100038,0.814689
