In [1]:
import time
import os
import gc
import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
import keras as ks
from keras import backend as K
from keras.models import load_model


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_stacked = pd.read_csv('../oofs/kain-train-features-v0.1.2.csv', index_col=0)
test_stacked = pd.read_csv('../oofs/kain-test-features-v0.1.2.csv', index_col=0)

In [3]:
train_stacked.head()

Unnamed: 0,LGB_cat_0_7956629719199435_csv,LGB_cat_0_796965615822933_csv,LGB_cat_0_7971758465559547_csv,LGB_cat_0_7971922012487902_csv,LGB_cat_0_7975664475835779_csv,LGB_cat_0_7982459487910958_csv,LGB_cat_0_7985379016892897_csv,LGB_cat_0_7986254088147751_csv,LGB_cat_0_7987384223756171_csv,LGB_cat_0_7988757981190066_csv,...,TARGET.72,TARGET.73,TARGET.74,TARGET.75,TARGET.76,TARGET.77,TARGET.78,TARGET.79,TARGET.80,TARGET.81
0,0.263165,0.210463,0.254966,0.272226,0.242135,0.217145,0.170371,0.194649,0.213564,0.166478,...,0.673457,0.577123,0.615324,0.409131,0.339133,0.31005,0.292603,0.274747,0.229829,0.377313
1,0.028534,0.022802,0.021364,0.027252,0.025308,0.024883,0.021984,0.021119,0.02301,0.0232,...,0.103917,0.039187,0.074826,0.037902,0.0,0.023711,0.018794,0.009405,0.019214,0.017564
2,0.044988,0.035597,0.037636,0.041098,0.039781,0.038767,0.047803,0.032391,0.03483,0.045556,...,0.098294,0.068005,0.105474,0.038344,0.055681,0.040151,0.030012,0.048608,0.041607,0.024725
3,0.028133,0.023076,0.02589,0.025638,0.025989,0.0269,0.027623,0.021785,0.027436,0.025201,...,0.035021,0.080346,0.060589,0.016141,0.048706,0.051309,0.037157,0.041925,0.030234,0.035943
4,0.061272,0.049731,0.047512,0.059441,0.059825,0.053356,0.059691,0.04363,0.050947,0.054172,...,0.129976,0.115208,0.147167,0.083344,0.075375,0.100184,0.088149,0.061036,0.05657,0.059047


In [4]:
train = pd.read_csv('../../data/application_train.csv')
y = train['TARGET']

n_train = train.shape[0]

In [13]:
selected_features = [2,
 3,
 6,
 9,
 10,
 11,
 13,
 14,
 15,
 16,
 17,
 19,
 20,
 21,
 22,
 24,
 25,
 26,
 27,
 28,
 32,
 34,
 36,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 47,
 48,
 49,
 50,
 52,
 60,
 61,
 62,
 64,
 66,
 70,
 71,
 72,
 73,
 74,
 75,
 82,
 85,
 89,
 90,
 91,
 94,
 95,
 96,
 102,
 103,
 104,
 105,
 106,
 108] # for example we have 60 selecte oofs (for the best case it was about 80)

In [14]:
train_features = train_stacked.iloc[:, selected_features].values
test_features = test_stacked.iloc[:, selected_features].values

In [15]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()
        self.aucs = []
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            self.aucs.append(roc_auc_score(self.y_val, y_pred))
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [16]:
from contextlib import contextmanager
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print('[{' + name + '}] done in {' + str(round(time.time() - t0, 3)) + '} s')

In [18]:
aucs = []
test_set = []
validation_set = []
print("\nModeling Stage")

kf = KFold(n_splits=5, random_state=1002, shuffle=True)
kf.get_n_splits(train_features)

n_bagged = 6

for train_index, valid_index in kf.split(train_features):
    print("TRAIN: ", train_index, "TEST: ", valid_index)
 
    X = train_features
    y_ = y.values
    x_train, x_valid = X[train_index], X[valid_index] 
    y_train, y_valid = y_[train_index], y_[valid_index]
    
    scaler = preprocessing.StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train) 
    x_valid = scaler.transform(x_valid) 
    x_test = scaler.transform(test_features)
    
    oof_baggs = np.zeros([n_bagged , x_valid.shape[0]])
    preds_baggs = np.zeros([n_bagged , test_features.shape[0]])
    
    for _it in range(1, n_bagged):
        
        #tf.set_random_seed(_it)
        
        print(x_train.shape, x_valid.shape, x_test.shape)

        file_path = "best_model.hdf5"
        check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                                      save_best_only =True, save_weights_only=False, mode='max', period = 1)
        ra_val = RocAucEvaluation(validation_data=(x_valid, y_valid), interval=1)
        early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=2)
        gc.collect()

        config = tf.ConfigProto(
        intra_op_parallelism_threads=4, use_per_session_threads=4, inter_op_parallelism_threads=6)
        with tf.Session(graph=tf.Graph(), config=config) as sess, timer('fit_predict'):
                    ks.backend.set_session(sess)
                    model_in = ks.Input(shape=(x_train.shape[1],), dtype='float32', sparse=False)
                    out = ks.layers.Dense(2 ** 7,  activation='sigmoid', kernel_initializer=
                      ks.initializers.RandomNormal(mean=0.00, stddev=0.08, seed=_it))(model_in)
                    out = ks.layers.Dropout(0.4)(out)
#                     out =  ks.layers.Dense(2 ** 7, activation='sigmoid', kernel_initializer=
#                       ks.initializers.RandomNormal(mean=0.00, stddev=0.05, seed=666))(out)
#                     out = ks.layers.Dropout(0.4)(out)
                    out =  ks.layers.Dense(2 ** 6, activation='relu', kernel_initializer=
                      ks.initializers.RandomNormal(mean=0.00, stddev=0.05, seed=_it))(out)
                    out = ks.layers.Dropout(0.4)(out)
                    out = ks.layers.Dense(1, activation='sigmoid', kernel_initializer=
                      ks.initializers.RandomNormal(mean=0.00, stddev=0.05, seed=_it))(out)
                    model = ks.models.Model(model_in, out)
                    model.compile(loss='binary_crossentropy',
                                  optimizer=ks.optimizers.Adam(lr=8e-3), metrics=['binary_crossentropy'])
                    batch_size = 2 ** 9
                    epochs = 10
                    nrounds = 5
                    for i in range(nrounds):
                        with timer('pass ' +  str(i + 1)):
                            model.fit(x=x_train, y=train['TARGET'].iloc[train_index].values, batch_size=batch_size+(batch_size*(2*i)), epochs=epochs, 
                                validation_data=(x_valid, train['TARGET'].iloc[valid_index].values), callbacks=[ra_val,  early_stop, check_point],
                                      shuffle=True, class_weight={0:1, 1:1})
 
                    #model = load_model(file_path)
                    y_pred = model.predict(x_valid).reshape(-1, 1)
                    print(roc_auc_score(y_true=train['TARGET'].iloc[valid_index].values, y_score=y_pred))
                    
                    oof_baggs[_it, :] = model.predict(x_valid)[:, 0]
                    preds_baggs[_it, :] = model.predict(x_test)[:, 0]
                    
                    del model
                    # os.remove(file_path)
                    gc.collect()

    val_preds = pd.DataFrame(oof_baggs).T
    test_preds = pd.DataFrame(preds_baggs).T
    
    print('Fold AUC :', roc_auc_score(y_valid, val_preds.rank(axis=0, method='min').mul(val_preds.shape[1] * [1 / val_preds.shape[1]]).sum(1) / val_preds.shape[0]))
    aucs.append(roc_auc_score(y_valid, val_preds.rank(axis=0, method='min').mul(val_preds.shape[1] * [1 / val_preds.shape[1]]).sum(1) / val_preds.shape[0]))
    test_set.append(test_preds.rank(axis=0, method='min').mul(test_preds.shape[1] * [1 / test_preds.shape[1]]).sum(1) / test_preds.shape[0])
    gc.collect()

print('AVERAGED AUC :', np.mean(aucs))


Modeling Stage
TRAIN:  [     0      1      2 ... 307505 307506 307508] TEST:  [     9     16     25 ... 307507 307509 307510]
(246008, 60) (61503, 60) (48744, 60)
Train on 246008 samples, validate on 61503 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.801987

Epoch 00001: val_loss improved from -inf to 0.23243, saving model to best_model.hdf5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.802078

Epoch 00002: val_loss improved from 0.23243 to 0.23625, saving model to best_model.hdf5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.801785

Epoch 00003: val_loss did not improve from 0.23625
[{pass 1}] done in {6.302} s
Train on 246008 samples, validate on 61503 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.802483

Epoch 00001: val_loss did not improve from 0.23625
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.802677

Epoch 00002: val_loss did not improve from 0.23625
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.802578

Epoch 00003: val_loss did not improve from 0.23625
[{pass 2}] done in {4.


 ROC-AUC - epoch: 6 - score: 0.801996

Epoch 00006: val_loss did not improve from 0.23612
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.802732

Epoch 00007: val_loss did not improve from 0.23612
[{pass 2}] done in {10.695} s
Train on 246008 samples, validate on 61503 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.802694

Epoch 00001: val_loss did not improve from 0.23612
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.802306

Epoch 00002: val_loss did not improve from 0.23612
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.802822

Epoch 00003: val_loss did not improve from 0.23612
[{pass 3}] done in {4.373} s
Train on 246008 samples, validate on 61503 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.802703

Epoch 00001: val_loss did not improve from 0.23612
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.802835

Epoch 00002: val_loss did not improve from 0.23612
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.802810

Epoch 00003: val_loss did not improve from 0.23612
Epoch 4/10

 ROC-AUC - epoch: 4 - 

KeyboardInterrupt: 

In [None]:
test_preds = pd.DataFrame(test_set).T

In [290]:
test_preds.shape

(48744, 5)

In [291]:
y_hat = test_preds.rank(axis=0, method='min').mul(test_preds.shape[1] * [1 / test_preds.shape[1]]).sum(1) / test_preds.shape[0] 
    



sampl_sub = pd.read_csv('../data/sample_submission.csv')


sampl_sub['TARGET'] = y_hat.values

sampl_sub.to_csv("ann-stack-submission.csv", index=False)





sampl_sub.head()



Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.331692
1,100005,0.821771
2,100013,0.375628
3,100028,0.366006
4,100038,0.785926
