In [1]:
data_dir = './data/mlboot_dataset/'
results_dir = './results/'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp

In [2]:
df = pd.read_csv(data_dir + 'preprocessed_new.csv') 
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil()
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil()
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil()
print(mat1.shape, mat2.shape, mat3.shape)

df['max_f1'] = mat1.tocsr().max(axis=1).todense()
df['max_f2'] = mat2.tocsr().max(axis=1).todense()
df['max_f3'] = mat3.tocsr().max(axis=1).todense()

train_mat1 = mat1[df_train_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]

limit = 4
mat1 = mat1.tocsc()[:, np.where(train_mat1.getnnz(axis=0) > limit)[0]].tocsr()
mat2 = mat2.tocsc()[:, np.where(train_mat2.getnnz(axis=0) > limit)[0]].tocsr()
mat3 = mat3.tocsc()[:, np.where(train_mat3.getnnz(axis=0) > limit)[0]].tocsr()
del train_mat1, train_mat2, train_mat3
print(mat1.shape, mat2.shape, mat3.shape)

(609018, 2053602) (609018, 2812610) (609018, 1057788)
(609018, 490678) (609018, 20275) (609018, 25595)


In [3]:
X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [4]:
from sklearn.decomposition import TruncatedSVD
print('pca on matrix 1')
pca = TruncatedSVD(n_components = 200, algorithm='arpack')
mat1_pca = pca.fit_transform(mat1.astype(np.float32))
print('pca on matrix 3')
pca = TruncatedSVD(n_components = 200, algorithm='arpack')
mat3_pca = pca.fit_transform(mat3.astype(np.float32))

pca on matrix 1
pca on matrix 3


In [5]:
train_mat1 = mat1[df_train_index.tolist()]
test_mat1 = mat1[df_test_index.tolist()]
train_mat1_pca = mat1_pca[df_train_index.tolist()]
test_mat1_pca = mat1_pca[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]
train_mat3_pca = mat3_pca[df_train_index.tolist()]
test_mat3_pca = mat3_pca[df_test_index.tolist()]
import gc
del mat1,mat2,mat3,mat1_pca,mat3_pca
gc.collect()

0

In [6]:
from sklearn.preprocessing import normalize
train_mat1 = normalize(train_mat1, norm='l1', axis=1)
test_mat1 = normalize(test_mat1, norm='l1', axis=1)
train_mat2 = normalize(train_mat2, norm='l1', axis=1)
test_mat2 = normalize(test_mat2, norm='l1', axis=1)
train_mat3 = normalize(train_mat3, norm='l1', axis=1)
test_mat3 = normalize(test_mat3, norm='l1', axis=1)

In [7]:
y = X.target.values
from sklearn.feature_selection import SelectPercentile
sp = SelectPercentile(percentile=2)
sp.fit(train_mat1, y)
train_mat1 = sp.transform(train_mat1)
test_mat1 = sp.transform(test_mat1)
sp = SelectPercentile(percentile=2)
sp.fit(train_mat3, y)
train_mat3 = sp.transform(train_mat3)
test_mat3 = sp.transform(test_mat3)

In [8]:
import multiprocessing
cpu_cores = multiprocessing.cpu_count()

import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras import regularizers
from keras import optimizers
from keras import backend as K
from keras.utils import Sequence
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))

Using TensorFlow backend.


# Data preprocessing

In [9]:
df.columns

Index(['uid', 'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days', 'sum_values_f1_max',
       'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
       'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
       'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
       'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
       'mean_day_cntr', 'nuniq_keys_f1_cat0', 'nuniq_keys_f2_cat0',
       'nuniq_keys_f3_cat0', 'nuniq_keys_f1_cat1', 'nuniq_keys_f2_cat1',
       'nuniq_keys_f3_cat1', 'nuniq_keys_f1_cat2', 'nuniq_keys_f2_cat2',
       'nuniq_keys_f3_cat2', 'nuniq_keys_f1_cat3', 'nuniq_keys_f2_cat3',
       'nuniq_keys_f3_cat3', 'nuniq_keys_f1_cat4', 'nuniq_keys_f2_cat4',
       'nuniq_keys_f3_cat4', 'nuniq_keys_f1_cat5', 'nuniq_keys_f2_cat5',
       'nuniq_keys_f3_cat5', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
       'nuniq_key

In [10]:
train_cols = ['num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days', 'sum_values_f1_max',
       'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
       'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
       'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
       'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
       'mean_day_cntr', 'nuniq_keys_f1_cat0', 'nuniq_keys_f2_cat0',
       'nuniq_keys_f3_cat0', 'nuniq_keys_f1_cat1', 'nuniq_keys_f2_cat1',
       'nuniq_keys_f3_cat1', 'nuniq_keys_f1_cat2', 'nuniq_keys_f2_cat2',
       'nuniq_keys_f3_cat2', 'nuniq_keys_f1_cat3', 'nuniq_keys_f2_cat3',
       'nuniq_keys_f3_cat3', 'nuniq_keys_f1_cat4', 'nuniq_keys_f2_cat4',
       'nuniq_keys_f3_cat4', 'nuniq_keys_f1_cat5', 'nuniq_keys_f2_cat5',
       'nuniq_keys_f3_cat5', 'nuniq_keys_f1', 'sumval_keys_f1_cat0', 'sumval_keys_f2_cat0',
       'sumval_keys_f3_cat0', 'sumval_keys_f1_cat1', 'sumval_keys_f2_cat1',
       'sumval_keys_f3_cat1', 'sumval_keys_f1_cat2', 'sumval_keys_f2_cat2',
       'sumval_keys_f3_cat2', 'sumval_keys_f1_cat3', 'sumval_keys_f2_cat3',
       'sumval_keys_f3_cat3', 'sumval_keys_f1_cat4', 'sumval_keys_f2_cat4',
       'sumval_keys_f3_cat4', 'sumval_keys_f1_cat5', 'sumval_keys_f2_cat5',
       'sumval_keys_f3_cat5', 'sumval_keys_f1', 'most_freq_cat', 'diff_num_cats', 'unique_days',
       'max_f1', 'max_f2', 'max_f3']

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))
scaler.fit(X[train_cols].fillna(0).values)
X[train_cols] = scaler.transform(X[train_cols].fillna(0).values)
x_te[train_cols] = scaler.transform(x_te[train_cols].fillna(0).values)

In [12]:
import math
from sklearn.metrics import log_loss
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1
        K.set_value(self.model.optimizer.lr, self.clr())

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

            
class RocAucEvaluation(Callback):
    def __init__(self, X_seq, y, name, interval=1):
        super(Callback, self).__init__()
        self.X_seq, self.y = X_seq, y
        self.name = name
        self.interval = interval

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict_generator(self.X_seq, steps=len(self.X_seq), 
                                                          use_multiprocessing=False, workers=1, 
                                                          max_queue_size=4*cpu_cores).ravel()
            auc = roc_auc_score(self.y, y_pred)
            logloss = log_loss(self.y, y_pred)
            logs[self.name+"_auc"] = auc
            logs[self.name+"_logloss"] = logloss
            print((self.name+"_auc: {:.8f}; "+"_logloss: {:.8f}; ").format(auc,logloss))
            
class FeatureSequence(Sequence):
    
    def __init__(self, X, y, inx, batch_size, shuffle=False):
        
        self.X, self.y = X, y
        self.batch_size = batch_size
        
        self.inx = inx
        self.shuffle = shuffle
        if self.shuffle:
            np.random.shuffle(self.inx)

    def __len__(self):
        return math.ceil(self.inx.shape[0] / self.batch_size)

    def __getitem__(self, i):
        batch_inx = self.inx[i*self.batch_size:(i+1)*self.batch_size]
        
        batch = [x[batch_inx] for x in self.X[:3]] +  [x[batch_inx].todense() for x in self.X[-3:]]
        #batch = [x[batch_inx].todense() for x in self.X]
        return batch, self.y[batch_inx]
    
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.inx)            

In [13]:
train_mat1.shape

(427994, 9814)

In [23]:
model_name = "all_in_focal_loss2"

def split_inputs(X):
    return np.split(X, X.shape[-1], axis=-1)

def buildBaseModel():   
    num_inp = Input((len(train_cols),), name='num_inp')
    num_x = BatchNormalization()(num_inp)
    num_x = Dense(64, activation="relu")(num_x)
    num_x = BatchNormalization()(num_x)
    num_x = Dropout(0.5)(num_x)
    
    dense_inp1 = Input((train_mat1_pca.shape[1],), name='dense_inp1')
    dense1_x = BatchNormalization()(dense_inp1)
    dense1_x = Dropout(0.5)(dense1_x)
    
    dense_inp3 = Input((train_mat3_pca.shape[1],), name='dense_inp3')
    dense3_x = BatchNormalization()(dense_inp3)
    dense3_x = Dropout(0.3)(dense3_x)
    
    sparse_inp1 = Input((train_mat1.shape[1],), name='sparse_inp1')
    sparse1_x = BatchNormalization()(sparse_inp1)
    sparse1_x = Dense(256, activation="relu")(sparse1_x)
    sparse1_x = BatchNormalization()(sparse1_x)
    sparse1_x = Dropout(0.3)(sparse1_x)
    
    sparse_inp2 = Input((train_mat2.shape[1],), name='sparse_inp2')
    sparse2_x = BatchNormalization()(sparse_inp2)
    sparse2_x = Dense(256, activation="relu")(sparse2_x)
    sparse2_x = BatchNormalization()(sparse2_x)
    sparse2_x = Dropout(0.3)(sparse2_x)
    
    sparse_inp3 = Input((train_mat3.shape[1],), name='sparse_inp3')
    sparse3_x = BatchNormalization()(sparse_inp3)
    sparse3_x = Dense(64, activation="relu")(sparse3_x)
    sparse3_x = BatchNormalization()(sparse3_x)
    sparse3_x = Dropout(0.3)(sparse3_x)    
    
    x = concatenate([num_x, dense1_x, dense3_x, sparse1_x, sparse3_x, sparse2_x]) #, sparse1_x, sparse3_x
    #x = num_x
    
    x1 = Dense(512, activation="relu")(x)
    x1 = BatchNormalization()(x1)
    x1 = Dropout(0.3)(x1)
    
    x1 = Dense(256, activation="relu")(x1)
    x1 = BatchNormalization()(x1)
    x1 = Dropout(0.3)(x1)
    
    x4 = Dense(128, activation="relu")(x1)
    x4 = BatchNormalization()(x4)
    x4 = Dropout(0.3)(x4)

    x4 = Dense(64, activation="relu")(x4)
    x4 = BatchNormalization()(x4)
    x4 = Dropout(0.1)(x4)
    
    x_output = Dense(1, activation="sigmoid", name="output")(x4)
    return Model(inputs = [num_inp, dense_inp1, dense_inp3, sparse_inp1, sparse_inp3, sparse_inp2], outputs=x_output) #num_inp, ,  sparse_inp3

model = buildBaseModel()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
num_inp (InputLayer)            (None, 67)           0                                            
__________________________________________________________________________________________________
sparse_inp1 (InputLayer)        (None, 9814)         0                                            
__________________________________________________________________________________________________
sparse_inp3 (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
sparse_inp2 (InputLayer)        (None, 20275)        0                                            
__________________________________________________________________________________________________
batch_norm

In [24]:
y = X.target.values

In [35]:
from sklearn.model_selection import KFold
TRN_BATCH_SIZE = 512
INF_BATCH_SIZE = 512

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=239)
epochs = 32
pred = np.zeros(y.shape)
test_pred = 0
ifold = 0

fold_auc = []

In [36]:
cpu_cores = 4

def focal_loss(y_true, y_pred, alpha, gamma=0.5):
    alpha = K.variable(alpha)
    pt = K.abs(1. - y_true - y_pred)
    pt = K.clip(pt, K.epsilon(), 1. - K.epsilon())
    return K.mean(-alpha * K.pow(1. - pt, gamma) * K.log(pt), axis=-1)

te_seq = FeatureSequence([x_te[train_cols].values, 
                               test_mat1_pca,
                               test_mat3_pca,
                               test_mat1,
                               test_mat3,
                               test_mat2,
                              
                         ], 
                              y, np.array(list(range(len(x_te)))), INF_BATCH_SIZE, shuffle=False)

for trn_inx, val_inx in kf.split(y):
    print("Training fold {}".format(ifold))
    K.clear_session()
    model = buildBaseModel()
    
    model_file_name = model_name+"__f"+str(ifold)
    model_file = results_dir+model_file_name+'.h5'   
    
    trn_seq = FeatureSequence([X.loc[:,train_cols].values, 
                               train_mat1_pca,
                               train_mat3_pca,
                               train_mat1,
                               train_mat3,
                               train_mat2
                               
                              ], 
                              y, trn_inx, TRN_BATCH_SIZE, shuffle=True)
    val_seq = FeatureSequence([X.loc[:,train_cols].values, 
                               train_mat1_pca,
                               train_mat3_pca,
                               train_mat1,
                               train_mat3,
                               train_mat2
                              ], 
                              y, val_inx, INF_BATCH_SIZE, shuffle=False)
    
    # Callbacks
    model_checkpoint = ModelCheckpoint(model_file, monitor='val_auc', verbose=1, mode='max',
                                       save_best_only=True, save_weights_only=False, period=1)
    clr = CyclicLR(base_lr=0.00008, max_lr=0.0003, step_size=2*math.ceil(len(trn_seq)), mode='triangular2')
    early_stop = EarlyStopping(monitor='val_auc', min_delta=0, patience=3, verbose=1, mode='max')
    mse_eval = RocAucEvaluation(val_seq, y[val_inx], 'val')
    
    alpha = 10
    gamma = 0.05
    #alpha = 0.25
    #gamma = 2
    # Training
    opt=optimizers.Adam()
    model.compile(optimizer=opt, loss=lambda y_true, y_pred: focal_loss(y_true, y_pred, alpha=alpha, gamma=gamma))

    model.fit_generator(
        generator=trn_seq, steps_per_epoch=len(trn_seq),
        initial_epoch=0, epochs=epochs, shuffle=False, verbose=1,
        callbacks=[mse_eval, model_checkpoint, early_stop, clr], #
        use_multiprocessing=False, workers=1, max_queue_size=4*cpu_cores)
    
     
    # Predicting
    print("\nPredicting fold {}".format(ifold))
    del model  
    model = load_model(model_file, compile=True, custom_objects={'<lambda>':lambda y_true, y_pred: focal_loss(y_true, y_pred, alpha, gamma)})
    pred[val_inx] = model.predict_generator(val_seq, steps=len(val_seq), 
                                                    use_multiprocessing=False, workers=1, 
                                                    max_queue_size=4*cpu_cores).ravel()
    
    auc = roc_auc_score(y[val_inx], pred[val_inx])
    logloss = log_loss(y[val_inx], pred[val_inx])
    fold_auc.append(auc)
    print("fold: {}, auc: {}".format(ifold, auc))
    print("fold: {}, logloss: {}".format(ifold, logloss))
    print()
    
    test_pred += model.predict_generator(te_seq, steps=len(te_seq), 
                                                    use_multiprocessing=False, workers=1, 
                                                    max_queue_size=4*cpu_cores).ravel()/n_folds
    ifold += 1
    gc.collect()

Training fold 0
Epoch 1/32
val_auc: 0.53495128; _logloss: 0.34449428; 

Epoch 00001: val_auc improved from -inf to 0.53495, saving model to ./results/all_in_focal_loss2__f0.h5
Epoch 2/32


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


val_auc: 0.62354497; _logloss: nan; 

Epoch 00002: val_auc improved from 0.53495 to 0.62354, saving model to ./results/all_in_focal_loss2__f0.h5
Epoch 3/32
val_auc: 0.67126569; _logloss: 0.18818741; 

Epoch 00003: val_auc improved from 0.62354 to 0.67127, saving model to ./results/all_in_focal_loss2__f0.h5
Epoch 4/32
val_auc: 0.67913337; _logloss: nan; 

Epoch 00004: val_auc improved from 0.67127 to 0.67913, saving model to ./results/all_in_focal_loss2__f0.h5
Epoch 5/32
val_auc: 0.67270620; _logloss: nan; 

Epoch 00005: val_auc did not improve
Epoch 6/32
val_auc: 0.66594302; _logloss: 0.18908043; 

Epoch 00006: val_auc did not improve
Epoch 7/32
val_auc: 0.63942794; _logloss: 0.19793087; 

Epoch 00007: val_auc did not improve
Epoch 00007: early stopping

Predicting fold 0
fold: 0, auc: 0.6791333664356554
fold: 0, logloss: 0.18800643544981258

Training fold 1
Epoch 1/32
val_auc: 0.55882924; _logloss: nan; 

Epoch 00001: val_auc improved from -inf to 0.55883, saving model to ./results/al

In [33]:
np.max(pred[val_inx])

1.0

In [30]:
pred[val_inx] = model.predict_generator(val_seq, steps=len(val_seq), 
                                                use_multiprocessing=False, workers=1, 
                                                max_queue_size=4*cpu_cores).ravel()
roc_auc_score(y[val_inx], pred[val_inx])

0.65678267215533004

In [37]:
print(fold_auc)
print(np.mean(fold_auc), np.std(fold_auc))

[0.67913336643565536, 0.66947506902045895, 0.67372815584920964, 0.67654302981288006, 0.6713488910715073]
0.674045702438 0.00347520607131


In [38]:
roc_auc_score(X.target.values, pred)

0.67178394143798681

In [140]:
print(fold_auc)
print(np.mean(fold_auc), np.std(fold_auc))

[0.67471445792908868, 0.6776204840472132, 0.66228297937384784, 0.65440382200465985, 0.68059564081109813, 0.6651061611077137, 0.66869623987870552, 0.66059282869963254, 0.67129747976272391, 0.66042014410130478, 0.66056694779168046]
0.666936107773 0.00791248017156


In [42]:
roc_auc_score(X.target.values, pred)

0.67178394143798681

In [39]:
model_name = 'all_in_focal_loss5'
np.save(results_dir + 'train_' + model_name +'.npy', pred)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

In [40]:
sub = x_te[['uid','target']].copy()
sub['target'] = test_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.055612
1,ac4b8244f3ae82df511b002257473c11,0.047963
2,483d8b91e49522c8a5bbe37f3872c749,0.123365
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.022827
4,fdbfba9842ff0bf86d600eb334c7c42b,0.044397


In [41]:
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)

In [40]:
print(fold_auc)
print(np.mean(fold_auc), np.std(fold_auc))

[0.6644702445490992, 0.64294169489778663, 0.63130878709266391, 0.65312112328188054, 0.65141053255286208, 0.65287791990095678, 0.65218096819032323, 0.65849314197669018, 0.64645771109742023, 0.64635569082469579]
0.649961781436 0.0085515308576


In [None]:
from sklearn.preprocessing import minmax_scale
a = pd.DataFrame()
a['sol0'] = minmax_scale(pd.read_csv(results_dir + 'nn_base_model.csv', header=None)[0].values)
a['sol2'] = minmax_scale(pd.read_csv(results_dir + 'baseline_sparse_10folds.csv', header=None)[0].values)
a.corr()

In [62]:
a['target'] = a.sol0*0.05 + a.sol2*0.95

a.head()

Unnamed: 0,sol0,sol2,target
0,0.325992,0.184876,0.191932
1,0.231779,0.096343,0.103115
2,0.242137,0.151491,0.156023
3,0.214527,0.055863,0.063797
4,0.132649,0.051491,0.055549


In [63]:
a.corr()

Unnamed: 0,sol0,sol2,target
sol0,1.0,0.323359,0.441124
sol2,0.323359,1.0,0.991874
target,0.441124,0.991874,1.0


In [64]:
a[['target']].to_csv(results_dir + 'blend' + '.csv', header=False, index=False)