In [1]:
data_dir = './data/mlboot_dataset/'
results_dir = './results/'
model_name = 'nn_we_v4'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp

In [2]:
df = pd.read_csv(data_dir + 'preprocessed_new.csv') 
q = pd.read_csv(data_dir + 'sessions.csv')
df = df.merge(q, on='uid', how='left')
del q
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

In [3]:
data_svd = pd.DataFrame(np.load(data_dir + 'pca_cat10.npy'), index=df.index)
data_svd.columns = ['svd_'+str(i+1) for i in range(10)]
df = pd.concat([df, data_svd], axis=1)    
del data_svd

In [4]:
from sklearn.decomposition import TruncatedSVD

mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil()
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil()
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil()

mat = sp.hstack([mat1,mat2,mat3]).tolil()
del mat1,mat2,mat3

In [5]:
train_mat = mat[df_train_index.tolist()]
test_mat = mat[df_test_index.tolist()]
mat = mat.tocsc()[:, np.where((train_mat.getnnz(axis=0) > 1) & (test_mat.getnnz(axis=0) > 1))[0]].tocsr()

In [6]:
import tqdm
idcs = []
for q in tqdm.tqdm(range(len(df))):
    idcs.append(mat[q].nonzero()[1] + 1)
df['idcs'] = np.array(idcs)

100%|██████████| 609018/609018 [01:22<00:00, 7342.00it/s]


In [7]:
df['len_idcs'] = df.idcs.apply(len)

In [8]:
df.len_idcs.describe()

count    609018.000000
mean        749.315358
std         903.071800
min           0.000000
25%         156.000000
50%         450.000000
75%        1001.000000
max       17519.000000
Name: len_idcs, dtype: float64

In [9]:
df['max_idc'] = df.idcs.apply(lambda r : np.max(r) if len(r) > 0 else 0)
np.max(df.max_idc)

378668

In [10]:
del mat,train_mat,test_mat

In [11]:
import gc
gc.collect()

21

In [12]:
def padding(r):
    if len(r) > 750:
        return(r[:750])
    return r+[0]*(750-len(r))
df['idcs'] = df['idcs'].apply(lambda r : np.array(padding(r.tolist())))

In [None]:
df[['uid','idcs']].to_csv(data_dir + 'indices.csv',index=False)

In [13]:
mat_pca = np.load(data_dir + 'pca_cat100.npy')

In [14]:
X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [15]:
from sklearn.preprocessing import MaxAbsScaler 
scaler_mat = MaxAbsScaler()
mat_pca = scaler_mat.fit_transform(mat_pca)
train_mat_pcat = mat_pca[df_train_index.tolist()]
test_mat_pcat = mat_pca[df_test_index.tolist()]

In [16]:
import multiprocessing
cpu_cores = multiprocessing.cpu_count()

import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras import regularizers
from keras import optimizers
from keras import backend as K
from keras.utils import Sequence
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))

Using TensorFlow backend.


# Data preprocessing

In [17]:
df.columns

Index(['uid', 'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days', 'sum_values_f1_max',
       'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
       'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
       'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
       'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
       'mean_day_cntr', 'nuniq_keys_f1_cat0', 'nuniq_keys_f2_cat0',
       'nuniq_keys_f3_cat0', 'nuniq_keys_f1_cat1', 'nuniq_keys_f2_cat1',
       'nuniq_keys_f3_cat1', 'nuniq_keys_f1_cat2', 'nuniq_keys_f2_cat2',
       'nuniq_keys_f3_cat2', 'nuniq_keys_f1_cat3', 'nuniq_keys_f2_cat3',
       'nuniq_keys_f3_cat3', 'nuniq_keys_f1_cat4', 'nuniq_keys_f2_cat4',
       'nuniq_keys_f3_cat4', 'nuniq_keys_f1_cat5', 'nuniq_keys_f2_cat5',
       'nuniq_keys_f3_cat5', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
       'nuniq_key

In [18]:
train_cols = ['sess_keys_mean','sess_keys_max','diff_key1_mean','diff_key1_max','diff_key2_mean',
       'diff_key2_max','diff_key3_mean','diff_key3_max','quot_key1_mean','quot_key1_max',
       'quot_key2_mean','quot_key2_max','quot_key3_mean','quot_key3_max',
       'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days', 'sum_values_f1_max',
       'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
       'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
       'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
       'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
       'mean_day_cntr', 'nuniq_keys_f1_cat0', 'nuniq_keys_f2_cat0',
       'nuniq_keys_f3_cat0', 'nuniq_keys_f1_cat1', 'nuniq_keys_f2_cat1',
       'nuniq_keys_f3_cat1', 'nuniq_keys_f1_cat2', 'nuniq_keys_f2_cat2',
       'nuniq_keys_f3_cat2', 'nuniq_keys_f1_cat3', 'nuniq_keys_f2_cat3',
       'nuniq_keys_f3_cat3', 'nuniq_keys_f1_cat4', 'nuniq_keys_f2_cat4',
       'nuniq_keys_f3_cat4', 'nuniq_keys_f1_cat5', 'nuniq_keys_f2_cat5',
       'nuniq_keys_f3_cat5', 'nuniq_keys_f1', 'sumval_keys_f1_cat0', 'sumval_keys_f2_cat0',
       'sumval_keys_f3_cat0', 'sumval_keys_f1_cat1', 'sumval_keys_f2_cat1',
       'sumval_keys_f3_cat1', 'sumval_keys_f1_cat2', 'sumval_keys_f2_cat2',
       'sumval_keys_f3_cat2', 'sumval_keys_f1_cat3', 'sumval_keys_f2_cat3',
       'sumval_keys_f3_cat3', 'sumval_keys_f1_cat4', 'sumval_keys_f2_cat4',
       'sumval_keys_f3_cat4', 'sumval_keys_f1_cat5', 'sumval_keys_f2_cat5',
       'sumval_keys_f3_cat5', 'sumval_keys_f1', 'most_freq_cat', 'diff_num_cats', 'unique_days'] + ['svd_'+str(i+1) for i in range(10)]

In [19]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))
scaler.fit(X[train_cols].fillna(0).values)
X[train_cols] = scaler.transform(X[train_cols].fillna(0).values)
x_te[train_cols] = scaler.transform(x_te[train_cols].fillna(0).values)

In [20]:
import math
from sklearn.metrics import log_loss
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1
        K.set_value(self.model.optimizer.lr, self.clr())

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

            
class RocAucEvaluation(Callback):
    def __init__(self, X_seq, y, name, interval=1):
        super(Callback, self).__init__()
        self.X_seq, self.y = X_seq, y
        self.name = name
        self.interval = interval

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict_generator(self.X_seq, steps=len(self.X_seq), 
                                                          use_multiprocessing=False, workers=1, 
                                                          max_queue_size=4*cpu_cores).ravel()
            auc = roc_auc_score(self.y, y_pred)
            logloss = log_loss(self.y, y_pred)
            logs[self.name+"_auc"] = auc
            logs[self.name+"_logloss"] = logloss
            print((self.name+"_auc: {:.8f}; "+"_logloss: {:.8f}; ").format(auc,logloss))
            
class FeatureSequence(Sequence):
    
    def __init__(self, X, y, inx, batch_size, shuffle=False):
        
        self.X, self.y = X, y
        self.batch_size = batch_size
        
        self.inx = inx
        self.shuffle = shuffle
        if self.shuffle:
            np.random.shuffle(self.inx)

    def __len__(self):
        return math.ceil(self.inx.shape[0] / self.batch_size)

    def __getitem__(self, i):
        batch_inx = self.inx[i*self.batch_size:(i+1)*self.batch_size]
        
        batch = [x[batch_inx] for x in self.X[:2]] +  [x[batch_inx] for x in [self.X[-1]]]
        return batch, self.y[batch_inx]
    
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.inx)            

In [41]:
"""
    Hash embedding layer. Note that the zero word index is always used for masking.
    # Properties
        max_word_idx: maximum word index (e.g. the maximum dictionary value).
        num_buckets: number of buckets
        embedding_size: size of embedding
        num_hash_functions: number of hash functions
        W_trainable = True, if the embedding should be trainable
        p_trainable = True, if the importance parameters should be trainable
        append_weight, True if the importance parameters should be appended
        aggregation_mode: either 'sum' or 'concatenate' depending on whether
                        the component vectors should be summed or concatenated
"""
class HashEmbedding(Layer):
    def __init__(self, max_word_idx = 378669, num_buckets = 40000, embedding_size = 37, num_hash_functions=2,
                 W_trainable=True, p_trainable = True, append_weight= True, aggregation_mode = 'sum', seed=3, **kwargs):
        super(HashEmbedding, self).__init__(**kwargs)
        np.random.seed(seed)
        self.word_count = max_word_idx
        W = np.random.normal(0, 0.1, (num_buckets, embedding_size))
        self.num_buckets = W.shape[0]
        self.mask_zero = True
        self.append_weight = append_weight
        self.p = None
        self.trainable_weights = []
        self.p_trainable = p_trainable
        self.num_hashes = num_hash_functions
        self.p_init_std = 0.0005

        self.num_hash_functions = num_hash_functions
        self.hashing_vals = []
        self.hashing_offset_vals = []


        # Initialize hash table. Note that this could easily be implemented by a modulo operation
        tab = (np.random.randint(0, 2 ** 30, size=(self.word_count, self.num_hash_functions)) % self.num_buckets) + 1
        self.hash_tables = K.variable(tab, dtype='int32')

        # Initialize word importance parameters
        p_init = np.random.normal(0, self.p_init_std, (self.word_count, self.num_hashes))
        self.p = K.variable(p_init,name='p_hash')
        if self.p_trainable:
            self.trainable_weights.append(self.p)


        #Initialize the embedding matrix
        # add zero vector for nulls (for masking)
        W = np.row_stack((np.zeros((1, W.shape[1])), W)).astype('float32')
        self.embedding_size = W.shape[1]
        W_shared = K.variable(W, name='W_hash')
        self.W = W_shared
        if W_trainable:
            self.trainable_weights.append(self.W)

        if aggregation_mode == 'sum':
            self.aggregation_function = sum
        else:
            if aggregation_mode == 'concatenate':
                self.aggregation_function = lambda x: K.concatenate(x,axis = -1)
            else:
                raise('unknown aggregation function')
        self.aggregation_mode = aggregation_mode

    def compute_mask(self, x, mask=None):
        if not self.mask_zero:
            return None
        else:
            return K.not_equal(x, 0)

    def call(self, input, mask=None):
        W = self.W
        pvals = []
        retvals = []
        input_w = input%self.word_count
        input_p = (3+input)%self.word_count
        idx_bucket_all = K.gather(self.hash_tables, input_w)
        for hash_fun_num in range(self.num_hash_functions):
            W0 = K.gather(W, idx_bucket_all[:,:,hash_fun_num]*(1-K.cast(K.equal(0, input_w), 'int32')))
            p_0 = K.gather(self.p[:,hash_fun_num], input_p)
            p = K.expand_dims(p_0, -1)
            pvals.append(p)
            retvals.append(W0*p)
        retval = self.aggregation_function(retvals)
        if self.append_weight:
            retval = K.concatenate([retval]+pvals,axis=-1)
        return retval


    def compute_output_shape(self, input_shape):
        weight_addition = 0
        if self.append_weight:
            weight_addition = self.num_hash_functions
        if self.aggregation_mode == 'sum':
            return (input_shape[0], input_shape[1], self.embedding_size+weight_addition)
        else:
            return (input_shape[0], input_shape[1], self.embedding_size * self.num_hash_functions + weight_addition)


class ReduceSum(Layer):
    def __init__(self, **kwargs):
        self.supports_masking = True

        super(ReduceSum, self).__init__(**kwargs)

    def call(self, x, mask=None):
        x, m = x
        x = x * K.cast(K.expand_dims(K.not_equal(m,0), -1), 'float32')
        x = K.cast(x, 'float32')
        return K.sum(x, axis=1,keepdims=False)

    def compute_mask(self, input, mask=None):
        return None

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][2])

In [42]:
def split_inputs(X):
    return np.split(X, X.shape[-1], axis=-1)

def buildBaseModel():   
    num_inp = Input((len(train_cols),), name='num_inp')
    num_x = BatchNormalization()(num_inp)
    num_x = Dense(128, activation="relu")(num_x)
    num_x = BatchNormalization()(num_x)
    num_x = Dropout(0.7)(num_x)
    
    dense_inp = Input((train_mat_pcat.shape[1],), name='dense_inp')
    dense_x = BatchNormalization()(dense_inp)
    dense_x = Dropout(0.7)(dense_x)

    embedding = HashEmbedding()    
    input_words = Input([None], dtype='int32', name='input_words')
    sparse_x = embedding(input_words)
    sparse_x = ReduceSum()([sparse_x, input_words])
    sparse_x = BatchNormalization()(sparse_x)
    #sparse_x = Dense(32, activation="relu")(sparse_x)
    #sparse_x = BatchNormalization()(sparse_x)
    sparse_x = Dropout(0.7)(sparse_x)
    
    x = concatenate([num_x, dense_x, sparse_x]) #
    
    x1 = Dense(1024, activation="relu")(x)
    x1 = BatchNormalization()(x1)
    x1 = Dropout(0.6)(x1)
    
    x1 = Dense(512, activation="relu")(x1)
    x1 = BatchNormalization()(x1)
    x1 = Dropout(0.6)(x1)
    
    x1 = Dense(256, activation="relu")(x1)
    x1 = BatchNormalization()(x1)
    x1 = Dropout(0.6)(x1)
    
    x4 = Dense(128, activation="relu")(x1)
    x4 = BatchNormalization()(x4)
    x4 = Dropout(0.6)(x4)

    x4 = Dense(64, activation="relu")(x4)
    x4 = BatchNormalization()(x4)
    x4 = Dropout(0.5)(x4)
    
    x_output = Dense(1, activation="sigmoid", name="output")(x4)
    return Model(inputs = [num_inp, dense_inp, input_words], outputs=x_output)

model = buildBaseModel()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
num_inp (InputLayer)            (None, 88)           0                                            
__________________________________________________________________________________________________
input_words (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
batch_normalization_10 (BatchNo (None, 88)           352         num_inp[0][0]                    
__________________________________________________________________________________________________
hash_embedding_2 (HashEmbedding (None, None, 39)     2237375     input_words[0][0]                
__________________________________________________________________________________________________
dense_7 (D

In [43]:
y = X.target.values

In [44]:
from sklearn.model_selection import KFold
TRN_BATCH_SIZE = 512
INF_BATCH_SIZE = 512

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=31239)
epochs = 60
pred = np.zeros(y.shape)
test_pred = 0
ifold = 0

fold_auc = []

import gc
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import MaxAbsScaler 
cpu_cores = 4

def focal_loss(y_true, y_pred, alpha, gamma=0.5):
    alpha = K.variable(alpha)
    pt = K.abs(1. - y_true - y_pred)
    pt = K.clip(pt, K.epsilon(), 1. - K.epsilon())
    return K.mean(-alpha * K.pow(1. - pt, gamma) * K.log(pt), axis=-1)

for trn_inx, val_inx in kf.split(y):
    print("Training fold {}".format(ifold))
    K.clear_session()
    
    model_file_name = model_name+"__f"+str(ifold)
    model_file = results_dir+model_file_name+'.h5'   
        
    model = buildBaseModel()
    
    trn_seq = FeatureSequence([X.loc[trn_inx,train_cols].values, 
                               train_mat_pcat,
                               np.vstack(X.loc[trn_inx,'idcs'].values)
                              ], 
                              y[trn_inx], np.array(list(range(len(trn_inx)))), TRN_BATCH_SIZE, shuffle=True)
    val_seq = FeatureSequence([X.loc[val_inx,train_cols].values, 
                               train_mat_pcat[val_inx],
                               np.vstack(X.loc[val_inx,'idcs'].values)
                              ], 
                              y[val_inx], np.array(list(range(len(val_inx)))), INF_BATCH_SIZE, shuffle=False)
    te_seq = FeatureSequence([x_te[train_cols].values, 
                               test_mat_pcat,
                               np.vstack(x_te.idcs.values)
                             ], 
                              y, np.array(list(range(len(x_te)))), INF_BATCH_SIZE, shuffle=False)
        
    # Callbacks
    model_checkpoint = ModelCheckpoint(model_file, monitor='val_auc', verbose=1, mode='max',
                                       save_best_only=True, save_weights_only=False, period=1)
    clr = CyclicLR(base_lr=0.00005, max_lr=0.0005, step_size=2*math.ceil(len(trn_seq)), mode='triangular2')
    early_stop = EarlyStopping(monitor='val_auc', min_delta=0, patience=3, verbose=1, mode='max')
    mse_eval = RocAucEvaluation(val_seq, y[val_inx], 'val')
    
    alpha = 10
    gamma = 0.05
    #alpha = 0.25
    #gamma = 2
    # Training
    opt=optimizers.Nadam()
    #model.compile(optimizer=opt, loss=lambda y_true, y_pred: focal_loss(y_true, y_pred, alpha=alpha, gamma=gamma))
    model.compile(optimizer=opt, loss='binary_crossentropy')

    model.fit_generator(
        generator=trn_seq, steps_per_epoch=len(trn_seq),
        initial_epoch=0, epochs=epochs, shuffle=False, verbose=2,
        callbacks=[mse_eval, model_checkpoint, early_stop, clr], #
        class_weight={0:0.06,1:0.94},
        use_multiprocessing=False, workers=1, max_queue_size=4*cpu_cores)
    
     
    # Predicting
    print("\nPredicting fold {}".format(ifold))
    del model  
    model = load_model(model_file, compile=True, custom_objects={'HashEmbedding':HashEmbedding,'ReduceSum':ReduceSum})
    pred[val_inx] = model.predict_generator(val_seq, steps=len(val_seq), 
                                                    use_multiprocessing=False, workers=1, 
                                                    max_queue_size=4*cpu_cores).ravel()
    
    auc = roc_auc_score(y[val_inx], pred[val_inx])
    logloss = log_loss(y[val_inx], pred[val_inx])
    fold_auc.append(auc)
    print("fold: {}, auc: {}".format(ifold, auc))
    print("fold: {}, logloss: {}".format(ifold, logloss))
    print()
    
    test_pred += model.predict_generator(te_seq, steps=len(te_seq), 
                                                    use_multiprocessing=False, workers=1, 
                                                    max_queue_size=4*cpu_cores).ravel()/n_folds
    ifold += 1
    
    gc.collect()

Training fold 0
Epoch 1/60
 - 37s - loss: 0.0975
val_auc: 0.48553972; _logloss: 0.55264953; 

Epoch 00001: val_auc improved from -inf to 0.48554, saving model to ./results/nn_we_v3__f0.h5
Epoch 2/60
 - 33s - loss: 0.0807
val_auc: 0.52631309; _logloss: 0.57766869; 

Epoch 00002: val_auc improved from 0.48554 to 0.52631, saving model to ./results/nn_we_v3__f0.h5
Epoch 3/60
 - 33s - loss: 0.0747
val_auc: 0.55541822; _logloss: 0.60882694; 

Epoch 00003: val_auc improved from 0.52631 to 0.55542, saving model to ./results/nn_we_v3__f0.h5
Epoch 4/60
 - 34s - loss: 0.0730
val_auc: 0.56135020; _logloss: 0.60781251; 

Epoch 00004: val_auc improved from 0.55542 to 0.56135, saving model to ./results/nn_we_v3__f0.h5
Epoch 5/60
 - 34s - loss: 0.0727
val_auc: 0.57104963; _logloss: 0.60567294; 

Epoch 00005: val_auc improved from 0.56135 to 0.57105, saving model to ./results/nn_we_v3__f0.h5
Epoch 6/60
 - 34s - loss: 0.0723
val_auc: 0.58177708; _logloss: 0.60545672; 

Epoch 00006: val_auc improved from

 - 34s - loss: 0.0699
val_auc: 0.65886940; _logloss: 0.52320862; 

Epoch 00009: val_auc improved from 0.62793 to 0.65887, saving model to ./results/nn_we_v3__f3.h5
Epoch 10/60
 - 34s - loss: 0.0679
val_auc: 0.67017801; _logloss: 0.51557934; 

Epoch 00010: val_auc improved from 0.65887 to 0.67018, saving model to ./results/nn_we_v3__f3.h5
Epoch 11/60
 - 34s - loss: 0.0652
val_auc: 0.66940031; _logloss: 0.48615981; 

Epoch 00011: val_auc did not improve
Epoch 12/60
 - 34s - loss: 0.0621
val_auc: 0.66449533; _logloss: 0.46041997; 

Epoch 00012: val_auc did not improve
Epoch 13/60
 - 34s - loss: 0.0597
val_auc: 0.65647097; _logloss: 0.44019393; 

Epoch 00013: val_auc did not improve
Epoch 00013: early stopping

Predicting fold 3
fold: 3, auc: 0.6701780072958545
fold: 3, logloss: 0.5155793344661259

Training fold 4
Epoch 1/60
 - 38s - loss: 0.0971
val_auc: 0.55346655; _logloss: 0.49942147; 

Epoch 00001: val_auc improved from -inf to 0.55347, saving model to ./results/nn_we_v3__f4.h5
Epoch 

 - 42s - loss: 0.0721
val_auc: 0.59419158; _logloss: 0.58562540; 

Epoch 00006: val_auc improved from 0.58242 to 0.59419, saving model to ./results/nn_we_v3__f7.h5
Epoch 7/60
 - 42s - loss: 0.0707
val_auc: 0.65777308; _logloss: 0.55159588; 

Epoch 00007: val_auc improved from 0.59419 to 0.65777, saving model to ./results/nn_we_v3__f7.h5
Epoch 8/60
 - 39s - loss: 0.0684
val_auc: 0.67409644; _logloss: 0.52884824; 

Epoch 00008: val_auc improved from 0.65777 to 0.67410, saving model to ./results/nn_we_v3__f7.h5
Epoch 9/60
 - 42s - loss: 0.0668
val_auc: 0.68150503; _logloss: 0.51606579; 

Epoch 00009: val_auc improved from 0.67410 to 0.68151, saving model to ./results/nn_we_v3__f7.h5
Epoch 10/60
 - 41s - loss: 0.0652
val_auc: 0.68061563; _logloss: 0.47347408; 

Epoch 00010: val_auc did not improve
Epoch 11/60
 - 42s - loss: 0.0623
val_auc: 0.67508321; _logloss: 0.45743295; 

Epoch 00011: val_auc did not improve
Epoch 12/60
 - 39s - loss: 0.0585
val_auc: 0.66546037; _logloss: 0.43475477; 



In [45]:
print(fold_auc)
print(np.mean(fold_auc), np.std(fold_auc))
roc_auc_score(X.target.values, pred)

[0.68844837917165991, 0.67650598076953861, 0.67525259300480944, 0.67017800729585453, 0.68378521827936078, 0.67084569038186515, 0.67685253261110634, 0.68150503170545651, 0.67459303649937374, 0.67577252157510848]
0.677373899129 0.00539477894431


0.67658314668821706

In [46]:
np.save(results_dir + 'train_' + model_name +'.npy', pred)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

In [47]:
sub = x_te[['uid','target']].copy()
sub['target'] = test_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.51498
1,ac4b8244f3ae82df511b002257473c11,0.468628
2,483d8b91e49522c8a5bbe37f3872c749,0.559089
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.362585
4,fdbfba9842ff0bf86d600eb334c7c42b,0.217259


In [48]:
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)