In [25]:
data_dir = './data/mlboot_dataset/'
model_name = 'fm_svd_3br'
results_dir = './results/'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import scipy.sparse as sp

In [2]:
df = pd.read_csv(data_dir + 'preprocessed_new.csv') 
q = pd.read_csv(data_dir + 'sessions.csv')
df = df.merge(q, on='uid', how='left')
del q
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil()
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil()
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil()
print(mat1.shape, mat2.shape, mat3.shape)

df['max_f1'] = mat1.tocsr().max(axis=1).todense()
df['max_f2'] = mat2.tocsr().max(axis=1).todense()
df['max_f3'] = mat3.tocsr().max(axis=1).todense()

print(mat1.shape, mat2.shape, mat3.shape)

(609018, 2053602) (609018, 20275) (609018, 1057788)
(609018, 2053602) (609018, 20275) (609018, 1057788)


In [3]:
from sklearn.decomposition import TruncatedSVD
print('pca on matrix 1')
pca = TruncatedSVD(n_components = 300, algorithm='arpack')
mat1 = pca.fit_transform(mat1.astype(np.float32))
print('pca on matrix 2')
pca = TruncatedSVD(n_components = 300, algorithm='arpack')
mat2 = pca.fit_transform(mat2.astype(np.float32))
print('pca on matrix 3')
pca = TruncatedSVD(n_components = 300, algorithm='arpack')
mat3 = pca.fit_transform(mat3.astype(np.float32))

pca on matrix 1
pca on matrix 2
pca on matrix 3


In [4]:
from sklearn.preprocessing import StandardScaler 

X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [None]:
mat_pca1 = np.load(data_dir + 'pca_cat100.npy')
mat_pca2 = np.load(data_dir + 'svd_tfidf300.npy')
mat_pca = np.hstack([mat_pca1,mat_pca2])

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler_mat = MinMaxScaler(feature_range=(-1,1))
mat_pca = scaler_mat.fit_transform(mat_pca)
train_mat_pcat = mat_pca[df_train_index.tolist()]
test_mat_pcat = mat_pca[df_test_index.tolist()]

In [7]:
%%time
from sklearn.preprocessing import MinMaxScaler
scaler_mat = MinMaxScaler(feature_range=(-1,1))
mat1 = scaler_mat.fit_transform(mat1)
scaler_mat = MinMaxScaler(feature_range=(-1,1))
mat2 = scaler_mat.fit_transform(mat2)
scaler_mat = MinMaxScaler(feature_range=(-1,1))
mat3 = scaler_mat.fit_transform(mat3)

train_mat1 = mat1[df_train_index.tolist()]
test_mat1 = mat1[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]
import gc
#del mat1,mat2,mat3
#gc.collect()
print(np.min(train_mat1), np.max(train_mat1), np.std(train_mat1))

-1.0 1.0 0.241341
CPU times: user 8.69 s, sys: 84 ms, total: 8.77 s
Wall time: 2.93 s


In [9]:
import multiprocessing
cpu_cores = multiprocessing.cpu_count()

import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras import regularizers
from keras import optimizers
from keras import backend as K
from keras.utils import Sequence
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))

# Data preprocessing

In [10]:
train_cols = ['num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days', 'sum_values_f1_max',
       'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
       'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
       'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
       'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
       'mean_day_cntr', 'diff_num_cats', 'unique_days',
        'sess_keys_mean', 'sess_keys_max', 'diff_key1_mean',
       'diff_key1_max', 'diff_key2_mean', 'diff_key2_max', 'diff_key3_mean',
       'diff_key3_max', 'quot_key1_mean', 'quot_key1_max', 'quot_key2_mean',
       'quot_key2_max', 'quot_key3_mean', 'quot_key3_max'
    ]

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))
scaler.fit(X[train_cols].fillna(0).values)
X[train_cols] = scaler.transform(X[train_cols].fillna(0).values)
x_te[train_cols] = scaler.transform(x_te[train_cols].fillna(0).values)

In [12]:
import math
from sklearn.metrics import log_loss
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1
        K.set_value(self.model.optimizer.lr, self.clr())

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

            
class RocAucEvaluation(Callback):
    def __init__(self, X_seq, y, name, interval=1):
        super(Callback, self).__init__()
        self.X_seq, self.y = X_seq, y
        self.name = name
        self.interval = interval

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict_generator(self.X_seq, steps=len(self.X_seq), 
                                                          use_multiprocessing=False, workers=1, 
                                                          max_queue_size=4*cpu_cores).ravel()
            auc = roc_auc_score(self.y, y_pred)
            logloss = log_loss(self.y, y_pred)
            logs[self.name+"_auc"] = auc
            logs[self.name+"_logloss"] = logloss
            print((self.name+"_auc: {:.8f}; "+"_logloss: {:.8f}; ").format(auc,logloss))
            
class FeatureSequence(Sequence):
    
    def __init__(self, X, y, inx, batch_size, shuffle=False):
        
        self.X, self.y = X, y
        self.batch_size = batch_size
        
        self.inx = inx
        self.shuffle = shuffle
        if self.shuffle:
            np.random.shuffle(self.inx)

    def __len__(self):
        return math.ceil(self.inx.shape[0] / self.batch_size)

    def __getitem__(self, i):
        batch_inx = self.inx[i*self.batch_size:(i+1)*self.batch_size]
        
        batch = [x[batch_inx] for x in self.X]
        #batch = [x[batch_inx].todense() for x in self.X]
        return batch, self.y[batch_inx]
    
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.inx)            

In [17]:
def split_inputs(X):
    return np.split(X, X.shape[-1], axis=-1)

def buildBaseModel():   
    num_inp = Input((len(train_cols),), name='num_inp')
    num_x = BatchNormalization()(num_inp)
    num_x = Dense(64, activation="relu")(num_x)
    num_x = BatchNormalization()(num_x)
    num_x = Dropout(0.4)(num_x)
    
    dense_inp = Input((train_mat_pcat.shape[1],), name='dense_inp')
    dense_x = BatchNormalization()(dense_inp)
    dense_x = Dense(128, activation="relu")(dense_x)
    dense_x = BatchNormalization()(dense_x)
    dense_x = Dense(64, activation="relu")(dense_x)
    dense_x = BatchNormalization()(dense_x)    
    dense_x = Dropout(0.5)(dense_x)    
    
    sparse_inp1 = Input((train_mat1.shape[1],), name='sparse_inp1')
    sparse1_x = BatchNormalization()(sparse_inp1)
    sparse1_x = Dense(128, activation="relu")(sparse1_x)
    sparse1_x = BatchNormalization()(sparse1_x)
    sparse1_x = Dense(64, activation="relu")(sparse1_x)
    sparse1_x = BatchNormalization()(sparse1_x)
    sparse1_x = Dropout(0.5)(sparse1_x)
    
    sparse_inp2 = Input((train_mat2.shape[1],), name='sparse_inp2')
    sparse2_x = BatchNormalization()(sparse_inp2)
    sparse2_x = Dense(128, activation="relu")(sparse2_x)
    sparse2_x = BatchNormalization()(sparse2_x)
    sparse2_x = Dense(64, activation="relu")(sparse2_x)
    sparse2_x = BatchNormalization()(sparse2_x)
    sparse2_x = Dropout(0.5)(sparse2_x)
    
    sparse_inp3 = Input((train_mat3.shape[1],), name='sparse_inp3')
    sparse3_x = BatchNormalization()(sparse_inp3)
    sparse3_x = Dense(128, activation="relu")(sparse3_x)
    sparse3_x = BatchNormalization()(sparse3_x)
    sparse3_x = Dense(64, activation="relu")(sparse3_x)
    sparse3_x = BatchNormalization()(sparse3_x)
    sparse3_x = Dropout(0.5)(sparse3_x)    
    
    x = concatenate([num_x, dense_x, sparse1_x, sparse3_x, sparse2_x, 
                     multiply([sparse1_x, sparse2_x]),
                     multiply([sparse1_x, sparse3_x]),
                     multiply([sparse2_x, sparse3_x])
                              ])
    
    x1 = Dense(1024, activation="relu")(x)
    x1 = BatchNormalization()(x1)
    x1 = Dropout(0.5)(x1)
    
    x2 = concatenate([x, x1])
    x2 = Dense(512, activation="relu")(x2)
    x2 = BatchNormalization()(x2)
    x2 = Dropout(0.5)(x2)
    
    x3 = concatenate([x, x1, x2])
    x3 = Dense(256, activation="relu")(x3)
    x3 = BatchNormalization()(x3)
    x3 = Dropout(0.5)(x3)
    
    x4 = x3
    x4 = Dense(128, activation="relu")(x4)
    x4 = BatchNormalization()(x4)
    x4 = Dropout(0.1)(x4)
    
    x_output = Dense(1, activation="sigmoid", name="output")(x4)
    return Model(inputs = [num_inp, dense_inp, sparse_inp1, sparse_inp2, sparse_inp3], outputs=x_output)

model = buildBaseModel()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sparse_inp1 (InputLayer)        (None, 300)          0                                            
__________________________________________________________________________________________________
sparse_inp3 (InputLayer)        (None, 300)          0                                            
__________________________________________________________________________________________________
sparse_inp2 (InputLayer)        (None, 300)          0                                            
__________________________________________________________________________________________________
dense_inp (InputLayer)          (None, 400)          0                                            
__________________________________________________________________________________________________
batch_norm

In [26]:
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import MaxAbsScaler 
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import KFold
TRN_BATCH_SIZE = 512
INF_BATCH_SIZE = 512

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=6239)
epochs = 100
pred = np.zeros(y.shape)
test_pred = 0
ifold = 0
cpu_cores = 4

fold_auc = []

y = X.target.values

for trn_inx, val_inx in kf.split(y):
    print("Training fold {}".format(ifold))
    K.clear_session()

    
    model_file_name = model_name+"__f"+str(ifold)
    model_file = results_dir+model_file_name+'.h5'   
      
    model = buildBaseModel()    
    
    trn_seq = FeatureSequence([X.loc[trn_inx,train_cols].values, 
                               train_mat_pcat[trn_inx],
                               train_mat1[trn_inx],
                               train_mat2[trn_inx],
                               train_mat3[trn_inx]
                              ], 
                              y[trn_inx], np.array(list(range(len(trn_inx)))), TRN_BATCH_SIZE, shuffle=True)
    val_seq = FeatureSequence([X.loc[val_inx,train_cols].values, 
                               train_mat_pcat[val_inx],
                               train_mat1[val_inx],
                               train_mat2[val_inx],
                               train_mat3[val_inx]
                              ], 
                              y[val_inx], np.array(list(range(len(val_inx)))), INF_BATCH_SIZE, shuffle=False)
    te_seq = FeatureSequence([x_te[train_cols].values, 
                               test_mat_pcat,
                               test_mat1,
                               test_mat2,
                               test_mat3
                             ], 
                              y, np.array(list(range(len(x_te)))), INF_BATCH_SIZE, shuffle=False)
    
    
    # Callbacks
    model_checkpoint = ModelCheckpoint(model_file, monitor='val_auc', verbose=1, mode='max',
                                       save_best_only=True, save_weights_only=False, period=1)
    clr = CyclicLR(base_lr=0.0001, max_lr=0.001, step_size=2*math.ceil(len(trn_seq)), mode='triangular2')
    early_stop = EarlyStopping(monitor='val_auc', min_delta=0, patience=4, verbose=1, mode='max')
    mse_eval = RocAucEvaluation(val_seq, y[val_inx], 'val')
    
    # Training
    opt=optimizers.Nadam()
    model.compile(optimizer=opt, loss='binary_crossentropy')

    model.fit_generator(
        generator=trn_seq, steps_per_epoch=len(trn_seq),
        initial_epoch=0, epochs=epochs, shuffle=False, verbose=1,
        callbacks=[mse_eval, model_checkpoint, early_stop, clr], #
        class_weight={0:0.06,1:0.94},
        use_multiprocessing=False, workers=1, max_queue_size=4*cpu_cores)
    
     
    # Predicting
    print("\nPredicting fold {}".format(ifold))
    del model  
    model = load_model(model_file, compile=True)
    pred[val_inx] = model.predict_generator(val_seq, steps=len(val_seq), 
                                                    use_multiprocessing=False, workers=1, 
                                                    max_queue_size=4*cpu_cores).ravel()
    
    auc = roc_auc_score(y[val_inx], pred[val_inx])
    logloss = log_loss(y[val_inx], pred[val_inx])
    fold_auc.append(auc)
    print("fold: {}, auc: {}".format(ifold, auc))
    print("fold: {}, logloss: {}".format(ifold, logloss))
    print()
    
    test_pred += minmax_scale(model.predict_generator(te_seq, steps=len(te_seq), 
                                                    use_multiprocessing=False, workers=1, 
                                                    max_queue_size=4*cpu_cores).ravel())/n_folds
    ifold += 1
    
    gc.collect()

Training fold 0
Epoch 1/100
val_auc: 0.65254332; _logloss: 0.58095176; 

Epoch 00001: val_auc improved from -inf to 0.65254, saving model to ./results/fm_svd_3br__f0.h5
Epoch 2/100
val_auc: 0.68125005; _logloss: inf; 

Epoch 00002: val_auc improved from 0.65254 to 0.68125, saving model to ./results/fm_svd_3br__f0.h5
Epoch 3/100
  4/753 [..............................] - ETA: 13s - loss: 0.0699

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


val_auc: 0.68566512; _logloss: 0.55363646; 

Epoch 00003: val_auc improved from 0.68125 to 0.68567, saving model to ./results/fm_svd_3br__f0.h5
Epoch 4/100
val_auc: 0.68551506; _logloss: 0.54780107; 

Epoch 00004: val_auc did not improve
Epoch 5/100
val_auc: 0.68056939; _logloss: 0.55745832; 

Epoch 00005: val_auc did not improve
Epoch 6/100
val_auc: 0.68145990; _logloss: 0.51887668; 

Epoch 00006: val_auc did not improve
Epoch 7/100
val_auc: 0.67399974; _logloss: 0.54468511; 

Epoch 00007: val_auc did not improve
Epoch 00007: early stopping

Predicting fold 0
fold: 0, auc: 0.6856651240157376
fold: 0, logloss: 0.553636457954255

Training fold 1
Epoch 1/100
val_auc: 0.64575555; _logloss: inf; 

Epoch 00001: val_auc improved from -inf to 0.64576, saving model to ./results/fm_svd_3br__f1.h5
Epoch 2/100
val_auc: 0.67559406; _logloss: 0.54680288; 

Epoch 00002: val_auc improved from 0.64576 to 0.67559, saving model to ./results/fm_svd_3br__f1.h5
Epoch 3/100
val_auc: 0.67422709; _logloss: in

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


val_auc: 0.68475744; _logloss: 0.54379399; 

Epoch 00003: val_auc improved from 0.68249 to 0.68476, saving model to ./results/fm_svd_3br__f4.h5
Epoch 4/100
val_auc: 0.68481632; _logloss: 0.54622424; 

Epoch 00004: val_auc improved from 0.68476 to 0.68482, saving model to ./results/fm_svd_3br__f4.h5
Epoch 5/100
val_auc: 0.68144960; _logloss: inf; 

Epoch 00005: val_auc did not improve
Epoch 6/100
val_auc: 0.67916439; _logloss: 0.53026257; 

Epoch 00006: val_auc did not improve
Epoch 7/100
val_auc: 0.67639474; _logloss: 0.50821741; 

Epoch 00007: val_auc did not improve
Epoch 8/100
val_auc: 0.67186913; _logloss: 0.49647208; 

Epoch 00008: val_auc did not improve
Epoch 00008: early stopping

Predicting fold 4
fold: 4, auc: 0.684816323493742
fold: 4, logloss: 0.5462242379415131

Training fold 5
Epoch 1/100
val_auc: 0.64418654; _logloss: inf; 

Epoch 00001: val_auc improved from -inf to 0.64419, saving model to ./results/fm_svd_3br__f5.h5
Epoch 2/100
val_auc: 0.67374675; _logloss: inf; 

Ep

val_auc: 0.67920198; _logloss: 0.53824795; 

Epoch 00003: val_auc improved from 0.67329 to 0.67920, saving model to ./results/fm_svd_3br__f9.h5
Epoch 4/100
val_auc: 0.67930798; _logloss: 0.55131637; 

Epoch 00004: val_auc improved from 0.67920 to 0.67931, saving model to ./results/fm_svd_3br__f9.h5
Epoch 5/100
val_auc: 0.67913418; _logloss: 0.53383736; 

Epoch 00005: val_auc did not improve
Epoch 6/100
val_auc: 0.67567816; _logloss: 0.54280944; 

Epoch 00006: val_auc did not improve
Epoch 7/100
val_auc: 0.67336952; _logloss: 0.53767618; 

Epoch 00007: val_auc did not improve
Epoch 8/100
val_auc: 0.66778025; _logloss: 0.50813075; 

Epoch 00008: val_auc did not improve
Epoch 00008: early stopping

Predicting fold 9
fold: 9, auc: 0.6793079758201476
fold: 9, logloss: 0.5513163697759864



In [27]:
print(fold_auc)
print(np.mean(fold_auc), np.std(fold_auc))
roc_auc_score(X.target.values, pred)

[0.68566512401573765, 0.67755454115452407, 0.6899301968288829, 0.69156559148919783, 0.684816323493742, 0.67964767405987003, 0.68350906424626523, 0.6822850598113267, 0.68039108189606778, 0.67930797582014757]
0.683467263282 0.00437852082673


0.68298767304681229

In [28]:
np.save(results_dir + 'train_' + model_name +'.npy', pred)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

In [29]:
sub = x_te[['uid','target']].copy()
sub['target'] = test_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.758445
1,ac4b8244f3ae82df511b002257473c11,0.407326
2,483d8b91e49522c8a5bbe37f3872c749,0.495455
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.422068
4,fdbfba9842ff0bf86d600eb334c7c42b,0.398006


In [30]:
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)

In [30]:
print(fold_auc)
print(np.mean(fold_auc), np.std(fold_auc))

[0.67407794423432454, 0.65804794886639673, 0.65067841889905376, 0.66493955106895541, 0.66182499546367923, 0.66190969589725657, 0.65623060743527573, 0.6691731672095117, 0.65720702631251637, 0.66147049027858973]
0.661555984567 0.00635904933355


In [22]:
sample_sub.head()

Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.058912
1,ac4b8244f3ae82df511b002257473c11,0.047257
2,483d8b91e49522c8a5bbe37f3872c749,0.059204
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.042887
4,fdbfba9842ff0bf86d600eb334c7c42b,0.03032
