In [1]:
data_dir = './data/mlboot_dataset/'
model_name = 'autoencoder'
results_dir = './results/'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp

In [2]:
mat1 = sp.load_npz(data_dir+'bmat1.npz').tolil()
mat2 = sp.load_npz(data_dir+'bmat2.npz').tolil()
mat3 = sp.load_npz(data_dir+'bmat3.npz').tolil()
print(mat1.shape, mat2.shape, mat3.shape)

mat = sp.hstack([mat1,mat2,mat3]).T
del mat1,mat2,mat3
mat.shape

(609018, 2053602) (609018, 20275) (609018, 1057788)


(3131665, 609018)

In [3]:
import multiprocessing
cpu_cores = multiprocessing.cpu_count()

import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras import regularizers
from keras import optimizers
from keras import backend as K
from keras.utils import Sequence
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))

Using TensorFlow backend.


# Data preprocessing

In [4]:
import math
from sklearn.metrics import log_loss
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1
        K.set_value(self.model.optimizer.lr, self.clr())

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

            
class RocAucEvaluation(Callback):
    def __init__(self, X_seq, y, name, interval=1):
        super(Callback, self).__init__()
        self.X_seq, self.y = X_seq, y
        self.name = name
        self.interval = interval

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict_generator(self.X_seq, steps=len(self.X_seq), 
                                                          use_multiprocessing=False, workers=1, 
                                                          max_queue_size=4*cpu_cores).ravel()
            auc = roc_auc_score(self.y.todense(), y_pred)
            logloss = log_loss(self.y.todense(), y_pred)
            logs[self.name+"_auc"] = auc
            logs[self.name+"_logloss"] = logloss
            print((self.name+"_auc: {:.8f}; "+"_logloss: {:.8f}; ").format(auc,logloss))
            
class FeatureSequence(Sequence):
    
    def __init__(self, X, y, inx, batch_size, shuffle=False):
        
        self.X, self.y = X, y
        self.batch_size = batch_size
        
        self.inx = inx
        self.shuffle = shuffle
        if self.shuffle:
            np.random.shuffle(self.inx)

    def __len__(self):
        return math.ceil(self.inx.shape[0] / self.batch_size)

    def __getitem__(self, i):
        batch_inx = self.inx[i*self.batch_size:(i+1)*self.batch_size]
        
        batch = [x[batch_inx] for x in [self.X[0]]] +  [x[batch_inx].todense() for x in [self.X[1]]]
        return batch, self.y[batch_inx].todense()
    
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.inx)            

In [39]:
def buildAutoEncoder():   
    
    sparse_inp1 = Input((mat.shape[1],), name='sparse_inp1')
    sparse1_x = BatchNormalization()(sparse_inp1)
    #sparse1_x = sparse_inp1
    
    x = sparse1_x
    
    x = Dense(64,  activation='relu')(x)
    x = Dense(32,  activation='relu')(x)
    
    encoded = x
    decoded = Dense(mat.shape[1], activation="sigmoid")(encoded)
    
    autoencoder = Model(inputs=sparse_inp1, outputs=decoded, name="autoencoder")
    encoder = Model(inputs=sparse_inp1, outputs=encoded, name="encoder")
    
    return encoder, autoencoder

_, autoencoder = buildAutoEncoder()
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sparse_inp1 (InputLayer)     (None, 609018)            0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 609018)            2436072   
_________________________________________________________________
dense_10 (Dense)             (None, 64)                38977216  
_________________________________________________________________
dense_11 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_12 (Dense)             (None, 609018)            20097594  
Total params: 61,512,962
Trainable params: 60,294,926
Non-trainable params: 1,218,036
_________________________________________________________________


In [41]:
def bc_sum(y_true, y_pred):
    return K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)

K.clear_session()

_, autoencoder = buildAutoEncoder()
autoencoder.summary()

opt=optimizers.Nadam()
autoencoder.compile(optimizer=opt, loss=bc_sum, metrics=["categorical_crossentropy","accuracy"])

epochs=8
batch_size=128

#mat = mat.tocsr()

class FeatureSequence(Sequence):
    
    def __init__(self, X, y, inx, batch_size, shuffle=False):
        self.X, self.y = X, y
        self.batch_size = batch_size
        
        self.inx = inx
        self.shuffle = shuffle
        if self.shuffle:
            np.random.shuffle(self.inx)

    def __len__(self):
        return math.ceil(self.inx.shape[0] / self.batch_size)

    def __getitem__(self, i):
        batch_inx = self.inx[i*self.batch_size:(i+1)*self.batch_size]
        
        batch = self.X[batch_inx].todense()
        return batch, self.y[batch_inx].todense()
    
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.inx)
            

fts = FeatureSequence(mat,mat,np.array(list(range(mat.shape[0]))),batch_size)

autoencoder.fit_generator(
        generator=fts, steps_per_epoch=len(fts),
        initial_epoch=0, epochs=epochs, shuffle=False, verbose=1,
        use_multiprocessing=False, workers=1, max_queue_size=16)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sparse_inp1 (InputLayer)     (None, 609018)            0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 609018)            2436072   
_________________________________________________________________
dense_1 (Dense)              (None, 64)                38977216  
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 609018)            20097594  
Total params: 61,512,962
Trainable params: 60,294,926
Non-trainable params: 1,218,036
_________________________________________________________________
Epoch 1/8
 3374/24467 [===>..........................] - ETA: 2:47:29 - loss: 773.2886 - categorical_crossentropy: 578.648

KeyboardInterrupt: 