In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost
import multiprocessing as mp
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score


from scipy.stats import spearmanr, ttest_rel
import itertools
import time

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
# Plot styling: the second style sheet is applied on top of the first.
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set_palette('Set1', 10, desat=0.75)

# Show more rows and wider cell contents when displaying DataFrames.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 200)

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from numpy/pandas/keras.
import warnings
warnings.filterwarnings('ignore')

In [None]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the unnormalised Gini coefficient of `pred` against `actual`.

    Rows are ranked by descending prediction (ties keep their original
    order); the score is derived from the cumulative sum of `actual` in that
    ranking. Divide by ``gini(actual, actual)`` to obtain the normalised Gini.

    input:
        actual - 1-dim sequence of true values
        pred - 1-dim sequence of predicted scores, same length as `actual`
        cmpcol, sortcol - unused; kept so existing callers keep working
    output: float
    """
    assert( len(actual) == len(pred) )
    n_rows = len(actual)
    # Builtin `float` instead of the `np.float` alias, which was removed in NumPy 1.24.
    table = np.asarray(np.c_[ actual, pred, np.arange(n_rows) ], dtype=float)
    # lexsort: primary key is -pred (descending score), secondary key is the row position.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    total_losses = table[:,0].sum()
    gini_sum = table[:,0].cumsum().sum() / total_losses
    gini_sum -= (n_rows + 1) / 2.
    return gini_sum / n_rows

def gini_xgb(preds, y):
    """Normalised Gini as a custom evaluation function returning (name, value).

    `y` must expose ``get_label()`` (e.g. an xgboost DMatrix).
    """
    labels = y.get_label()
    # Normalise by the score of a perfect ranking (labels ranked by themselves).
    normalised = gini(labels, preds) / gini(labels, labels)
    return 'gini', normalised

def gini_lgb(preds, y):
    """Normalised Gini as a custom evaluation function returning (name, value, True).

    `y` must expose ``get_label()`` (e.g. a lightgbm Dataset). The trailing
    True is the "higher is better" flag of the returned triple.
    """
    labels = y.get_label()
    # Normalise by the score of a perfect ranking (labels ranked by themselves).
    normalised = gini(labels, preds) / gini(labels, labels)
    return 'gini', normalised, True

def categorical_encoding_by_target(data, cat_features, target, inplace=False, 
                                   regul=5, smoothing=10, random_seed=0):
    """Out-of-fold, sigmoid-smoothed target (mean) encoding of categorical columns.

    For each of 5 shuffled folds, per-category target means and counts are
    computed on the training part and blended with that part's overall target
    mean using the weight ``1 / (1 + exp(-(count - regul) / smoothing))``.
    The held-out rows receive this blend in the column ``<col>_enc``.

    input:
        data - pandas DataFrame with the categorical columns. It is MODIFIED:
            helper columns 'target' and 'myid' and the '<col>_enc' columns are
            written to the passed object; only the returned frame has the
            helpers dropped. Rows are addressed with ``data.loc[positions]``,
            so the index is assumed to be the default 0..n-1 range.
        cat_features - list of column names to encode
        target - target values aligned with `data`
        inplace - if True, each column is overwritten by its encoding and the
            '<col>_enc' column is removed
        regul, smoothing - shift and scale of the sigmoid weight
        random_seed - seed for the fold shuffling
    output:
        (data, encoding_dict) where encoding_dict[col][value] is the encoding
        averaged over the folds (for use on unseen data). In a fold where a
        value occurs fewer than 10 times in the training part, the fold's
        overall target mean is used for that value.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed (KFold is not part of the top import cell).
    from sklearn.model_selection import KFold

    N_SPLITS = 5
    MIN_COUNT = 10  # below this training count the fold mean is used for encoding_dict

    def blend(category, mean_dict, counter_dict, prior):
        # Sigmoid-weighted mix of the category mean and the prior (fold mean).
        denom = 1 + np.exp(-(counter_dict.get(category, 0) - regul) / smoothing)
        return prior * (1 - 1 / denom) + mean_dict.get(category, 0) / denom

    encoding_dict = dict()
    data['target'] = target
    data['myid'] = np.arange(0, data.shape[0], 1)

    unique_values_dict = dict()
    for col in cat_features:
        # Float from the start: the column later receives float encodings.
        data[col+'_enc'] = 0.0
        encoding_dict[col] = dict()
        unique_values_dict[col] = data[col].unique()

    folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=random_seed)
    for train_ind, test_ind in folds.split(data, data.target.values):
        fold_train = data.loc[train_ind]
        mean_target = fold_train.target.mean()

        for col in cat_features:
            fold_stats = fold_train.groupby(col).agg({'target': 'mean', 'myid': 'count'})
            mean_dict    = fold_stats.target.to_dict()
            counter_dict = fold_stats.myid.to_dict()

            for x in unique_values_dict[col]:
                if counter_dict.get(x, 0) < MIN_COUNT:
                    fold_value = mean_target
                else:
                    fold_value = blend(x, mean_dict, counter_dict, mean_target)
                # Always ADD the fold's contribution. The previous code assigned
                # the fold mean in the low-count case, discarding what earlier
                # folds had accumulated and corrupting the average below.
                encoding_dict[col][x] = encoding_dict[col].get(x, 0) + fold_value

            # Out-of-fold encoding for the held-out rows.
            data.loc[test_ind, col+'_enc'] = data.loc[test_ind, col].apply(
                lambda x: blend(x, mean_dict, counter_dict, mean_target))

    for col in cat_features:
        for x in encoding_dict[col]:
            encoding_dict[col][x] = encoding_dict[col][x]/N_SPLITS
        if inplace:
            data[col] = data[col+'_enc']
            data.drop(col+'_enc', axis=1, inplace=True)
    data = data.drop(['target', 'myid'], axis=1)
    return data, encoding_dict

def categorical_encoding_by_order(data, cat_features, target, inplace=False):
    """Ordinal-encode categories by the rank of their mean target.

    For every column in `cat_features` the categories are sorted by ascending
    mean target and numbered 0, 1, 2, ...; values without a rank (e.g. missing
    values, which groupby skips) are encoded as -1.

    input:
        data - pandas DataFrame. It is MODIFIED: a helper column 'target' and
            the encoded columns are written to the passed object; only the
            returned frame has 'target' dropped.
        cat_features - list of column names to encode
        target - target values aligned with `data`
        inplace - if True the encoding overwrites the column, otherwise it is
            stored in '<col>_enc'
    output:
        (data, encoding_dict) with encoding_dict[col][category] = rank
    """
    data['target'] = target
    encoding_dict = dict()
    postfix = '' if inplace else '_enc'

    for col in cat_features:
        ordered_categories = (data[[col, 'target']]
                              .groupby(col, as_index=False)
                              .agg({'target': 'mean'})
                              .sort_values('target')
                              [col]
                             )
        ranks = {category: rank for rank, category in enumerate(ordered_categories)}
        encoding_dict[col] = ranks

        data[col+postfix] = data[col].apply(lambda x: ranks.get(x, -1))
    data = data.drop('target', axis=1)
    return data, encoding_dict

In [None]:
def transform_df(df):
    """Add missing-value counts, above-median/mean flags and one-hot columns.

    Reads the notebook-level settings `add_ranges`, `d_median`, `d_mean`,
    `one_hot`, `max_categories_for_ohe` and `ohe_cat_only`, which must be
    defined before this is called.

    input:
        df - anything accepted by ``pd.DataFrame``; columns 'id' and 'target'
            are never used as features
    output: DataFrame with
        - 'negative_one_vals': per-row count of feature values equal to -1
        - '<c>_median_range' / '<c>_mean_range' 0/1 flags for non-'_bin'
          columns (only when `add_ranges` is truthy)
        - '<c>_OH_<val>' indicator columns replacing every column of `one_hot`
          that has more than 2 and at most `max_categories_for_ohe` values
          (restricted to '_cat' columns when `ohe_cat_only` is truthy)
    """
    df = pd.DataFrame(df)
    dcol = [c for c in df.columns if c not in ['id','target']]
    
    df['negative_one_vals'] = np.sum((df[dcol]==-1).values, axis=1)
    for c in dcol:
        if '_bin' not in c and add_ranges:
            # Builtin `int` instead of the `np.int` alias removed in NumPy 1.24.
            df[c+'_median_range'] = (df[c].values > d_median[c]).astype(int)
            df[c+'_mean_range'] = (df[c].values > d_mean[c]).astype(int)

    for c in one_hot:
        n_values = len(one_hot[c])
        if (n_values > 2 and n_values <= max_categories_for_ohe)\
        and ('_cat' in c or not ohe_cat_only):
            for val in one_hot[c]:
                df[c+'_OH_' + str(val)] = (df[c].values == val).astype(int)
            df = df.drop(c, axis=1)
    return df

def multi_transform(df, n_jobs=6):
    """Apply `transform_df` to `df` in parallel and stitch the pieces together.

    input:
        df - DataFrame to transform; it is split row-wise into `n_jobs` chunks
        n_jobs - number of worker processes and chunks (default 6, the
            previously hard-coded value)
    output: transformed DataFrame with a fresh 0..n-1 index
    """
    pool = mp.Pool(n_jobs)
    try:
        parts = pool.map(transform_df, np.array_split(df, n_jobs))
    finally:
        # Release the worker processes even when a chunk fails.
        pool.close()
        pool.join()
    return pd.concat(parts, axis=0, ignore_index=True)

def recon(reg):
    """Decompose ``round((40*reg)**2)`` as ``A + 31*M`` and return ``(A, M)``.

    A is the largest value in 0..31 congruent to the integer modulo 31, so a
    multiple of 31 yields A == 31 (not 0).
    """
    squared = int(np.round((40 * reg) ** 2))
    remainder = squared % 31
    # A zero remainder matches both 0 and 31 in the 0..31 scan; the later hit (31) wins.
    A = remainder if remainder else 31
    return A, (squared - A) // 31

def preproc(df, part1=True, part2=True):
    """Add engineered numeric features to `df` (modified in place and returned).

    NOTE: a later cell of this notebook defines another function that is also
    called `preproc` (taking X_train, X_val, X_test); once that cell has run,
    this name refers to the other function.

    input:
        df - DataFrame with the columns used below
        part1 - add 'ps_reg_A' / 'ps_reg_M' (the `recon` decomposition of
            'ps_reg_03', with 19 resp. 51 replaced by -1) and
            'ps_car_13_x_ps_reg_03'
        part2 - add the product features 'mult', 'ps_car' and 'ps_ind'
    output: the same DataFrame object
    """
    if part1:
        # Decompose each value once instead of once per output column.
        decomposed = [recon(value) for value in df['ps_reg_03']]
        df['ps_reg_A'] = [a for a, _ in decomposed]
        df['ps_reg_M'] = [m for _, m in decomposed]
        # Assign the result back: an inplace replace on `df[col]` acts on a
        # temporary and is not guaranteed to reach `df` (pandas copy-on-write).
        df['ps_reg_A'] = df['ps_reg_A'].replace(19, -1)
        df['ps_reg_M'] = df['ps_reg_M'].replace(51, -1)
        df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
    
    if part2:
        df['mult']   = df['ps_reg_01'] * df['ps_reg_03'] * df['ps_reg_02']
        df['ps_car'] = df['ps_car_13'] * df['ps_reg_03'] * df['ps_car_13']
        df['ps_ind'] = df['ps_ind_03'] * df['ps_ind_15']

    return df

import scipy

def get_pval_for_binom_test(p1, p2, n1, n2):
    """Score how similar two observed proportions are, via a pooled z-statistic.

    input:
        p1, p2 - observed proportions of the two groups
        n1, n2 - sizes of the two groups
    output: half of the smaller normal tail probability of the z-statistic
        (0.25 when the proportions are equal, approaching 0 as they diverge)
    """
    pooled = (p1 * n1 + p2 * n2) / (n1 + n2)
    std_err = np.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2))
    z_stat = (p1 - p2) / std_err
    lower_tail = scipy.stats.norm.cdf(z_stat)
    upper_tail = 1 - lower_tail
    return min(lower_tail, upper_tail) / 2

def target_encoding_with_pval(train, target_train, cat, pval):
    """Greedily merge categories of `cat` whose target rates look alike.

    Categories are sorted by mean target. While the most similar pair of
    neighbours scores above `pval` (see `get_pval_for_binom_test`), the lower
    one is merged into the upper one and the statistics are recomputed.

    input:
        train - DataFrame containing column `cat`. Columns named 'target' and
            'cnt' are added temporarily and dropped again before returning.
        target_train - target values aligned with `train`
        cat - name of the categorical column
        pval - similarity threshold; merging stops once no neighbouring pair
            scores above it
    output:
        dict mapping every merged-away category to the category that finally
        represents its group. Categories that were never merged are not keys.
        The dict is empty when there is nothing to merge.
    """
    train['target'] = target_train
    train['cnt'] = 0
    try:
        cat_agg = (train
                   .groupby(cat, as_index=False)
                   .agg({'target':'mean'
                        ,'cnt':'count'})
                   .sort_values('target')
                   .reset_index(drop=True)
                  )
    finally:
        # Never leave the helper columns on the caller's frame.
        train.drop(['target', 'cnt'], axis=1, inplace=True)

    group_dict = dict()

    # A single remaining group has no neighbour to compare with.
    while cat_agg.shape[0] > 1:
        pvals = [get_pval_for_binom_test(
                    cat_agg.iloc[i].target, 
                    cat_agg.iloc[i+1].target, 
                    cat_agg.iloc[i].cnt, 
                    cat_agg.iloc[i+1].cnt)
                 for i in range(cat_agg.shape[0]-1)
                ]
        max_pval_ind = int(np.argmax(pvals))
        # Check the threshold BEFORE merging, so a pair that is already
        # distinguishable is never merged (the old loop merged it first and
        # only then tested the stop condition).
        if pvals[max_pval_ind] <= pval:
            break

        replaced_group  = cat_agg.loc[max_pval_ind, cat]
        replacing_group = cat_agg.loc[max_pval_ind+1, cat]
        group_dict[replaced_group] = replacing_group

        # `.at` replaces DataFrame.set_value, which was removed in pandas 1.0.
        cat_agg.at[max_pval_ind, cat] = replacing_group
        # Re-aggregate: count-weighted mean target of the merged groups.
        cat_agg = (cat_agg
                   .assign(target = lambda df: df.cnt*df.target)
                   .groupby(cat, as_index=False)
                   .agg({'target':'sum'
                        ,'cnt':'sum'})
                   .assign(target = lambda df: df.target/df.cnt)
                   .sort_values('target')
                   .reset_index(drop=True)
                  )

    # Follow merge chains (a -> b, b -> c) so every key points at its final group.
    for merged in group_dict:
        final_group = group_dict[merged]
        while final_group in group_dict:
            final_group = group_dict[final_group]
        group_dict[merged] = final_group

    return group_dict

In [None]:
def add_noise(series, noise_level):
    """Scale every element of `series` by ``1 + noise_level * N(0, 1)`` noise."""
    jitter = np.random.randn(len(series))
    factor = 1 + noise_level * jitter
    return series * factor

def target_encode(train_series=None, target=None):
    """Compute a variance-smoothed target mean for every category.

    Each category mean is shrunk towards the overall target mean with weight
    ``count * total_var / (category_var + count * total_var)``; categories
    whose own standard deviation is undefined (e.g. a single row) use the
    overall standard deviation instead.

    input:
        train_series - named pandas Series of categories
        target - target values aligned with `train_series`; must provide
            ``.mean()`` (numpy array or pandas Series)
    output: dict {category: smoothed target mean}
    """
    temp = pd.concat([train_series, pd.Series(target, name='target')], axis=1)
    # Per-category statistics; "std" is the pandas (sample) standard deviation.
    aggregated_values = temp.groupby(by=train_series.name)['target'].agg(["mean", "count", "std"])
    total_std = np.std(target)
    # Assign back instead of a chained inplace fillna, which acts on a
    # temporary under pandas copy-on-write.
    aggregated_values["std"] = aggregated_values["std"].fillna(total_std)

    # Compute smoothing
    smoothing_component = aggregated_values["count"] * total_std ** 2
    smoothing = smoothing_component / (aggregated_values["std"] ** 2 + smoothing_component)

    # Blend the overall mean with the category mean
    mean_total = target.mean()
    mean_values = mean_total * (1 - smoothing) + aggregated_values["mean"] * smoothing

    return mean_values.to_dict()

def add_iteractions(df):
    """Add five pairwise feature columns (element-wise sums) to `df` in place.

    Each new column '<left>_X_<right>' holds ``df[left] + df[right]``.
    Returns the same DataFrame object.
    """
    combos = (
        ('ps_car_13_X_ps_ind_05_cat', 'ps_car_13', 'ps_ind_05_cat'),
        ('ps_car_11_cat_X_ps_ind_05_cat', 'ps_car_11_cat', 'ps_ind_05_cat'),
        ('ps_ind_05_cat_X_ps_ind_17_bin', 'ps_ind_05_cat', 'ps_ind_17_bin'),
        ('ps_car_13_X_ps_ind_17_bin', 'ps_car_13', 'ps_ind_17_bin'),
        ('ps_car_11_cat_X_ps_ind_17_bin', 'ps_car_11_cat', 'ps_ind_17_bin'),
    )
    for new_col, left, right in combos:
        df[new_col] = df[left] + df[right]
    return df

In [None]:
# Load the data.
# LOAD_TEST = True  -> read the real test set from test.csv.
# LOAD_TEST = False -> hold out 20% of train.csv as a labelled pseudo-test set.
LOAD_TEST = True

train = pd.concat([pd.read_csv('train.csv'),
                   #pd.read_csv('train_clusters.csv', usecols=['CL_6', 'CL_13', 'CL_21'])
                  ], axis=1)
y = train.target
# Every column except the identifier and the label.
features = [c for c in train.columns if c not in ['id', 'target']]
print('Train shape:', train.shape)

if LOAD_TEST:
    test = pd.concat([pd.read_csv('test.csv'),
                   #pd.read_csv('test_clusters.csv', usecols=['CL_6', 'CL_13', 'CL_21'])
                  ], axis=1)
    print('Test shape:', test.shape)
else:
    # Fixed random_state keeps the hold-out split reproducible.
    train, test, y, ytest = train_test_split(train, 
                                        train['target'].values, 
                                        test_size=0.2, 
                                        random_state=90
                                        )
    # Restore a 0..n-1 index on both parts after the shuffle.
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    target_test = test['target'].values
    
# Keep labels and ids as plain arrays before later cells drop these columns.
target_train = train['target'].values    
id_train = train['id'].values
id_test  = test ['id'].values

In [None]:
# Drop leading non-feature columns by position.
# NOTE(review): assumes train starts with [id, target] and test with [id] — confirm against the CSV layout.
# This cell is not idempotent: re-running it slices off further columns.
train = train.iloc[:,2:]
test = test.iloc[:,1:]

# Exclude every 'ps_calc_*' column.
cols_use = [c for c in train.columns if (not c.startswith('ps_calc_'))]

train = train[cols_use]
test = test[cols_use]

# Distinct values seen in train for each '*_cat' column.
col_vals_dict = {c: list(train[c].unique()) for c in train.columns if c.endswith('_cat')}

# Categorical columns with more than two distinct values are collected in embed_cols.
embed_cols = []
for c in col_vals_dict:
    if len(col_vals_dict[c])>2:
        embed_cols.append(c)
        print(c + ': %d values' % len(col_vals_dict[c]))
# Sorted so the column order is deterministic.
embed_cols = sorted(embed_cols)

In [None]:
# Min-max scale every column that is not in embed_cols.
# The scaler is fitted on train only and then applied unchanged to test.
cols_to_scale = [c for c in train.columns if c not in embed_cols]
scaler = MinMaxScaler()
train[cols_to_scale] = scaler.fit_transform(train[cols_to_scale])
test [cols_to_scale] = scaler.transform(test[cols_to_scale])

In [None]:
def preproc(X_train, X_val, X_test):
    """Build per-input array lists for the three data splits.

    For every column in the notebook-level `embed_cols`, the values are
    remapped to 0..k-1 using the sorted unique values of X_train; values of
    X_val / X_test that were not seen in X_train are mapped to 0. The last
    entry of each list is the 2-D array of all remaining columns.

    NOTE: this definition rebinds the name `preproc` used by an earlier,
    unrelated feature-engineering function in this notebook.

    output: (input_list_train, input_list_val, input_list_test)
    """
    train_inputs, val_inputs, test_inputs = [], [], []

    # Embedded columns: one integer-coded array per column.
    for col in embed_cols:
        codes = {value: position for position, value in enumerate(np.unique(X_train[col]))}
        train_inputs.append(X_train[col].map(codes).values)
        val_inputs.append(X_val[col].map(codes).fillna(0).values)
        test_inputs.append(X_test[col].map(codes).fillna(0).values)

    # Everything else goes in as one dense block.
    dense_cols = [col for col in X_train.columns if col not in embed_cols]
    train_inputs.append(X_train[dense_cols].values)
    val_inputs.append(X_val[dense_cols].values)
    test_inputs.append(X_test[dense_cols].values)

    return train_inputs, val_inputs, test_inputs

In [None]:
#%env KERAS_BACKEND=theano
from keras import callbacks

class AUC_SKlearn_callback(callbacks.Callback):
    """Keras callback that tracks ROC AUC per epoch and saves the best model.

    After every epoch the train AUC is computed with scikit-learn; when
    `useCv` is true the validation AUC is computed as well and the model is
    written to 'best_auc_model.hdf5' each time that AUC improves.

    Relies on ``self.validation_data`` being populated by Keras as
    [inputs..., targets, ...] and on the notebook-level `embed_cols`
    (the model is assumed to take ``len(embed_cols) + 1`` inputs).
    """

    def __init__(self, X_train, y_train, useCv = True):
        super(AUC_SKlearn_callback, self).__init__()
        self.bestAucCv = 0
        self.bestAucTrain = 0
        # Validation losses seen so far, one entry per epoch that reported 'val_loss'.
        self.cvLosses = []
        # Plain number: the former trailing comma made this the tuple (1,),
        # and comparing it with a float raised TypeError in on_epoch_end.
        self.bestCvLoss = 1
        self.X_train = X_train
        self.y_train = y_train
        self.useCv = useCv

    def on_train_begin(self, logs=None):
        return

    def on_train_end(self, logs=None):
        return

    def on_epoch_begin(self, epoch, logs=None):
        return

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}

        train_pred = self.model.predict(self.X_train)
        aucTrain = roc_auc_score(self.y_train, train_pred)
        if self.bestAucTrain < aucTrain:
            self.bestAucTrain = aucTrain

        if not self.useCv:
            return

        # validation_data = [input_1, ..., input_k, targets, ...] with k = len(embed_cols) + 1
        n_inputs = len(embed_cols) + 1
        cv_pred = self.model.predict(self.validation_data[:n_inputs])
        aucCv = roc_auc_score(self.validation_data[n_inputs], cv_pred)

        if self.bestAucCv < aucCv:
            # New best validation AUC (the real metric, not the training loss): keep this model.
            self.bestAucCv = aucCv
            self.model.save("best_auc_model.hdf5", overwrite=True)

        vl = logs.get('val_loss')
        if vl is not None:
            self.cvLosses.append(vl)
            # Lower loss is better.
            if vl < self.bestCvLoss:
                self.bestCvLoss = vl
        return

    def on_batch_begin(self, batch, logs=None):
        return

    def on_batch_end(self, batch, logs=None):
        # logs include loss, and optionally acc (if accuracy monitoring is enabled).
        return

In [None]:
import keras.backend as K
import tensorflow as tf
import theano

def pair_loss(y_true, y_pred):
    """Pairwise ranking loss (TensorFlow): mean sigmoid(neg_score - pos_score).

    NOTE: a second function named `pair_loss` is defined directly below in
    this cell and rebinds the name, so this version is not the one in effect
    after the cell has run.
    """
    labels = tf.cast(y_true, tf.int32)
    # Partition 0 holds predictions with label 0, partition 1 those with label 1.
    negatives, positives = tf.dynamic_partition(y_pred, labels, 2)
    positives_row = tf.expand_dims(positives, 0)
    negatives_col = tf.expand_dims(negatives, -1)
    # Broadcasting yields one difference per (negative, positive) pair.
    pairwise = K.sigmoid(negatives_col - positives_row)
    return K.mean(pairwise)

def pair_loss(y_true, y_pred):
    """Pairwise ranking loss (Theano): 1 - mean sigmoid(2 * (pos_score - neg_score)).

    NOTE: this definition replaces the TensorFlow `pair_loss` defined just
    above it in the same cell.
    """
    T = theano.tensor
    # Predictions of the positive (non-zero label) and negative (zero label) samples.
    positive_scores = y_pred[y_true.nonzero()]
    negative_scores = y_pred[T.eq(y_true, 0).nonzero()]
    # Broadcast to a matrix with one difference per (positive, negative) pair.
    pairwise_diffs = positive_scores.dimshuffle(0, 'x') - negative_scores.dimshuffle('x', 0)
    pairwise_scores = T.nnet.sigmoid(pairwise_diffs * 2)
    # Reverse the sign so that minimising drives the value towards zero.
    return 1 - T.mean(pairwise_scores)

def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Probability of false alert: false positives / negatives at `threshold`.

    Predictions >= threshold count as alerts. The default threshold is a
    backend variable created once, when this function is defined.
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Total number of negative labels.
    n_negative = K.sum(1 - y_true)
    # Alerts raised on negative labels.
    false_alerts = K.sum(alerts - alerts * y_true)
    return false_alerts / n_negative

def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Probability of true alert: true positives / positives at `threshold`.

    Predictions >= threshold count as alerts. The default threshold is a
    backend variable created once, when this function is defined.
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Total number of positive labels.
    n_positive = K.sum(y_true)
    # Alerts raised on positive labels.
    true_alerts = K.sum(alerts * y_true)
    return true_alerts / n_positive

def auc_metric(y_true, y_pred):
    """Negated area under a 1000-threshold ROC approximation.

    The area is accumulated as true-alert rate times the drop in false-alert
    rate between consecutive thresholds, and returned with a minus sign, so
    lower values mean a better ranking.
    """
    thresholds = np.linspace(0, 1, 1000)
    ptas = tf.stack([binary_PTA(y_true, y_pred, t) for t in thresholds], axis=0)
    pfas = tf.stack([binary_PFA(y_true, y_pred, t) for t in thresholds], axis=0)
    # Prepend a false-alert rate of 1 so the first threshold has a bin too.
    pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
    bin_widths = -(pfas[1:] - pfas[:-1])
    return -K.sum(ptas * bin_widths, axis=0)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Convolution1D, Activation, Merge, Reshape
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam, SGD, Adadelta
from keras.layers.advanced_activations import PReLU
from keras.utils.np_utils import to_categorical
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding


def batch_generator(X, y, BATCH_SIZE, EPOCH_PARTION):
    """
    Endless batch generator for nnet training; yields (X_batch, y_batch).
    input:
        X - train dataset, numpy array or csr matrix
        y - target, numpy array of 0/1 labels; yielded one-hot encoded with 2 classes
        BATCH_SIZE - int, number of objects in batch. If X is csr matrix, it will be transformed 
        to dense array so batch size must be small enough for this array to fit in memory.
        EPOCH_PARTION - float. If in interval (0, 1) - share of objects that will be used for training in epoch.
            Objects are chosen randomly (with replacement). If equals to 1 - batches walk through all 
            samples in order, without randomization, and start over after the last batch.
    """
    n_samples = X.shape[0]
    sample_index = np.arange(n_samples)
    batches_per_epoch = int(np.ceil(n_samples / BATCH_SIZE * EPOCH_PARTION))
    is_sparse = type(X).__name__ == 'csr_matrix'
    # Encode the labels once instead of re-encoding the whole vector for every batch.
    y_onehot = to_categorical(y, num_classes=2)

    batch_number = 0
    while True:
        if EPOCH_PARTION == 1:
            batch_indexes = sample_index[BATCH_SIZE*batch_number : BATCH_SIZE*(batch_number+1)]
        else:
            batch_indexes = np.random.choice(n_samples, BATCH_SIZE)

        if is_sparse:
            X_batch = X[batch_indexes].toarray()
        else:
            X_batch = X[batch_indexes]
        y_batch = y_onehot[batch_indexes]

        batch_number += 1
        # Wrap only after the LAST batch of the epoch was produced. The old
        # test (== batches_per_epoch-1) skipped the final batch and never
        # wrapped at all when an epoch consisted of a single batch.
        if batch_number >= batches_per_epoch:
            batch_number = 0
        yield X_batch, y_batch
            
def batch_generator_p(X, BATCH_SIZE):
    """
    Endless batch generator for nnet predictions; yields consecutive row blocks of X
    and starts over from the first rows after the last block.
    input:
        X - dataset, numpy array or csr matrix
        BATCH_SIZE - number of objects in batch. If X is csr matrix, each batch is converted 
        to a dense array, so batch size must be small enough for that array to fit in memory
    """
    n_rows = X.shape[0]
    row_ids = np.arange(n_rows)
    n_batches = np.ceil(n_rows / BATCH_SIZE)
    is_sparse = type(X).__name__ == 'csr_matrix'

    step = 0
    while True:
        start = BATCH_SIZE * step
        chunk = X[row_ids[start:start + BATCH_SIZE]]
        step += 1
        yield chunk.toarray() if is_sparse else chunk
        # Checked after the yield: the wrap happens when the consumer asks for the next batch.
        if step == n_batches:
            step = 0
            
def compile_nnet(data, n0, d0, n1, n2, n3, d1, d2, d3, regul, use_batch_norm, **kwargs):
    """
    Build and compile the entity-embedding nnet.

    Architecture: one Embedding branch per embedded categorical input plus one
    Dense branch for the remaining 24 features; the branches are concatenated
    and followed by two (optionally three) Dense -> PReLU -> [BatchNorm] -> Dropout
    blocks and a single sigmoid output.

    input:
        data - not used; kept so existing calls keep working
        n0, d0 - neurons and dropout of the Dense branch for non-embedded features
        n1, n2, n3 - ints, neurons of the hidden layers after the merge; the third
            layer is added only if n3 > 0
        d1, d2, d3 - floats, dropouts after the corresponding hidden layers
        regul - float; if > 0, hidden layers get l2(regul) kernel and l1(regul)
            activity regularization
        use_batch_norm - bool, add BatchNormalization after each PReLU
        **kwargs - ignored, so one parameter dictionary can also carry
            training settings
        parameters might be passed as a dictionary
    output:
        compiled nnet model (binary_crossentropy loss, auc_metric metric, adam)
    """
    # (column, number of categories, embedding width), in model-input order.
    # NOTE(review): sizes are hard-coded; presumably they match the sorted
    # `embed_cols` of this dataset — confirm if the data changes.
    embedding_specs = (
        ('ps_car_01_cat', 13, 7),
        ('ps_car_02_cat', 3, 2),
        ('ps_car_03_cat', 3, 2),
        ('ps_car_04_cat', 10, 5),
        ('ps_car_05_cat', 3, 2),
        ('ps_car_06_cat', 18, 6),
        ('ps_car_07_cat', 3, 2),
        ('ps_car_09_cat', 6, 3),
        ('ps_car_10_cat', 3, 2),
        ('ps_car_11_cat', 104, 10),
        ('ps_ind_02_cat', 5, 3),
        ('ps_ind_04_cat', 3, 2),
        ('ps_ind_05_cat', 8, 5),
    )

    models = []
    for _column, n_categories, width in embedding_specs:
        branch = Sequential()
        branch.add(Embedding(n_categories, width, input_length=1))
        # Drop the length-1 sequence axis: (1, width) -> (width,)
        branch.add(Reshape(target_shape=(width,)))
        models.append(branch)

    # Branch for all non-embedded features (24 columns).
    model_rest = Sequential()
    model_rest.add(Dense(n0, input_dim=24))
    model_rest.add(Dropout(d0))
    models.append(model_rest)

    model = Sequential()
    model.add(Merge(models, mode='concat'))

    def add_hidden_block(units, dropout):
        # Dense -> PReLU -> [BatchNorm] -> Dropout, regularized when regul > 0.
        if regul > 0:
            model.add(Dense(units, 
                            kernel_regularizer=l2(regul), 
                            activity_regularizer=l1(regul)))
        else:
            model.add(Dense(units))
        model.add(PReLU())
        if use_batch_norm:
            model.add(BatchNormalization(axis=1))
        model.add(Dropout(dropout))

    add_hidden_block(n1, d1)
    add_hidden_block(n2, d2)
    if n3 > 0:
        add_hidden_block(n3, d3)

    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # Alternative tried before: loss=pair_loss, metrics=[pair_loss].
    model.compile(loss='binary_crossentropy', metrics=[auc_metric], optimizer='adam') 
    
    return model

# def nnet_pred(params, train, ytrain, valid, yvalid, 
#              # test_fold, 
#               kagg):
#     """
#     input:
#         params - dictionary of parameters to be passed to function compile_nnet. May also contain size of a 
#             batch and share of objects per epoch
#         train, valid, test_fold, kagg - numpy arrays or csr matrices. Nnet is trained on train data, best 
#             number of epochs is chosen by binary_crossentropy loss on valid. Best model is saved every epoch 
#             and is loaded if there was no improvement on valid set for 10 epochs in a row. Test_fold and 
#             kagg - matrices, for which predictions are returned.
#         ytrain, yvalid - 1-dim numpy arrays, labels for train and valid sets.
#     output: two 1-dim numpy arrays with predicted positive class probability for test_fold and kagg datasets
#     """
    
#     if params.get('use_scaler', False):
#         if type(train).__name__ == 'csr_matrix':
#             scaler = MaxAbsScaler()
#         else:
#             scaler = MinMaxScaler()
#         train = scaler.fit_transform(train)
#         valid = scaler.transform(valid)
#         test_fold  = scaler.transform(test_fold)
#         kagg  = scaler.transform(kagg)
    
#     model = compile_nnet(train, **params)
#     early_stopper = EarlyStopping(monitor='val_binary_crossentropy', patience=10, verbose=0, mode='auto')
#     checkpoint = ModelCheckpoint(filepath='nnet_checkpoint.hdf5', 
#                                  monitor='val_binary_crossentropy', 
#                                  save_best_only=True)

#     BATCH_SIZE = params.get('BATCH_SIZE', 256)
#     EPOCH_PARTION = params.get('EPOCH_PARTION', 1)
#     #nb_epoch
#     model.fit(train, ytrain, sample_weight=np.array([1 if i==0 else 5 for i in ytrain]),
#               epochs=500, batch_size=BATCH_SIZE, verbose=0,

#                         validation_data=(valid, yvalid, np.array([1 if i==0 else 5 for i in yvalid])),
#                         callbacks=[early_stopper, checkpoint]
#              )
    
#     model.load_weights('nnet_checkpoint.hdf5') 
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_crossentropy'])
    
#     fold_pred = model.predict(valid)
#     kagg_pred = model.predict(kagg)
    
#     return fold_pred.reshape(-1), kagg_pred.reshape(-1)

In [None]:
def nnet_pred(params, train, ytrain, valid, yvalid, kagg):
    """
    Train the nnet and predict for the validation fold and for `kagg`.
    input:
        params - dictionary passed to compile_nnet; may also contain 'BATCH_SIZE'
            (default 256) and 'scale_pos_weight' (default 1, sample weight of
            samples whose label is not 0)
        train, valid, kagg - model inputs for the train, validation and final sets
        ytrain, yvalid - labels for train and valid
    output: two 1-dim numpy arrays with predictions for valid and kagg, made with
        the weights stored in 'best_auc_model.hdf5' by AUC_SKlearn_callback
    """
    model = compile_nnet(train, **params)

    auc_saver = AUC_SKlearn_callback(train, ytrain, useCv=True)
    # auc_metric is a negated AUC, hence mode='min'.
    early_stopping = callbacks.EarlyStopping(monitor='val_auc_metric', min_delta=0.00001, patience=6,
                                             verbose=2, mode='min')

    batch_size = params.get('BATCH_SIZE', 256)
    pos_weight = params.get('scale_pos_weight', 1)
    weights = np.array([1 if label == 0 else pos_weight for label in ytrain])

    model.fit(train, ytrain, sample_weight=weights,
              epochs=100, batch_size=batch_size, verbose=0,
              validation_data=(valid, yvalid),
              callbacks=[auc_saver, early_stopping])

    # Restore the weights of the epoch with the best validation AUC.
    model.load_weights('best_auc_model.hdf5') 
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc_metric])

    fold_pred = model.predict(valid).reshape(-1)
    kagg_pred = model.predict(kagg).reshape(-1)
    return fold_pred, kagg_pred

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

from sklearn.utils import shuffle
from sklearn.model_selection import GroupKFold, KFold
from scipy.stats import spearmanr

def xgb_pred(params, train, ytrain, valid, yvalid, test_fold, kagg):
    """
    input:
        params - dictionary of parameters to be passed to xgb.train. 'booster' defaults to 
            'gbtree' when absent. The evaluation metric is maximized (maximize=True).
        train, valid, test_fold, kagg - numpy arrays or csr matrices. Model is trained on train data, best 
            number of rounds is chosen by the metric on valid (early stopping after 50 rounds without 
            improvement). Test_fold and kagg - matrices, for which predictions are returned.
        ytrain, yvalid - 1-dim numpy arrays, labels for train and valid sets.
    output: two 1-dim numpy arrays with predicted positive class probability for test_fold and kagg datasets
    """
    
    dtrain = xgb.DMatrix(train, ytrain)
    dvalid = xgb.DMatrix(valid, yvalid)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    boost = xgb.train(params, dtrain, 
                    num_boost_round=10000, 
                    evals=watchlist,
                    verbose_eval=False,
                    maximize=True,
                    early_stopping_rounds=50)
    
    # if we trained a linear model, then it has no ntree_limit parameter
    if params.get('booster', 'gbtree') == 'gbtree':
        # best_iteration is a 0-based round index, while ntree_limit is a tree
        # COUNT (and 0 means "all trees"); best_ntree_limit is the count that
        # includes the best round.
        n_trees = boost.best_ntree_limit
        fold_pred = boost.predict(xgb.DMatrix(test_fold), ntree_limit=n_trees)
        kagg_pred = boost.predict(xgb.DMatrix(kagg),      ntree_limit=n_trees)
        print (boost.best_iteration)
    else:
        fold_pred = boost.predict(xgb.DMatrix(test_fold))
        kagg_pred = boost.predict(xgb.DMatrix(kagg))
        
    return fold_pred, kagg_pred

def lgb_pred(params, train, ytrain, valid, yvalid, test_fold, kagg):
    """
    input:
        params - dictionary of parameters to be passed to lgb.train
        train, valid, test_fold, kagg - numpy arrays or csr matrices. Model is trained on train data with 
            early stopping (50 rounds); train and valid are both passed as validation sets. 
            Test_fold and kagg - matrices, for which predictions are returned.
        ytrain, yvalid - 1-dim numpy arrays, labels for train and valid sets.
    output: two 1-dim numpy arrays with predictions for test_fold and kagg datasets, made at the 
        best iteration (which is also printed)
    """
    train_set = lgb.Dataset(train, ytrain)
    valid_set = lgb.Dataset(valid, yvalid, reference=train_set)

    booster = lgb.train(params, train_set,
                        num_boost_round=100000,
                        valid_sets=[train_set, valid_set],
                        verbose_eval=False,
                        early_stopping_rounds=50)

    best_round = booster.best_iteration
    fold_pred = booster.predict(test_fold, num_iteration=best_round)
    kagg_pred = booster.predict(kagg, num_iteration=best_round)
    print (best_round)

    return fold_pred, kagg_pred

def fastfm_pred(params, train, ytrain, test_fold, kagg):
    """
    Fit a factorization-machine classifier and predict two hold-out matrices.

    input:
        params - dictionary of keyword arguments for FMClassification; None means library defaults
        train, ytrain - features and labels the model is fitted on
        test_fold, kagg - matrices for which predictions are returned
    output: two arrays with the predict_proba output for test_fold and for kagg
    """
    fmc = FMClassification(**(params or {}))
    fmc.fit(train, ytrain)
    # Score the fold handed in by the caller, never a notebook-level variable: 
    # each CV fold must get predictions for its own held-out rows.
    fold_pred = fmc.predict_proba(test_fold)
    kagg_pred = fmc.predict_proba(kagg)
    return fold_pred, kagg_pred

def mod_stat(y, train_duperate):
    """
    Re-weight predictions y from a positive rate of train_duperate to a positive rate of 0.5.

    input:
        y - prediction(s) in [0, 1]: a scalar or a numpy array
        train_duperate - positive rate the predictions were made under; must not be 0 or 1
    output: a*y / (a*y + b*(1-y)) with a = 0.5/train_duperate and b = 0.5/(1-train_duperate),
        same shape as y
    """
    target_rate = 0.5
    pos_weight = target_rate/train_duperate
    neg_weight = (1-target_rate) / (1-train_duperate)
    weighted_pos = pos_weight*y
    return weighted_pos / (weighted_pos + neg_weight*(1-y))

    
def _take_rows(matrix, row_ind):
    # Positional row selection for DataFrames (.iloc) as well as numpy arrays / csr matrices.
    if hasattr(matrix, 'iloc'):
        return matrix.iloc[row_ind]
    return matrix[row_ind]


def _fit_predict_sklearn(model, fit_data, fit_y, fold_data, kagg):
    # Fit an sklearn-style model and predict fold_data and kagg: positive-class column of
    # predict_proba first, then raw predict_proba, then predict as the last resort.
    model.fit(fit_data, fit_y)
    try:
        return model.predict_proba(fold_data)[:,1], model.predict_proba(kagg)[:,1]
    except Exception:
        try:
            return model.predict_proba(fold_data), model.predict_proba(kagg)
        except Exception:
            return model.predict(fold_data), model.predict(kagg)


def get_oofs(model, data, kagg, y, features=[], cat_features=[], n_splits=5, iters_total=2, 
             params=None, valid_size=0, SEED=100, ykagg=None):
    """
    Build out-of-fold (OOF) predictions for train data and fold-averaged predictions for test data.

    input:
        model: string ('lgb'/'fastfm'/'xgb'/'nnet') or any object with sklearn-style 
            fit / predict_proba / predict methods
        data: train data. For 'lgb'/'xgb'/'nnet' it must be a pandas DataFrame (rows are oversampled 
            and handed to preproc as DataFrames). For 'fastfm' and sklearn-style models a numpy array 
            or csr sparse matrix is accepted as well. Must not contain target or ids or non-numeric 
            columns
        kagg: test data, same layout and columns as data
        y: binary target for train data in a form of pandas Series / single-column DataFrame / 
            numpy array
        features: columns to keep when data / kagg are DataFrames. Empty means all columns
        cat_features: categorical columns. For 'lgb'/'xgb' each of them is replaced in the feature 
            list by a column named '<name>_enc' (presumably created by preproc - TODO confirm)
        n_splits: number of stratified folds. Default value is 5. In order to get one OOF prediction, 
            model must be fitted n_splits times
        iters_total: number of repetitions of the whole K-fold procedure. Default value is 2. 
            Total number of times a model will be fitted is n_splits*iters_total
        params: model parameters in a form of dictionary
        valid_size: only for 'lgb'/'xgb'/'nnet'. If > 0, one of int(1/valid_size) stratified parts 
            of every train fold is held out as validation set. If 0, the test fold itself is 
            used as validation set
        SEED: base random seed for the fold splits and for the oversampling shuffle
        ykagg: optional target for kagg. When given, the printed per-fold score is computed on 
            kagg, otherwise on the test fold
    output: tuple of four 1-dim numpy arrays
        data_oofs: OOF predictions for train data, rescaled by mod_stat, averaged over iters_total
        kagg_oofs: predictions for test data, rescaled by mod_stat, averaged over 
            n_splits*iters_total fits
        data_oofs_ranks: sum over iterations of the within-fold rank of every train prediction 
            (not averaged)
        kagg_oofs_ranks: sum over all fits of the rank of every test prediction (not averaged)
    """
        
    if type(data).__name__=='DataFrame':
        if len(features) == 0:
            features = data.columns
        data = data[features]
    if type(kagg).__name__=='DataFrame':
        if len(features) == 0:
            features = kagg.columns
        kagg = kagg[features]

    # From here on the target is a flat numpy array, so positional indexing with fold indices 
    # behaves the same for every accepted input type.
    if type(y).__name__=='Series' or type(y).__name__=='DataFrame':
        y = y.values
    y = np.asarray(y).ravel()
        
    if model == 'fastfm':
        # labels are recoded from {0, 1} to {-1, 1} for the factorization machine
        y = np.where(y == 0, -1, y)
        if type(data).__name__ != 'csr_matrix':
            data = sparse.csr_matrix(data)
            kagg = sparse.csr_matrix(kagg)
    
    # arrays to store predictions
    data_oofs = np.zeros((data.shape[0]))
    kagg_oofs = np.zeros((kagg.shape[0]))
    
    data_oofs_ranks = np.zeros((data.shape[0]))
    kagg_oofs_ranks = np.zeros((kagg.shape[0]))

    # Only these models consume the oversampled / preprocessed DataFrames built below.
    is_frame_model = isinstance(model, str) and model in ('lgb', 'xgb', 'nnet')
    # Dedicated generator: the oversampling shuffle is reproducible for a given SEED.
    rng = np.random.RandomState(SEED)
    # Every positive row is added this many extra times (1 original + 3 copies).
    extra_pos_copies = 3
    
    for iter_num in range(iters_total):
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=iter_num+SEED)
        for train_ind, test_ind in kf.split(data, y):
            kagg_fold = kagg
            
            if is_frame_model:
                if valid_size > 0:
                    # Hold out the first of int(1/valid_size) stratified parts of the train fold 
                    # as validation set. Inner indices get their own names so that train_ind 
                    # keeps pointing at rows of the full data.
                    kf_valid = StratifiedKFold(n_splits=int(1/valid_size), shuffle=True, 
                                               random_state=iter_num+SEED)
                    fold_data, fold_y = data.iloc[train_ind], y[train_ind]
                    sub_train_ind, sub_valid_ind = next(kf_valid.split(fold_data, fold_y))
                    valid_data, yvalid = fold_data.iloc[sub_valid_ind], fold_y[sub_valid_ind]
                    train_data, ytrain = fold_data.iloc[sub_train_ind], fold_y[sub_train_ind]
                else:
                    train_data, ytrain = data.iloc[train_ind], y[train_ind]
                    valid_data, yvalid = data.iloc[test_ind],  y[test_ind]

                train_data = train_data.reset_index(drop=True)
                valid_data = valid_data.reset_index(drop=True)

                # Oversample the positive class, then shuffle rows and labels together.
                pos = ytrain == 1
                pos_rows = train_data.loc[pos]
                train_data = pd.concat([train_data] + [pos_rows] * extra_pos_copies, axis=0)
                ytrain = np.concatenate([ytrain] + [ytrain[pos]] * extra_pos_copies)
                idx = rng.permutation(len(train_data))
                train_data = train_data.iloc[idx]
                ytrain = ytrain[idx]

                proc_train_data, proc_valid_data, proc_kagg_fold = preproc(train_data, valid_data, kagg_fold)
                new_features = list(set(features) - set(cat_features)) + list(map(lambda x: x+'_enc', cat_features))

            if model=='lgb':
                fold_pred, kagg_pred = lgb_pred (params, train_data[new_features], ytrain, 
                                                         valid_data[new_features], yvalid,
                                                         data.iloc[test_ind][new_features], 
                                                         kagg_fold[new_features])
            elif model=='xgb':
                fold_pred, kagg_pred = xgb_pred (params, train_data[new_features], ytrain, 
                                                         valid_data[new_features], yvalid,
                                                         data.iloc[test_ind][new_features], 
                                                         kagg_fold[new_features])
            elif model=='nnet':
                # NOTE(review): no test-fold matrix is passed here, so fold_pred presumably holds 
                # predictions for proc_valid_data. That equals the test fold only when 
                # valid_size == 0 - TODO confirm against nnet_pred before using valid_size > 0.
                fold_pred, kagg_pred = nnet_pred(params, proc_train_data, ytrain, 
                                                         proc_valid_data, yvalid,
                                                         proc_kagg_fold)
            elif model=='fastfm':
                fold_pred, kagg_pred = fastfm_pred(params, data[train_ind], y[train_ind], data[test_ind], kagg)

            # Block for working with sklearn models
            else:
                fold_pred, kagg_pred = _fit_predict_sklearn(model, 
                                                            _take_rows(data, train_ind), y[train_ind],
                                                            _take_rows(data, test_ind), kagg)
                        
            data_oofs[test_ind] += mod_stat(fold_pred, np.mean(fold_pred))
            kagg_oofs += mod_stat(kagg_pred, np.mean(kagg_pred))
            
            data_oofs_ranks[test_ind] += fold_pred.argsort().argsort()
            kagg_oofs_ranks += kagg_pred.argsort().argsort()

            # Normalized gini on the test set when its target is known, otherwise on the test fold.
            if ykagg is not None:
                fold_score = gini(ykagg, kagg_pred)/gini(ykagg, ykagg)
            else:
                fold_score = gini(y[test_ind], fold_pred)/gini(y[test_ind], y[test_ind])
            print ('fold loss: ', round(fold_score, 4), round(np.mean(kagg_pred), 4))
    
        print ('iteration OOF score:', gini(y, data_oofs/(iter_num+1))/gini(y, y))
    data_oofs /= iters_total
    kagg_oofs /= (iters_total*n_splits)
    return data_oofs, kagg_oofs, data_oofs_ranks, kagg_oofs_ranks

# Prediction stores: the experiment cells below add one column per model
# (train_preds for train rows, test_preds for test rows).
train_preds, test_preds = pd.DataFrame(), pd.DataFrame()

In [None]:
%%time
col = 'nnet'
params = {'n0': 32, 'd0':0.25, 
          'n1':128, 'n2':64, 'n3':0,
          'd1':0.25, 'd2':0.25, 'd3':0, 'regul':0, 
          'use_batch_norm': False, 'scale_pos_weight': 5, 
          'BATCH_SIZE':4096, 'EPOCH_PARTION':1, 'use_scaler':False}

train_preds[col], test_preds[col], train_preds[col+'_rank'], \
test_preds[col+'_rank']= get_oofs('nnet'
                                ,train#.iloc[:10000]
                                ,test#.iloc[:10000]
                                ,target_train#[:10000]
                                ,list(train.columns)#sorted(list(set(features)-set([''])))
                                ,cat_features=[]
                                ,n_splits=5
                                ,iters_total=1
                                ,params=params
                                ,ykagg=target_test#[:10000]
                                ,SEED=2
                                 )

In [None]:
%%time
col = 'nnet'
params = {'n0': 32, 'd0':0.25, 
          'n1':128, 'n2':64, 'n3':0,
          'd1':0.25, 'd2':0.25, 'd3':0, 'regul':0, 
          'use_batch_norm': False, 'scale_pos_weight': 10, 
          'BATCH_SIZE':4096, 'EPOCH_PARTION':1, 'use_scaler':False}

train_preds[col], test_preds[col], train_preds[col+'_rank'], \
test_preds[col+'_rank']= get_oofs('nnet'
                                ,train#.iloc[:10000]
                                ,test#.iloc[:10000]
                                ,target_train#[:10000]
                                ,list(train.columns)#sorted(list(set(features)-set([''])))
                                ,cat_features=[]
                                ,n_splits=5
                                ,iters_total=1
                                ,params=params
                                ,ykagg=target_test#[:10000]
                                ,SEED=2
                                 )

In [None]:
%%time
col = 'nnet'
params = {'n0': 32, 'd0':0.25, 
          'n1':64, 'n2':32, 'n3':0,
          'd1':0.25, 'd2':0.25, 'd3':0, 'regul':0, 
          'use_batch_norm': False, 'scale_pos_weight': 5, 
          'BATCH_SIZE':4096, 'EPOCH_PARTION':1, 'use_scaler':False}

train_preds[col], test_preds[col], train_preds[col+'_rank'], \
test_preds[col+'_rank']= get_oofs('nnet'
                                ,train#.iloc[:10000]
                                ,test#.iloc[:10000]
                                ,target_train#[:10000]
                                ,list(train.columns)#sorted(list(set(features)-set([''])))
                                ,cat_features=[]
                                ,n_splits=5
                                ,iters_total=1
                                ,params=params
                                ,ykagg=target_test#[:10000]
                                ,SEED=2
                                 )

In [None]:
%%time
# early stopping and los - logloss
col = 'target'
params = {'n0': 32, 'd0':0.25, 
          'n1':128, 'n2':64, 'n3':0,
          'd1':0.25, 'd2':0.25, 'd3':0, 'regul':0, 
          'use_batch_norm': False, 'scale_pos_weight': 20, 
          'BATCH_SIZE':4096, 'EPOCH_PARTION':1, 'use_scaler':False}

train_preds[col], test_preds[col], train_preds[col+'_rank'], \
test_preds[col+'_rank']= get_oofs('nnet'
                                ,train#.iloc[:10000]
                                ,test#.iloc[:10000]
                                ,target_train#[:10000]
                                ,list(train.columns)#sorted(list(set(features)-set([''])))
                                ,cat_features=[]
                                ,n_splits=5
                                ,iters_total=2
                                ,params=params
                                ,ykagg=None#target_test#[:10000]
                                ,SEED=100
                                 )
test_preds[[ 'target']].to_csv('test_11.csv', index=False)

In [None]:
%%time
# early stopping and los - logloss
col = 'target'
params = {'n0': 16, 'd0':0.2, 
          'n1': 64, 'n2':32, 'n3':16,
          'd1':0.3, 'd2':0.2, 'd3':0.2, 'regul':0, 
          'use_batch_norm': False, 'scale_pos_weight': 20, 
          'BATCH_SIZE':4096, 'EPOCH_PARTION':1, 'use_scaler':False}

train_preds[col], test_preds[col], train_preds[col+'_rank'], \
test_preds[col+'_rank']= get_oofs('nnet'
                                ,train#.iloc[:10000]
                                ,test#.iloc[:10000]
                                ,target_train#[:10000]
                                ,list(train.columns)#sorted(list(set(features)-set([''])))
                                ,cat_features=[]
                                ,n_splits=5
                                ,iters_total=2
                                ,params=params
                                ,ykagg=None#target_test#[:10000]
                                ,SEED=200
                                 )
test_preds[[ 'target']].to_csv('test_15.csv', index=False)

In [None]:
%%time
#early stopping - auc
col = 'target'
params = {'n0': 16, 'd0':0.2, 
          'n1': 64, 'n2':32, 'n3':16,
          'd1':0.3, 'd2':0.2, 'd3':0.2, 'regul':0, 
          'use_batch_norm': False, 'scale_pos_weight': 20, 
          'BATCH_SIZE':4096, 'EPOCH_PARTION':1, 'use_scaler':False}

train_preds[col], test_preds[col], train_preds[col+'_rank'], \
test_preds[col+'_rank']= get_oofs('nnet'
                                ,train#.iloc[:10000]
                                ,test#.iloc[:10000]
                                ,target_train#[:10000]
                                ,list(train.columns)#sorted(list(set(features)-set([''])))
                                ,cat_features=[]
                                ,n_splits=5
                                ,iters_total=2
                                ,params=params
                                ,ykagg=None#target_test#[:10000]
                                ,SEED=200
                                 )
test_preds[['target']].to_csv('test_16.csv', index=False)

In [None]:
%%time
#early stopping - auc, oversampling 1+3
col = 'target'
params = {'n0': 16, 'd0':0.2, 
          'n1': 64, 'n2':32, 'n3':16,
          'd1':0.3, 'd2':0.2, 'd3':0.2, 'regul':0, 
          'use_batch_norm': False, 'scale_pos_weight': 5, 
          'BATCH_SIZE':4096, 'EPOCH_PARTION':1, 'use_scaler':False}

train_preds[col], test_preds[col], train_preds[col+'_rank'], \
test_preds[col+'_rank']= get_oofs('nnet'
                                ,train#.iloc[:10000]
                                ,test#.iloc[:10000]
                                ,target_train#[:10000]
                                ,list(train.columns)#sorted(list(set(features)-set([''])))
                                ,cat_features=[]
                                ,n_splits=5
                                ,iters_total=2
                                ,params=params
                                ,ykagg=None#target_test#[:10000]
                                ,SEED=200
                                 )
test_preds[['target']].to_csv('test_17.csv', index=False)