In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import pickle
import time

import warnings
warnings.filterwarnings('ignore')

from scipy import sparse

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler

DATA_PATH = 'data/'

# Data loading

In [2]:
%%time
# File names of the per-split feature tables. Each list is read in order and
# the resulting frames are concatenated column-wise.
train_files = ['train_nonNLP_features.csv',
               'train_NLP_features.csv',
               'train_pos_diff_matrix.csv',
               'train_PN.csv',
               'train_lgb_tfidf_oof.csv',
               'train_nnet_tfidf_oof.csv',
               'train_fm_oof.csv']
test_files  = ['test_nonNLP_features.csv',
               'test_NLP_features.csv',
               'test_pos_diff_matrix.csv',
               'test_PN.csv',
               'test_lgb_tfidf_oof.csv',
               'test_nnet_tfidf_oof.csv',
               'test_fm_oof.csv']

train = pd.concat([pd.read_csv(os.path.join(DATA_PATH, name)) for name in train_files], axis=1)
test  = pd.concat([pd.read_csv(os.path.join(DATA_PATH, name)) for name in test_files],  axis=1)

# Target column and the graph id of every train row.
ytrain = pd.read_csv(os.path.join(DATA_PATH, 'target.csv'))['target']
ids = pd.read_csv(os.path.join(DATA_PATH, 'train_ids.csv'), usecols=['graph_id'])

# Missing values first, then +/-inf: both are encoded as -1, in place.
for frame in (train, test):
    frame.fillna(-1, inplace=True)
for frame in (train, test):
    frame.replace([-np.inf, np.inf], -1, inplace=True)

CPU times: user 2min 15s, sys: 46.7 s, total: 3min 1s
Wall time: 3min 19s


In [3]:
# Columns to exclude from the feature set. A pickled list is used when it is
# available; otherwise the hard-coded list below is the fallback.
try:
    # NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
    with open(os.path.join(DATA_PATH, 'bad_features.pkl'), 'rb') as F:
        bad_features = pickle.load(F)
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing / unreadable / corrupt file: fall back to the built-in list.
    bad_features = ['q2_char_len_src', 'bigram_all_jaccard_max_src', 'f_Qratio_src', 'pos_diff_end_src', 
                    'unigram_all_jaccard_max_src', 'unigram_jaccard_stem', 'jac_nostops', 'diffl_stem', 
                    'jac_stem', 'trigram_all_jaccard_stem', 'trigram_jaccard_src', 'trigram_jaccard_stem', 
                    'q1_char_len_stem', 'trigram_jaccard_nostops', 'bigram_all_jaccard_max_nostops', 
                    'unigram_all_jaccard_max_stem', 'unigram_jaccard_nostops', 'loc_q1_country_num', 
                    'bigram_all_jaccard_max_stem', 'bigram_all_jaccard_stem', 'unigram_all_jaccard_max_nostops', 
                    'q1_word_len_stem', 'loc_country_match_relative', 'f_Qratio_nostops', 'f_Qratio_stem', 
                    'unigram_jaccard_src', 'trigram_all_jaccard_max_nostops', 'trigram_all_jaccard_max_stem', 
                    'jac_src']

# sorted() makes the column order deterministic; a bare list(set(...)) depends on
# string hash randomisation and therefore changes between kernel restarts.
features = sorted(set(train.columns) - set(bad_features))
print (len(features))

185


In [4]:
# Convert object-typed test columns to float, mapping the 'xxx' marker to 0.
# NOTE(review): 'xxx' is presumably a placeholder for an unparseable value - confirm.
for col in test.columns:
    if test[col].dtypes == 'O':
        # Assign the result back instead of using inplace=True on a column
        # selection, which may act on a temporary copy and leave `test` unchanged.
        test[col] = test[col].replace('xxx', 0.).astype(float)

In [5]:
%%time

def load_sparse_csr(filename):
    """
    Load a csr matrix stored as an .npz archive.

    Input:
        filename: path to an archive holding the arrays 'data', 'indices', 'indptr' and 'shape'.
    Output: scipy csr matrix built from those arrays.
    """
    # The archive is opened as a context manager so the underlying file handle
    # is closed as soon as the arrays have been read.
    with np.load(filename) as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                                 shape=shape)

def batched_hstack_csr(matrices, batch_size=-1):
    """
    Horizontally stack matrices row-batch by row-batch and return one csr matrix.

    Each batch of rows is hstacked on its own and the batches are then stacked
    vertically, which avoids one large hstack over csr inputs.

    Input:
        matrices: list of matrices to be hstacked, all with the same number of rows
            (csr matrix, pandas DataFrame or numpy array).
        batch_size: int, number of rows per batch. -1 or 'dynamic' picks
            ceil(sqrt(number of rows)); 0 disables batching and performs a single hstack.
    Output: scipy csr matrix
    """
    n_rows = matrices[0].shape[0]

    if batch_size in (-1, 'dynamic'):
        batch_size = np.ceil(np.sqrt(n_rows))

    # Also reached when the dynamic size came out as 0 (no rows).
    if batch_size == 0:
        return sparse.hstack(matrices, format='csr')

    step = int(batch_size)
    row_blocks = [
        sparse.hstack([block[start:min(start + step, n_rows)] for block in matrices],
                      format='csr')
        for start in range(0, n_rows, step)
    ]
    return sparse.vstack(row_blocks, format='csr')


# Keep only the selected feature columns and convert the dense frames to csr.
# `train` / `test` are rebound here, so this cell must not be re-run without
# re-running the data loading cells first.
train = sparse.csr_matrix(train[features])
test  = sparse.csr_matrix(test [features])

# TF-IDF matrices stored on disk as csr component arrays.
train_tfidf = load_sparse_csr(os.path.join(DATA_PATH, 'data_tfidf_stem_tags.npz'))
test_tfidf  = load_sparse_csr(os.path.join(DATA_PATH, 'kagg_tfidf_stem_tags.npz'))

# Append the TF-IDF columns to the right of the dense feature columns.
train = batched_hstack_csr([train, train_tfidf])
test  = batched_hstack_csr([test,  test_tfidf ])

print ('Number of dense features:', len(features))
print ('Total number of features:', train.shape[1])

# The standalone TF-IDF matrices are no longer needed once stacked.
del train_tfidf, test_tfidf

Number of dense features: 185
Total number of features: 59811
CPU times: user 1min 52s, sys: 2min 31s, total: 4min 24s
Wall time: 7min 6s


# OOF blending pipeline

### Neural net definitions

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Convolution1D, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam, SGD, Adadelta
from keras.layers.advanced_activations import PReLU
from keras.utils.np_utils import to_categorical
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization

def batch_generator(X, y, BATCH_SIZE, EPOCH_PARTION):
    """
    Infinite batch generator for nnet training.
    input:
        X - train dataset, numpy array or csr matrix
        y - target, numpy array
        BATCH_SIZE - int, number of objects in batch. If X is csr matrix, each batch is transformed 
            to a dense array, so batch size must be small enough for this array to fit in memory.
        EPOCH_PARTION - float. If it equals 1, batches walk through all samples in their stored order
            (no randomization) and start over after the last batch. Otherwise every batch is 
            BATCH_SIZE rows drawn at random (with replacement) from X.
    output: yields (X_batch, y_batch), where y_batch is the one-hot (2 column) encoding of the labels.
    """
    
    n_samples = X.shape[0]
    sample_index = np.arange(n_samples)
    # At least one batch per pass, otherwise the counter below would never wrap around.
    batches_per_epoch = max(1, int(np.ceil(n_samples/BATCH_SIZE*EPOCH_PARTION)))
    is_csr = type(X).__name__ == 'csr_matrix'
    
    # One-hot encode the target once instead of re-encoding all of y for every batch.
    y_onehot = to_categorical(y, num_classes=2)
    
    batch_number = 0
    while True:
        if EPOCH_PARTION==1:
            batch_indexes = sample_index[BATCH_SIZE*batch_number : BATCH_SIZE*(batch_number+1)]
        else:
            batch_indexes = np.random.choice(n_samples, BATCH_SIZE)
        
        X_batch = X[batch_indexes]
        if is_csr:
            X_batch = X_batch.toarray()
        y_batch = y_onehot[batch_indexes]
        
        # Wrap only after the LAST batch of a pass has been produced; resetting one
        # step earlier would mean the final batch is never yielded.
        batch_number += 1
        if batch_number == batches_per_epoch:
            batch_number = 0
        yield X_batch, y_batch
            
def batch_generator_p(X, BATCH_SIZE):
    """
    Infinite batch generator for nnet predictions.
    input:
        X - dataset to predict on, numpy array or csr matrix
        BATCH_SIZE - number of objects in batch. A csr matrix batch is converted to a dense 
            array before it is yielded, so the batch must be small enough to fit in memory.
    output: yields consecutive row batches of X in stored order, starting over from the first
        row once the last batch has been yielded.
    """
    n_batches = np.ceil(X.shape[0]/BATCH_SIZE)
    row_ids = np.arange(X.shape[0])
    needs_densify = type(X).__name__ == 'csr_matrix'

    position = 0
    while True:
        start = BATCH_SIZE*position
        chunk = X[row_ids[start : start + BATCH_SIZE]]
        position += 1
        yield chunk.toarray() if needs_densify else chunk
        # Start over once a full pass has been produced.
        if position == n_batches:
            position = 0
            
def compile_nnet(data, n1, n2, d1, d2, regul, **kwargs):
    """
    Build and compile a simple feed-forward nnet:
    Dense(n1) -> PReLU -> Dropout(d1) -> Dense(n2) -> PReLU -> Dropout(d2) -> Dense(2) -> softmax.
    input:
        data - numpy array or csr matrix for training; only data.shape[1] is used (input dimension)
        n1, n2 - ints, number of neurons in first and second layers
        d1, d2 - float, dropouts in first and second layers
        regul - float, regularization parameter, the same for both layers. If it is > 0, both hidden 
            layers get an l2 kernel regularizer and an l1 activity regularizer; otherwise none.
        kwargs - ignored; allows a params dictionary with extra keys to be unpacked into this call
    output:
        nnet model compiled with binary_crossentropy loss and the 'adadelta' optimizer
    """
    def _hidden_layer_options():
        # Fresh regularizer objects for every layer, or nothing when regularization is off.
        if regul>0:
            return {'kernel_regularizer': l2(regul),
                    'activity_regularizer': l1(regul)}
        return {}

    model = Sequential()
    
    model.add(Dense(n1, input_dim=data.shape[1], **_hidden_layer_options()))
    model.add(PReLU())
    model.add(Dropout(d1))
    
    model.add(Dense(n2, **_hidden_layer_options()))
    model.add(PReLU())
    model.add(Dropout(d2))

    model.add(Dense(2))
    model.add(Activation('softmax'))
    
    model.compile(loss='binary_crossentropy',
                  optimizer='adadelta',
                  metrics=['binary_crossentropy'])
    return model

def nnet_pred(params, train, ytrain, valid, yvalid, test_fold, kagg):
    """
    Train the nnet with early stopping and predict for test_fold and kagg.
    input:
        params - dictionary of parameters to be passed to function compile_nnet. May also contain 
            'BATCH_SIZE' (default 256) and 'EPOCH_PARTION' (default 1, share of objects per epoch)
        train, valid, test_fold, kagg - numpy arrays or csr matrices. All four are scaled with a scaler 
            fitted on train (MaxAbsScaler for csr input, MinMaxScaler otherwise). Nnet is trained on 
            train data; training stops when val_binary_crossentropy has not improved for 10 epochs. 
            The best weights are checkpointed to 'nnet_checkpoint.hdf5' in the working directory and 
            reloaded before predicting. Test_fold and kagg - matrices, for which predictions are returned.
        ytrain, yvalid - 1-dim numpy arrays, labels for train and valid sets.
    output: two 1-dim numpy arrays with predicted positive class probability for test_fold and kagg datasets
    """
    
    if type(train).__name__ == 'csr_matrix':
        scaler = MaxAbsScaler()
    else:
        scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    valid = scaler.transform(valid)
    test_fold  = scaler.transform(test_fold)
    kagg  = scaler.transform(kagg)
    
    model = compile_nnet(train, **params)
    early_stopper = EarlyStopping(monitor='val_binary_crossentropy', patience=10, verbose=0, mode='auto')
    checkpoint = ModelCheckpoint(filepath='nnet_checkpoint.hdf5', 
                                 monitor='val_binary_crossentropy', 
                                 save_best_only=True)

    BATCH_SIZE = params.get('BATCH_SIZE', 256)
    EPOCH_PARTION = params.get('EPOCH_PARTION', 1)
    
    # Step counts are passed as whole numbers under their Keras 2 names 
    # (steps_per_epoch / epochs / steps) instead of the deprecated Keras 1 aliases.
    train_steps = max(1, int(np.ceil(train.shape[0]/BATCH_SIZE*EPOCH_PARTION)))
    # At least one validation step, so the monitored metric exists even for a tiny valid set.
    valid_steps = max(1, int(valid.shape[0]/BATCH_SIZE))
        
    model.fit_generator(generator=batch_generator(train, ytrain, BATCH_SIZE, EPOCH_PARTION),
                        steps_per_epoch=train_steps,
                        verbose=0, epochs=1000,

                        validation_data=batch_generator(valid, yvalid, BATCH_SIZE, EPOCH_PARTION), 
                        validation_steps=valid_steps,

                        callbacks=[early_stopper, checkpoint])
    
    # Restore the best checkpoint; the model is already compiled, no recompilation is needed to predict.
    model.load_weights('nnet_checkpoint.hdf5') 
    
    # ceil() so that the final, possibly smaller, batch is predicted as well.
    fold_pred = model.predict_generator(generator=batch_generator_p(test_fold, BATCH_SIZE), 
                                        steps=int(np.ceil(test_fold.shape[0]/BATCH_SIZE)))
    kagg_pred = model.predict_generator(generator=batch_generator_p(kagg, BATCH_SIZE), 
                                        steps=int(np.ceil(kagg.shape[0]/BATCH_SIZE)))
    
    return fold_pred[:,1], kagg_pred[:,1]

Using Theano backend.


### Other model definitions

In [7]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

from sklearn.utils import shuffle
from sklearn.model_selection import GroupKFold

def xgb_pred(params, train, ytrain, valid, yvalid, test_fold, kagg):
    """
    Train an xgboost model with early stopping and predict for test_fold and kagg.
    input:
        params - dictionary of parameters to be passed to xgb.train. 'booster' is optional
            (treated as 'gbtree' when absent).
        train, valid, test_fold, kagg - numpy arrays or csr matrices. Model is trained on train data, 
            training stops when the metric on valid has not improved for 20 rounds. Test_fold and 
            kagg - matrices, for which predictions are returned.
        ytrain, yvalid - 1-dim numpy arrays, labels for train and valid sets.
    output: two 1-dim numpy arrays with predictions for test_fold and kagg datasets
    """
    
    dtrain = xgb.DMatrix(train, ytrain)
    dvalid = xgb.DMatrix(valid, yvalid)
    # The validation set is listed last in the watchlist.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    boost = xgb.train(params, dtrain, 
                    num_boost_round=10000, 
                    evals=watchlist,
                    verbose_eval=False,
                    early_stopping_rounds=20)
    
    # A linear model has no trees, hence no ntree_limit.
    if params.get('booster', 'gbtree') != 'gblinear':
        # best_iteration is a 0-based round index while ntree_limit is a tree COUNT:
        # passing best_iteration drops the best round, and 0 would mean "use all trees".
        ntree_limit = getattr(boost, 'best_ntree_limit', boost.best_iteration + 1)
        fold_pred = boost.predict(xgb.DMatrix(test_fold), ntree_limit=ntree_limit)
        kagg_pred = boost.predict(xgb.DMatrix(kagg),      ntree_limit=ntree_limit)
    else:
        fold_pred = boost.predict(xgb.DMatrix(test_fold))
        kagg_pred = boost.predict(xgb.DMatrix(kagg))
        
    return fold_pred, kagg_pred

def lgb_pred(params, train, ytrain, valid, yvalid, test_fold, kagg):
    """
    Train a LightGBM model with early stopping and predict for test_fold and kagg.
    input:
        params - dictionary of parameters to be passed to lgb.train
        train, valid, test_fold, kagg - numpy arrays or csr matrices. Model is fitted on train, 
            with both train and valid passed as validation sets and early stopping after 20 rounds. 
            Test_fold and kagg - matrices, for which predictions are returned.
        ytrain, yvalid - 1-dim numpy arrays, labels for train and valid sets.
    output: two arrays with predictions (at the best iteration) for test_fold and kagg datasets
    """
    
    fit_set = lgb.Dataset(train, ytrain)
    # The validation Dataset is built with the training Dataset as its reference.
    holdout_set = lgb.Dataset(valid, yvalid, reference=fit_set)
    
    training_options = dict(num_boost_round=100000,
                            valid_sets=[fit_set, holdout_set],
                            verbose_eval=False,
                            early_stopping_rounds=20)
    gbm = lgb.train(params, fit_set, **training_options)
    
    predictions = [gbm.predict(matrix, num_iteration=gbm.best_iteration)
                   for matrix in (test_fold, kagg)]
    return predictions[0], predictions[1]

def fastfm_pred(params, train, ytrain, test_fold, kagg):
    """
    Fit a factorization machine classifier and predict for test_fold and kagg.
    input:
        params - dictionary of keyword arguments for FMClassification
        train, test_fold, kagg - matrices. Model is fitted on train; test_fold and kagg are 
            the matrices for which predictions are returned.
        ytrain - labels for train.
    output: predict_proba results for test_fold and kagg
    """
    # NOTE(review): FMClassification is not imported anywhere in the visible notebook; 
    # it presumably comes from the fastFM package - import it before using model='fastfm'.
    fmc = FMClassification(**params)
    fmc.fit(train, ytrain)
    # Predict for the held-out fold that was passed in, not for the notebook-level `test` matrix.
    fold_pred = fmc.predict_proba(test_fold)
    kagg_pred = fmc.predict_proba(kagg)
    return fold_pred, kagg_pred
    

### Main function

In [8]:
def get_oofs(model, data, kagg, y, ids, n_splits=5, iters_total=3, params=None, random_state=None):
    """
    Build blended out-of-fold (OOF) predictions for train data and averaged predictions for test data.
    
    input:
        model: string ('lgb'/'fastfm'/'xgb'/'nnet') or any sklearn-style model with fit and 
            predict_proba / predict methods
        data: train data in format of pandas DataFrame / numpy array / csr sparse matrix. All columns will be 
            used as features. Must not contain target or ids or non-numeric columns
        kagg: test data in format of pandas DataFrame / numpy array / csr sparse matrix. All columns will be 
            used as features. Must not contain target or ids or non-numeric columns
        y: target for train data in a form of pandas DataFrame / pandas Series / numpy array
        ids: pandas DataFrame with a graph_id column, one row per row of data (same order). Graph ids are 
            used to split train data by graph: questions of the same graph will always be in one fold.
        n_splits: number of splits for train data. Default value is 5. In order to get one OOF prediction, 
            model must be fitted n_splits times
        iters_total: number of total iterations. Default value is 3. OOFs then will be blended. 
            Total number of times a model will be fitted is n_splits*iters_total.
        params: model parameters in a form of dictionary (used by the string models only).
        random_state: None, int or numpy RandomState used to shuffle the folds. With None (default) the 
            splits are not seeded by this function; with a seed the whole sequence of splits is repeatable 
            while still differing from iteration to iteration.
    output: 
        data_oofs: numpy array of OOF predictions for train data. Result is blended iters_total times
        kagg_oofs: numpy array of predictions for test data. Result is blended n_splits*iters_total times
    raises:
        ValueError: if model is a string other than 'lgb', 'xgb', 'nnet' or 'fastfm'
    """
    
    builtin_models = ('lgb', 'xgb', 'nnet', 'fastfm')
    model_name = model if isinstance(model, str) else None
    if model_name is not None and model_name not in builtin_models:
        raise ValueError("Unknown model name %r; expected one of %s or an sklearn-style model"
                         % (model, ', '.join(builtin_models)))
    
    if isinstance(data, pd.DataFrame):
        data = data.values
    if isinstance(kagg, pd.DataFrame):
        kagg = kagg.values
    y = np.asarray(y)
    if model_name == 'fastfm':
        # Labels are recoded 0 -> -1 and the matrices are made sparse for the fastfm model.
        y = np.where(y == 0, -1, y)
        if type(data).__name__ != 'csr_matrix':
            data = sparse.csr_matrix(data)
            kagg = sparse.csr_matrix(kagg)
    
    # One shared generator: every KFold below advances it, so iterations get different splits.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    
    # matrices to store predictions
    data_oofs = np.zeros((data.shape[0]))
    kagg_oofs = np.zeros((kagg.shape[0]))
    
    graph_ids_unique = ids.graph_id.unique()
    
    def _rows_of(graphs):
        # Row POSITIONS (not index labels) of all rows belonging to the given graphs, 
        # so a non-default index on `ids` cannot misalign the selection.
        return np.flatnonzero(ids.graph_id.isin(graphs).values)
    
    for iter_num in range(iters_total):
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=rng)
        for train_graphs, test_graphs in kf.split(graph_ids_unique):
            graph_ids_train = graph_ids_unique[train_graphs]
            test_ind = _rows_of(graph_ids_unique[test_graphs])
            
            if model_name in ('lgb', 'xgb', 'nnet'):
                # These models need a validation set, carved out of the train fold by graph. 
                # First split of an 8-fold KFold, i.e. validation is 12.5% of the train graphs.
                kf_valid = KFold(n_splits=8, shuffle=True, random_state=rng)
                fit_pos, valid_pos = next(iter(kf_valid.split(graph_ids_train)))
                train_ind = _rows_of(graph_ids_train[fit_pos])
                valid_ind = _rows_of(graph_ids_train[valid_pos])
                
                predictor = {'lgb': lgb_pred, 'xgb': xgb_pred, 'nnet': nnet_pred}[model_name]
                fold_pred, kagg_pred = predictor(params, data[train_ind], y[train_ind], 
                                                         data[valid_ind], y[valid_ind],
                                                         data[test_ind], kagg)
            else:
                train_ind = _rows_of(graph_ids_train)
                if model_name == 'fastfm':
                    fold_pred, kagg_pred = fastfm_pred(params, data[train_ind], y[train_ind], 
                                                       data[test_ind], kagg)
                else:
                    # Block for working with sklearn models: try the positive class column of 
                    # predict_proba, then raw predict_proba, then predict.
                    model.fit(data[train_ind], y[train_ind])
                    try:
                        fold_pred = model.predict_proba(data[test_ind])[:,1]
                        kagg_pred = model.predict_proba(kagg)[:,1]
                    except Exception:
                        try:
                            fold_pred = model.predict_proba(data[test_ind])
                            kagg_pred = model.predict_proba(kagg)
                        except Exception:
                            fold_pred = model.predict(data[test_ind])
                            kagg_pred = model.predict(kagg)
            
            data_oofs[test_ind] += fold_pred
            kagg_oofs += kagg_pred
            print ('fold loss: ', log_loss(y[test_ind], fold_pred))            
        
        # Running blend: OOF sums so far divided by the number of finished iterations.
        print ('iteration OOF score:', log_loss(y, data_oofs/(iter_num+1)))
    data_oofs /= iters_total
    kagg_oofs /= (iters_total*n_splits)
    return data_oofs, kagg_oofs

In [9]:
%%time
# One column per model; filled with blended OOF (train) and averaged (test) predictions.
train_preds = pd.DataFrame()
test_preds  = pd.DataFrame()

# LightGBM parameters: a pickled dictionary is used when available, otherwise the defaults below.
try:
    # NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
    with open(os.path.join(DATA_PATH, 'lgb_best_params.pkl'), 'rb') as F:
        params = pickle.load(F)
except (OSError, EOFError, pickle.UnpicklingError):
    # Missing / unreadable / corrupt file: fall back to the built-in parameters.
    params = {'bagging_fraction': 0.9,
             'bagging_freq': 1,
             'boosting_type': 'gbdt',
             'feature_fraction': 0.85,
             'lambda_l2': 1,
             'learning_rate': 0.05,
             'max_bin': 500,
             'metric': 'binary_logloss',
             'min_data_in_leaf': 71,
             'min_gain_to_split': 0,
             'num_leaves': 32,
             'objective': 'binary',
             'task': 'train'}

train_preds['lgb_oof'], test_preds['lgb_oof'] = get_oofs('lgb'
                                                        ,train
                                                        ,test
                                                        ,ytrain
                                                        ,ids 
                                                        ,params=params)
print (log_loss(ytrain, train_preds.lgb_oof))

fold loss:  0.206652461036
fold loss:  0.210304145792
fold loss:  0.211509647168
fold loss:  0.210966099118
fold loss:  0.20803044701
iteration OOF score: 0.209479414098
fold loss:  0.204708527536
fold loss:  0.214105014042
fold loss:  0.213041694471
fold loss:  0.213451965665
fold loss:  0.204162369423
iteration OOF score: 0.208514718953
fold loss:  0.211135022158
fold loss:  0.215385021926
fold loss:  0.2054668681
fold loss:  0.204049653532
fold loss:  0.211985623477
iteration OOF score: 0.208117206419
0.208117206419
CPU times: user 4h 54min 52s, sys: 16min 38s, total: 5h 11min 30s
Wall time: 1h 1min 5s


In [10]:
# Persist both prediction tables next to the input data, without the index column.
train_preds_path = os.path.join(DATA_PATH, 'train_preds.csv')
test_preds_path  = os.path.join(DATA_PATH, 'test_preds.csv')
train_preds.to_csv(train_preds_path, index=False)
test_preds.to_csv(test_preds_path, index=False)