In [1]:
import os
import time 
import warnings
warnings.filterwarnings('ignore')

import nltk
import re
import pickle

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
# Plot styling: the second style sheet is applied on top of the first, then the
# default seaborn palette is set to 10 desaturated 'Set1' colours.
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set_palette('Set1', 10, desat=0.75)

import numpy as np
import pandas as pd
# Widen pandas' display limits (rows shown and characters per cell).
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 200)

import scipy.sparse as sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler



# Directory (relative to the notebook's working directory) that every input file
# is read from and every OOF prediction file is written to.
DATA_PATH = 'data/'

Populating the interactive namespace from numpy and matplotlib


In [2]:
%%time
def load_sparse_csr(filename):
    """
    Load a scipy CSR matrix stored as an ``.npz`` archive.

    input:
        filename: path to an npz archive holding the arrays 'data', 'indices',
            'indptr' and 'shape' of a CSR matrix
    output:
        scipy.sparse.csr_matrix rebuilt from those four arrays
    """
    # np.load returns a lazily-read NpzFile that keeps the archive open;
    # the context manager closes it once the arrays have been read.
    with np.load(filename) as loader:
        return sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                                 shape=loader['shape'])

# Inputs for every model below: the training target, the graph id of each training
# row, and the TF-IDF matrices for the train set (`data`) and the test set (`kagg`).
# For a quick smoke run, slice all four objects to the same leading rows (e.g. [:10000]).
target_path = os.path.join(DATA_PATH, 'target.csv')
ids_path = os.path.join(DATA_PATH, 'train_ids.csv')
ytrain = pd.read_csv(target_path)['target']
ids = pd.read_csv(ids_path, usecols=['graph_id'])

data_path = os.path.join(DATA_PATH, 'data_tfidf_stem_tags.npz')
kagg_path = os.path.join(DATA_PATH, 'kagg_tfidf_stem_tags.npz')
data = load_sparse_csr(data_path)
kagg = load_sparse_csr(kagg_path)


CPU times: user 2.56 s, sys: 792 ms, total: 3.35 s
Wall time: 3.47 s


In [3]:
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

import lightgbm as lgb
from fastFM.sgd import FMClassification

def lgb_pred(params, train, ytrain, valid, yvalid, test, kagg):
    """
    Fit a LightGBM booster with early stopping and score two feature matrices.

    input:
        params: dict of LightGBM parameters passed to lgb.train
        train, ytrain: features and target the booster is fitted on
        valid, yvalid: features and target used as the second evaluation set
        test: held-out fold features to predict
        kagg: test-set features to predict
    output:
        (fold_pred, kagg_pred): predictions for `test` and `kagg`, both made
        with the booster's best iteration
    """
    dtrain = lgb.Dataset(train, ytrain)
    dvalid = lgb.Dataset(valid, yvalid, reference=dtrain)

    # Up to 100000 rounds; training stops after 20 rounds without improvement.
    booster = lgb.train(
        params,
        dtrain,
        num_boost_round=100000,
        valid_sets=[dtrain, dvalid],
        verbose_eval=False,
        early_stopping_rounds=20,
    )

    best_round = booster.best_iteration
    fold_pred = booster.predict(test, num_iteration=best_round)
    kagg_pred = booster.predict(kagg, num_iteration=best_round)
    return fold_pred, kagg_pred

def fastfm_pred(params, train, ytrain, test, kagg):
    """
    Fit a fastFM factorization-machine classifier and score two feature matrices.

    input:
        params: dict of keyword arguments for FMClassification
        train, ytrain: features and target the classifier is fitted on
        test: held-out fold features to predict
        kagg: test-set features to predict
    output:
        (fold_pred, kagg_pred): predict_proba outputs for `test` and `kagg`
    """
    classifier = FMClassification(**params)
    classifier.fit(train, ytrain)
    return classifier.predict_proba(test), classifier.predict_proba(kagg)
        
def _predict_scores(model, X):
    """Return positive-class scores for X from a fitted sklearn-style estimator."""
    try:
        proba = model.predict_proba(X)
    except (AttributeError, NotImplementedError):
        # The estimator exposes no probabilities: fall back to its raw predictions.
        return model.predict(X)
    proba = np.asarray(proba)
    if proba.ndim == 2 and proba.shape[1] > 1:
        # Two-column output: column 1 is the positive class.
        return proba[:, 1]
    return proba.ravel()


def get_oofs(model, data, kagg, y, ids, 
             n_splits=5, iters_total=3, params=None, random_state=None):
    """
    Build out-of-fold (OOF) predictions for the train set and averaged predictions
    for the test set, splitting folds by graph so that all rows of one graph
    always land in the same fold.

    input:
        model: string ('lgb'/'fastfm'/'xgb'/'nnet') or any sklearn-style model.
            String models are dispatched to lgb_pred / fastfm_pred / xgb_pred /
            nnet_pred, which must be defined before this function is called.
        data: train data as pandas DataFrame / numpy array / csr sparse matrix.
            All columns are used as features; must not contain target, ids or
            non-numeric columns.
        kagg: test data, same format and columns as `data`.
        y: train target as pandas DataFrame / pandas Series / numpy array.
            For 'fastfm' zeros are re-coded to -1 before fitting.
        ids: pandas DataFrame with a `graph_id` column; row i describes row i of
            `data` (rows are matched by position, not by index label).
        n_splits: number of folds. Default 5. One full OOF prediction requires
            fitting the model n_splits times.
        iters_total: number of times the whole K-fold procedure is repeated with
            a new shuffle. Default 3. The repeated OOFs are averaged, so the model
            is fitted n_splits*iters_total times in total.
        params: model parameters as a dictionary (used by the string models only).
        random_state: optional int seed for all fold shuffling. Default None keeps
            the previous behaviour (numpy's global random state).
    output:
        data_oofs: numpy array of OOF predictions for train data, averaged over
            iters_total repetitions
        kagg_oofs: numpy array of predictions for test data, averaged over
            n_splits*iters_total fitted models
    raises:
        ValueError: if `model` is an unknown string, or if data, y and ids do not
            have the same number of rows
    """
    named_models = ('lgb', 'xgb', 'nnet', 'fastfm')
    is_named = isinstance(model, str)
    if is_named and model not in named_models:
        raise ValueError('unknown model name {!r}; expected one of {} or an sklearn-style model'
                         .format(model, named_models))

    if isinstance(data, pd.DataFrame):
        data = data.values
    if isinstance(kagg, pd.DataFrame):
        kagg = kagg.values

    # Normalise the target to a numpy array first, so every later step works
    # for Series, DataFrame and ndarray input alike.
    if isinstance(y, (pd.Series, pd.DataFrame)):
        y = y.values
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    if not (data.shape[0] == len(y) == len(ids)):
        raise ValueError('data, y and ids must have the same number of rows, got {}, {} and {}'
                         .format(data.shape[0], len(y), len(ids)))

    if is_named and model == 'fastfm':
        # Zeros in the target are re-coded to -1 for fastFM.
        y = np.where(y == 0, -1, y)
        # Each matrix is converted on its own, so a dense kagg is handled even
        # when data is already sparse.
        if not isinstance(data, sparse.csr_matrix):
            data = sparse.csr_matrix(data)
        if not isinstance(kagg, sparse.csr_matrix):
            kagg = sparse.csr_matrix(kagg)

    # Accumulators for predictions
    data_oofs = np.zeros((data.shape[0]))
    kagg_oofs = np.zeros((kagg.shape[0]))

    graph_ids_unique = ids.graph_id.unique()

    def rows_of(graphs):
        # Positional row numbers of all rows belonging to the given graphs.
        return np.flatnonzero(ids.graph_id.isin(graphs).values)

    # One shared generator makes every shuffle differ while the whole run stays
    # reproducible; None leaves seeding to scikit-learn's default.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Models with early stopping need a validation set carved out of the train fold.
    needs_valid = is_named and model in ('xgb', 'lgb', 'nnet')

    for iter_num in range(iters_total):
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=rng)
        for train_graphs, test_graphs in kf.split(graph_ids_unique):
            fit_graph_ids = graph_ids_unique[train_graphs]
            test_ind = rows_of(graph_ids_unique[test_graphs])

            if needs_valid:
                # First split of an 8-fold KFold: 1/8 (12.5%) of the train-fold
                # graphs become the validation set.
                kf_valid = KFold(n_splits=8, shuffle=True, random_state=rng)
                fit_pos, valid_pos = next(kf_valid.split(fit_graph_ids))
                valid_ind = rows_of(fit_graph_ids[valid_pos])
                fit_graph_ids = fit_graph_ids[fit_pos]
            train_ind = rows_of(fit_graph_ids)

            if is_named and model == 'lgb':
                fold_pred, kagg_pred = lgb_pred (params, data[train_ind], y[train_ind], 
                                                         data[valid_ind], y[valid_ind],
                                                         data[test_ind], kagg)
            elif is_named and model == 'xgb':
                fold_pred, kagg_pred = xgb_pred (params, data[train_ind], y[train_ind], 
                                                         data[valid_ind], y[valid_ind],
                                                         data[test_ind], kagg)
            elif is_named and model == 'nnet':
                fold_pred, kagg_pred = nnet_pred(params, data[train_ind], y[train_ind], 
                                                         data[valid_ind], y[valid_ind],
                                                         data[test_ind], kagg)
            elif is_named and model == 'fastfm':
                fold_pred, kagg_pred = fastfm_pred(params, data[train_ind], y[train_ind], data[test_ind], kagg)

            # Block for working with sklearn models
            else:
                model.fit(data[train_ind], y[train_ind])
                fold_pred = _predict_scores(model, data[test_ind])
                kagg_pred = _predict_scores(model, kagg)

            data_oofs[test_ind] += fold_pred
            kagg_oofs += kagg_pred
            print ('fold loss: ', log_loss(y[test_ind], fold_pred))            

        # Every train row has been predicted once per finished repetition.
        print ('iteration OOF score:', log_loss(y, data_oofs/(iter_num+1)))
    data_oofs /= iters_total
    kagg_oofs /= (iters_total*n_splits)
    return data_oofs, kagg_oofs

# OOF predictions with LightGBM on TF-IDF transformed data.

In [4]:
%%time

# Low-rank dense view of the TF-IDF matrices. TruncatedSVD is used in place of
# PCA: it accepts the sparse matrices directly. Its default solver is randomized,
# so the seed is fixed to make the components repeatable between runs.
SVD_SEED = 42
pca = TruncatedSVD(n_components=20, random_state=SVD_SEED)
pca.fit(data)
data_pca = pca.transform(data)
kagg_pca = pca.transform(kagg)

# Empty dfs for oof predictions
data_preds = pd.DataFrame()
kagg_preds = pd.DataFrame()

# LightGBM hyperparameters
params = {
        'task': 'train','boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss',
        'feature_fraction': 0.95,
        'min_data_in_leaf': 10, 
        'bagging_freq': 3, 
        'min_gain_to_split': 0, 
        'lambda_l2': 1, 
        'learning_rate': 0.075, 
        'num_leaves': 128, 
        'bagging_fraction': 0.85}

# One OOF column per feature representation: (column name, train matrix, test matrix).
oof_runs = [
    ('lgb_tfidf_oof',    data,     kagg),
    ('lgb_tfidfpca_oof', data_pca, kagg_pca),
]

for oof_name, train_matrix, test_matrix in oof_runs:
    t_start = time.time()
    data_preds[oof_name], kagg_preds[oof_name] = get_oofs('lgb'
                                                        ,train_matrix
                                                        ,test_matrix
                                                        ,ytrain
                                                        ,ids
                                                        ,params=params)
    print ('{} is done in {} minutes \n'.format(oof_name, round((time.time()-t_start)/60,1) ))


data_preds.to_csv(os.path.join(DATA_PATH, 'train_lgb_tfidf_oof.csv'), index=False)
kagg_preds.to_csv(os.path.join(DATA_PATH, 'test_lgb_tfidf_oof.csv' ), index=False)

fold loss:  0.39505452836
fold loss:  0.391707470137
fold loss:  0.403296928625
fold loss:  0.394772659807
fold loss:  0.402759217594
iteration OOF score: 0.39760496311
fold loss:  0.39553976321
fold loss:  0.396333165703
fold loss:  0.408735197222
fold loss:  0.396774328361
fold loss:  0.394911306052
iteration OOF score: 0.391597208616
fold loss:  0.391230999644
fold loss:  0.398633593536
fold loss:  0.39557736232
fold loss:  0.398128801855
fold loss:  0.400803811999
iteration OOF score: 0.389326754038
lgb_tfidf_oof is done in 116.8 minutes 

fold loss:  0.560250481386
fold loss:  0.561786047197
fold loss:  0.555596095719
fold loss:  0.56354098972
fold loss:  0.575602435166
iteration OOF score: 0.563433610872
fold loss:  0.564527125784
fold loss:  0.565327783431
fold loss:  0.55716693628
fold loss:  0.558937258263
fold loss:  0.568529505479
iteration OOF score: 0.559943220869
fold loss:  0.558031928643
fold loss:  0.561895321244
fold loss:  0.559661966874
fold loss:  0.569648454457
fo

In [5]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Convolution1D, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam, SGD, Adadelta
from keras.layers.advanced_activations import PReLU
from keras.utils.np_utils import to_categorical
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization

def batch_generator(X, y, BATCH_SIZE, EPOCH_PARTION):
    """
    Endless generator of random (features, one-hot target) batches for nnet training.

    input:
        X - train dataset, numpy array or csr matrix
        y - target, numpy array; it is one-hot encoded into 2 classes
        BATCH_SIZE - int, number of objects in batch. If X is csr matrix, each batch
        is transformed to a dense array, so batch size must be small enough for this
        array to fit in memory
        EPOCH_PARTION - float, share of objects used for training in an epoch. It does
        not change what this generator yields (it never terminates); it is kept for
        interface compatibility, and the epoch length is set by the caller through the
        number of steps it requests.
    output:
        yields (X_batch, y_batch) tuples forever

    Important: in this implementation each batch is constructed from random objects
    from train, drawn with replacement.
    """
    n_rows = X.shape[0]
    is_csr = type(X).__name__ == 'csr_matrix'
    # The one-hot target does not depend on the batch, so it is encoded once
    # instead of re-encoding the whole target for every batch.
    y_onehot = to_categorical(y, num_classes=2)

    while True:
        batch_indexes = np.random.choice(n_rows, BATCH_SIZE)
        X_batch = X[batch_indexes]
        if is_csr:
            X_batch = X_batch.toarray()
        yield X_batch, y_onehot[batch_indexes]
            
def batch_generator_p(X, BATCH_SIZE):
    """
    Endless generator of consecutive feature batches for nnet predictions.

    Rows are yielded in their original order, BATCH_SIZE at a time (the last batch
    of a pass may be shorter). After the last batch the generator starts again from
    the first row.

    input:
        X - dataset to predict, numpy array or csr matrix
        BATCH_SIZE - number of objects in batch. If X is csr matrix, each batch is
        transformed to a dense array, so batch size must be small enough for this
        array to fit in memory
    output:
        yields dense X_batch arrays forever
    """
    n_rows = X.shape[0]
    # ceil(n_rows / BATCH_SIZE) in integer arithmetic; at least one batch so the
    # wrap-around below is well defined even for an empty X.
    number_of_batches = max(1, -(-n_rows // BATCH_SIZE))
    is_csr = type(X).__name__ == 'csr_matrix'

    batch_number = 0
    while True:
        start = BATCH_SIZE * batch_number
        X_batch = X[start:start + BATCH_SIZE]
        if is_csr:
            X_batch = X_batch.toarray()
        yield X_batch
        # Restart from the first row once a full pass has been produced.
        batch_number = (batch_number + 1) % number_of_batches
            
def nnet(data, n1, n2, d1, d2, regul, **kwargs):
    """
    Build and compile a simple two-hidden-layer network:
    Dense(n1) -> PReLU -> Dropout(d1) -> Dense(n2) -> PReLU -> Dropout(d2)
    -> Dense(2) -> softmax.

    input:
        data - numpy array or csr matrix for training; only data.shape[1] is used,
        as the input dimension
        n1, n2 - ints, number of neurons in first and second layers
        d1, d2 - float, dropouts in first and second layers
        regul - float, regularization parameter, the same for both layers. When > 0,
        each hidden Dense layer gets an l2 kernel regularizer and an l1 activity
        regularizer of this strength
        **kwargs - ignored; lets callers pass one params dict that also holds keys
        meant for other functions
    output:
        compiled nnet model (binary_crossentropy loss, 'adadelta' optimizer)
    """
    def hidden_layer_options():
        # Fresh regularizer objects for every layer, or no regularization at all.
        if regul > 0:
            return {'kernel_regularizer': l2(regul),
                    'activity_regularizer': l1(regul)}
        return {}

    model = Sequential()

    model.add(Dense(n1, input_dim=data.shape[1], **hidden_layer_options()))
    model.add(PReLU())
    model.add(Dropout(d1))

    model.add(Dense(n2, **hidden_layer_options()))
    model.add(PReLU())
    model.add(Dropout(d2))

    # Two output units, one per class; callers read column 1 as the positive class.
    model.add(Dense(2))
    model.add(Activation('softmax'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adadelta',
                  metrics=['binary_crossentropy'])
    return model

def nnet_pred(params, train, ytrain, valid, yvalid, test, kagg):
    """
    Scale the features, train the nnet with early stopping and score two matrices.

    input:
        params - dict with the nnet() arguments (n1, n2, d1, d2, regul) and,
        optionally, 'BATCH_SIZE' (default 256) and 'EPOCH_PARTION' (default 0.75)
        train, ytrain - features and target the network is fitted on
        valid, yvalid - features and target used for early stopping / checkpointing
        test - held-out fold features to predict
        kagg - test-set features to predict
    output:
        (fold_pred, kagg_pred) - second output column (class 1) of the network for
        `test` and `kagg`

    Side effect: the best weights are written to and re-read from 'nnet_test.hdf5'
    in the working directory.
    """
    # MaxAbsScaler is used for csr input, MinMaxScaler for everything else.
    if type(train).__name__ == 'csr_matrix':
        scaler = MaxAbsScaler()
    else:
        scaler = MinMaxScaler()

    # The scaler is fitted on the train fold only and then applied everywhere.
    train = scaler.fit_transform(train)
    valid = scaler.transform(valid)
    test  = scaler.transform(test)
    kagg  = scaler.transform(kagg)

    model = nnet(train, **params)
    early_stopper = EarlyStopping(monitor='val_binary_crossentropy', patience=10, verbose=0, mode='auto')
    checkpoint = ModelCheckpoint(filepath='nnet_test.hdf5', monitor='val_binary_crossentropy', save_best_only=True)

    BATCH_SIZE = params.get('BATCH_SIZE', 256)
    EPOCH_PARTION = params.get('EPOCH_PARTION', 0.75)

    # Step counts are numbers of batches; at least one so tiny folds still train/validate.
    steps_per_epoch = max(1, int(train.shape[0]/BATCH_SIZE*EPOCH_PARTION))
    validation_steps = max(1, int(valid.shape[0]/BATCH_SIZE))

    # Keras 2 argument names throughout (steps_per_epoch / epochs / validation_steps)
    # instead of relying on the deprecated Keras 1 aliases.
    model.fit_generator(generator=batch_generator(train, ytrain, BATCH_SIZE, EPOCH_PARTION),
                        steps_per_epoch=steps_per_epoch,
                        epochs=1000,
                        verbose=0,
                        validation_data=batch_generator(valid, yvalid, BATCH_SIZE, EPOCH_PARTION),
                        validation_steps=validation_steps,
                        callbacks=[early_stopper, checkpoint])

    # Restore the best checkpoint saved during training before predicting.
    model.load_weights('nnet_test.hdf5') 
    model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['binary_crossentropy'])

    def predict_in_batches(matrix):
        # Exactly enough batches to cover every row once, as an integer.
        steps = int(np.ceil(matrix.shape[0] / float(BATCH_SIZE)))
        return model.predict_generator(generator=batch_generator_p(matrix, BATCH_SIZE),
                                       steps=steps)

    fold_pred = predict_in_batches(test)
    kagg_pred = predict_in_batches(kagg)
    return fold_pred[:,1], kagg_pred[:,1]

Using Theano backend.


In [6]:
# Fresh frames, so the CSVs written by this cell hold only the neural-net column.
data_preds = pd.DataFrame()
kagg_preds = pd.DataFrame()

# n1, n2, d1, d2 and regul are read by nnet(); BATCH_SIZE and EPOCH_PARTION by nnet_pred().
params = {
    'n1': 512,
    'n2': 256,
    'd1': 0.5,
    'd2': 0.25,
    'regul': 1e-9,
    'BATCH_SIZE': 512,
    'EPOCH_PARTION': 0.75,
}

t_start = time.time()
nnet_train_oof, nnet_test_oof = get_oofs('nnet', data, kagg, ytrain, ids, params=params)
data_preds['nnet_tfidf_oof'] = nnet_train_oof
kagg_preds['nnet_tfidf_oof'] = nnet_test_oof
elapsed_minutes = round((time.time()-t_start)/60,1)
print ('nnet_tfidf_oof is done in {} minutes \n'.format(elapsed_minutes))

train_csv = os.path.join(DATA_PATH, 'train_nnet_tfidf_oof.csv')
test_csv = os.path.join(DATA_PATH, 'test_nnet_tfidf_oof.csv' )
data_preds.to_csv(train_csv, index=False)
kagg_preds.to_csv(test_csv, index=False)

fold loss:  0.422063976781
fold loss:  0.446170907029
fold loss:  0.427456399184
fold loss:  0.429504116896
fold loss:  0.446972422033
iteration OOF score: 0.434428633895
fold loss:  0.423245459602
fold loss:  0.454989037373
fold loss:  0.436115124954
fold loss:  0.441851586837
fold loss:  0.430626961851
iteration OOF score: 0.41762264606
fold loss:  0.426923811564
fold loss:  0.458325888393
fold loss:  0.422328773042
fold loss:  0.432564368697
fold loss:  0.432415393343
iteration OOF score: 0.412751694357
nnet_tfidf_oof is done in 2336.2 minutes 



# OOF predictions with factorization machines on TF-IDF transformed data.

In [7]:
%%time
# Fresh frames, so the CSVs written by this cell hold only the factorization-machine column.
data_preds = pd.DataFrame()
kagg_preds = pd.DataFrame()

# Keyword arguments that fastfm_pred forwards unchanged to FMClassification.
params = {
    'n_iter': 5000,
    'init_stdev': 5e-3,
    'step_size': 5e-3,
    'l2_reg_w': 10,
    'l2_reg': 10,
}
fm_train_oof, fm_test_oof = get_oofs('fastfm', data, kagg, ytrain, ids, params=params)
data_preds['FM_oof'] = fm_train_oof
kagg_preds['FM_oof'] = fm_test_oof

train_csv = os.path.join(DATA_PATH, 'train_fm_oof.csv')
test_csv = os.path.join(DATA_PATH, 'test_fm_oof.csv' )
data_preds.to_csv(train_csv, index=False)
kagg_preds.to_csv(test_csv, index=False)

fold loss:  0.65599898572
fold loss:  0.647734455597
fold loss:  0.650891742555
fold loss:  0.698675414611
fold loss:  0.664693375386
iteration OOF score: 0.66414823069
fold loss:  0.674839754699
fold loss:  0.655822259507
fold loss:  0.666759726946
fold loss:  0.66029638867
fold loss:  0.651864755985
iteration OOF score: 0.662976429274
fold loss:  0.654551261602
fold loss:  0.671557997558
fold loss:  0.665547201307
fold loss:  0.666611129147
fold loss:  0.65511445235
iteration OOF score: 0.662868041762
CPU times: user 2min 41s, sys: 23.2 s, total: 3min 4s
Wall time: 3min 5s


# END