In [1]:
import os
import time 
import warnings
warnings.filterwarnings('ignore')

import nltk
import re
import pickle

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set_palette('Set1', 10, desat=0.75)

import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 200)

import scipy.sparse as sparse
from sklearn.decomposition import TruncatedSVD

DATA_PATH = 'data/'

Populating the interactive namespace from numpy and matplotlib


In [17]:
%%time
def load_sparse_csr(filename):
    """Load a scipy CSR sparse matrix from an ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr`` and
    ``shape`` (the components of a CSR matrix).

    input:
        filename: path to the ``.npz`` file.
    output:
        scipy.sparse.csr_matrix rebuilt from the stored components.
    """
    # `csr_matrix` is not imported by name in this notebook; go through the
    # `sparse` alias so the function works on a fresh kernel.
    # The context manager closes the underlying archive once the arrays are read.
    with np.load(filename) as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return sparse.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=shape)

# Every input below is cut to its first 100000 rows.
target_csv = os.path.join(DATA_PATH, 'final/target.csv')
ids_csv = os.path.join(DATA_PATH, 'final/data_ids.csv')
ytrain = pd.read_csv(target_csv).target[:100000]
ids = pd.read_csv(ids_csv, usecols=['graph_id'])[:100000]

# TF-IDF feature matrices stored as CSR components in .npz archives.
data_npz = os.path.join(DATA_PATH, 'data_tfidf_stem_tags.npz')
kagg_npz = os.path.join(DATA_PATH, 'kagg_tfidf_stem_tags.npz')
data = load_sparse_csr(data_npz)[:100000]
kagg = load_sparse_csr(kagg_npz)[:100000]


CPU times: user 2.66 s, sys: 992 ms, total: 3.66 s
Wall time: 3.88 s


In [3]:
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

import lightgbm as lgb
from fastFM.sgd import FMClassification

def lgb_pred(params, train, ytrain, valid, yvalid, test, kagg):
    """Train a LightGBM booster with early stopping and score two datasets.

    input:
        params: dictionary of LightGBM parameters passed to ``lgb.train``.
        train, ytrain: features and target used for fitting.
        valid, yvalid: features and target of the validation set.
        test: features of the held-out fold to predict.
        kagg: features of the test data to predict.
    output:
        (fold_pred, kagg_pred): predictions for ``test`` and ``kagg`` made at
        the booster's best iteration.
    """
    dtrain = lgb.Dataset(train, ytrain)
    dvalid = lgb.Dataset(valid, yvalid, reference=dtrain)

    # Up to 100000 rounds are allowed; early stopping (patience of 20 rounds)
    # is what actually ends training.
    booster = lgb.train(
        params,
        dtrain,
        num_boost_round=100000,
        valid_sets=[dtrain, dvalid],
        verbose_eval=False,
        early_stopping_rounds=20,
    )

    best_iter = booster.best_iteration
    return (booster.predict(test, num_iteration=best_iter),
            booster.predict(kagg, num_iteration=best_iter))

def fastfm_pred(params, train, ytrain, test, kagg):
    """Fit a fastFM SGD classifier and return its probabilities for two datasets.

    input:
        params: dictionary of keyword arguments for ``FMClassification``.
        train, ytrain: features and target used for fitting.
        test: features of the held-out fold to predict.
        kagg: features of the test data to predict.
    output:
        (fold_pred, kagg_pred): ``predict_proba`` output for ``test`` and ``kagg``.
    """
    classifier = FMClassification(**params)
    classifier.fit(train, ytrain)
    return classifier.predict_proba(test), classifier.predict_proba(kagg)
        
def _positive_class_scores(model, features):
    """Return positive-class scores from a fitted sklearn-style estimator.

    Uses ``predict_proba`` when the estimator can serve it (second column of a
    2-D result, or the result itself when it is 1-D) and falls back to
    ``predict`` otherwise.
    """
    predict_proba = getattr(model, 'predict_proba', None)
    if predict_proba is not None:
        try:
            proba = np.asarray(predict_proba(features))
        except (AttributeError, NotImplementedError):
            # The estimator exposes predict_proba but cannot serve it.
            proba = None
        if proba is not None:
            return proba[:, 1] if proba.ndim == 2 else proba
    return model.predict(features)


def _rows_of_graphs(ids, graphs):
    """Return positional row numbers of `ids` whose graph_id is in `graphs`.

    Positions (not index labels) are returned so that they can safely index
    numpy arrays and sparse matrices even when `ids` has a non-default index.
    """
    return np.flatnonzero(ids.graph_id.isin(graphs).values)


def get_oofs(model, data, kagg, y, ids, n_splits=5, iters_total=3, params=None,
             random_state=None):
    """
    Compute blended out-of-fold (OOF) predictions for train data and averaged
    predictions for test data, splitting folds by graph id.

    input:
        model: string ('lgb'/'fastfm'/'xgb'/'nnet') or any sklearn-style model
            with fit and predict_proba/predict methods.
        data: train data as pandas DataFrame / numpy array / csr sparse matrix. All columns are
            used as features. Must not contain target, ids or non-numeric columns.
        kagg: test data as pandas DataFrame / numpy array / csr sparse matrix. All columns are
            used as features. Must not contain target, ids or non-numeric columns.
        y: target for train data as pandas DataFrame / pandas Series / numpy array.
            For 'fastfm' zeros are replaced with -1 before fitting.
        ids: pandas DataFrame with a `graph_id` column, one row per row of `data`. Graph ids are
            used to split train data so that questions of the same graph are always in one fold.
        n_splits: number of folds. Default value is 5. In order to get one OOF prediction,
            the model is fitted n_splits times.
        iters_total: number of repetitions of the whole K-fold procedure. Default value is 3.
            The repetitions are averaged. Total number of fits is n_splits*iters_total.
        params: model parameters as a dictionary (used by the string models only).
        random_state: optional seed (int) or numpy RandomState used for all fold shuffling.
            Default None keeps the previous behaviour (numpy's global random state).
    output:
        data_oofs: numpy array of OOF predictions for train data, averaged over iters_total runs.
        kagg_oofs: numpy array of predictions for test data, averaged over
            n_splits*iters_total fitted models.
    raises:
        ValueError: for an unknown model string or when `ids`/`y` do not have one row
            per row of `data`.
    """
    validated_models = ('lgb', 'xgb', 'nnet')
    model_name = model if isinstance(model, str) else None
    if model_name is not None and model_name not in validated_models + ('fastfm',):
        raise ValueError("Unknown model name: {!r}".format(model))

    if isinstance(data, pd.DataFrame):
        data = data.values
    if isinstance(kagg, pd.DataFrame):
        kagg = kagg.values
    if isinstance(y, (pd.Series, pd.DataFrame)):
        y = y.values
    else:
        y = np.asarray(y)

    if model_name == 'fastfm':
        # The labels are recoded from {0, 1} to {-1, 1}; works for numpy input as well.
        y = np.where(y == 0, -1, y)
        # Convert each matrix on its own: one being CSR says nothing about the other.
        if not isinstance(data, sparse.csr_matrix):
            data = sparse.csr_matrix(data)
        if not isinstance(kagg, sparse.csr_matrix):
            kagg = sparse.csr_matrix(kagg)

    n_rows = data.shape[0]
    if len(ids) != n_rows or len(y) != n_rows:
        raise ValueError("data, y and ids must have the same number of rows")

    # One shared RandomState makes every shuffle reproducible while still
    # giving different splits on each iteration. None -> numpy global state.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Accumulators for predictions
    data_oofs = np.zeros((n_rows))
    kagg_oofs = np.zeros((kagg.shape[0]))

    graph_ids_unique = ids.graph_id.unique()

    for iter_num in range(iters_total):
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=rng)
        for train_graphs, test_graphs in kf.split(graph_ids_unique):
            graph_ids_train = graph_ids_unique[train_graphs]
            train_ind = _rows_of_graphs(ids, graph_ids_train)
            test_ind = _rows_of_graphs(ids, graph_ids_unique[test_graphs])

            # Models with early stopping need a validation set carved out of the
            # train fold: the first of 8 graph-level splits, i.e. 12.5% of the graphs.
            if model_name in validated_models:
                kf_valid = KFold(n_splits=8, shuffle=True, random_state=rng)
                fit_graphs, valid_graphs = next(kf_valid.split(graph_ids_train))
                train_ind = _rows_of_graphs(ids, graph_ids_train[fit_graphs])
                valid_ind = _rows_of_graphs(ids, graph_ids_train[valid_graphs])

            if model_name == 'lgb':
                fold_pred, kagg_pred = lgb_pred(params, data[train_ind], y[train_ind],
                                                data[valid_ind], y[valid_ind],
                                                data[test_ind], kagg)
            elif model_name == 'xgb':
                fold_pred, kagg_pred = xgb_pred(params, data[train_ind], y[train_ind],
                                                data[valid_ind], y[valid_ind],
                                                data[test_ind], kagg)
            elif model_name == 'nnet':
                fold_pred, kagg_pred = nnet_pred(params, data[train_ind], y[train_ind],
                                                 data[valid_ind], y[valid_ind],
                                                 data[test_ind], kagg)
            elif model_name == 'fastfm':
                fold_pred, kagg_pred = fastfm_pred(params, data[train_ind], y[train_ind],
                                                   data[test_ind], kagg)
            else:
                # sklearn-style estimator
                model.fit(data[train_ind], y[train_ind])
                fold_pred = _positive_class_scores(model, data[test_ind])
                kagg_pred = _positive_class_scores(model, kagg)

            data_oofs[test_ind] += fold_pred
            kagg_oofs += kagg_pred
            print ('fold loss: ', log_loss(y[test_ind], fold_pred))

        # Running average of the OOF predictions accumulated so far.
        print ('iteration OOF score:', log_loss(y, data_oofs/(iter_num+1)))
    data_oofs /= iters_total
    kagg_oofs /= (iters_total*n_splits)
    return data_oofs, kagg_oofs

# OOF predictions with LightGBM on TF-IDF transformed data.

In [19]:
%%time

# Low-rank (20 component) truncated-SVD projections of the TF-IDF matrices.
# The decomposition is fitted on the train matrix only and applied to both.
pca = TruncatedSVD(n_components=20)
pca.fit(data)
data_pca = pca.transform(data)
kagg_pca = pca.transform(kagg)

# Empty dfs for OOF predictions: one column per feature set
data_preds = pd.DataFrame()
kagg_preds = pd.DataFrame()

# LightGBM hyperparameters
params = {
        'task': 'train','boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss',
        'feature_fraction': 0.95,
        'min_data_in_leaf': 10, 
        'bagging_freq': 3, 
        'min_gain_to_split': 0, 
        'lambda_l2': 1, 
        'learning_rate': 0.075, 
        'num_leaves': 128, 
        'bagging_fraction': 0.85}

# (column name, train matrix, test matrix) for every feature set to run.
# The original call had a stray double comma after `ytrain` (a SyntaxError);
# a single loop replaces the two copy-pasted calls.
feature_sets = [
    ('tfidf_oof', data, kagg),
    ('tfidf_pca_oof', data_pca, kagg_pca),
]

for oof_name, train_matrix, kagg_matrix in feature_sets:
    t_start = time.time()
    data_preds[oof_name], kagg_preds[oof_name] = get_oofs('lgb',
                                                          train_matrix,
                                                          kagg_matrix,
                                                          ytrain,
                                                          ids,
                                                          params=params)
    print ('{} is done in {} minutes \n'.format(oof_name, round((time.time()-t_start)/60,1) ))


data_preds.to_csv(os.path.join(DATA_PATH, 'data_tfidf_oof.csv'), index=False)
kagg_preds.to_csv(os.path.join(DATA_PATH, 'kagg_tfidf_oof.csv'), index=False)

fold loss:  0.439412958122
fold loss:  0.446161681867
fold loss:  0.438687438293
fold loss:  0.440298329498
fold loss:  0.434612715998
iteration OOF score: 0.439872980869
fold loss:  0.43555397353
fold loss:  0.458551952053
fold loss:  0.43650222986
fold loss:  0.42941386793
fold loss:  0.454388462144
iteration OOF score: 0.435669698071
fold loss:  0.429751625751
fold loss:  0.452101395873
fold loss:  0.451927398582
fold loss:  0.439180205335
fold loss:  0.437623624326
iteration OOF score: 0.43348176463
tfidf_oof is done in 15.7 minutes 

fold loss:  0.580043700385
fold loss:  0.576181443632
fold loss:  0.577053242991
fold loss:  0.575489394876
fold loss:  0.57833468949
iteration OOF score: 0.577433531727
fold loss:  0.573240920204
fold loss:  0.591257770222
fold loss:  0.576375452266
fold loss:  0.578357423251
fold loss:  0.569795616364
iteration OOF score: 0.573748777017
fold loss:  0.575027725913
fold loss:  0.575310008771
fold loss:  0.574980728799
fold loss:  0.579712464421
fold l

FileNotFoundError: [Errno 2] No such file or directory: 'data/other/data_tfidf_oof.csv'

# OOF predictions with factorization machines on TF-IDF transformed data.

In [20]:
%%time
# Fresh frames for the factorization-machine predictions.
data_preds, kagg_preds = pd.DataFrame(), pd.DataFrame()

# fastFM SGD classifier settings, forwarded to FMClassification by get_oofs.
params = dict(n_iter=5000, init_stdev=5e-3, step_size=5e-3, l2_reg_w=10, l2_reg=10)

fm_data_oof, fm_kagg_oof = get_oofs('fastfm', data, kagg, ytrain, ids, params=params)
data_preds['FM_oof'] = fm_data_oof
kagg_preds['FM_oof'] = fm_kagg_oof

fm_data_csv = os.path.join(DATA_PATH, 'data_fm_oof.csv')
fm_kagg_csv = os.path.join(DATA_PATH, 'kagg_fm_oof.csv')
data_preds.to_csv(fm_data_csv, index=False)
kagg_preds.to_csv(fm_kagg_csv, index=False)

fold loss:  0.659272003041
fold loss:  0.664246322973
fold loss:  0.664770210204
fold loss:  0.67232671434
fold loss:  0.667665502412
iteration OOF score: 0.665678150633
fold loss:  0.66759054704
fold loss:  0.663768711036
fold loss:  0.667166460172
fold loss:  0.667253302442
fold loss:  0.662739979867
iteration OOF score: 0.665593927196
fold loss:  0.65552871689
fold loss:  0.665540087076
fold loss:  0.663973001235
fold loss:  0.667548895915
fold loss:  0.66967392152
iteration OOF score: 0.665183570757
CPU times: user 7.62 s, sys: 1.17 s, total: 8.78 s
Wall time: 8.83 s


# END