In [1]:
# Directory holding the input files read below (CSV / TSV / NPZ / NPY).
data_dir = './data/mlboot_dataset/'
# NOTE(review): this value is overwritten with 'binary_lgbm' in the final save
# cell before it is ever read.
model_name = 'lgbm_st'
# Directory the saved model predictions (.npy) are loaded from and the
# submission files are written to.
results_dir = './results/'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp
import gc

In [2]:
# Tabular features for every user. Labels are left-joined, so rows without a
# label (the rows to predict) end up with target == NaN.
df = pd.read_csv(data_dir + 'preprocessed_new.csv')
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

# Row labels of the labelled (train) and unlabelled (test) parts of df.
df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

# Three sparse matrices, reduced to presence/absence flags.
# The builtin `bool` replaces the `np.bool` alias (deprecated in NumPy 1.20,
# removed in 1.24); the resulting dtype is the same.
# NOTE(review): assumes the matrix rows are in the same order as the rows of
# df -- confirm against the preprocessing step.
mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil().astype(bool)
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil().astype(bool)
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil().astype(bool)
print(mat1.shape, mat2.shape, mat3.shape)

# Per-row flag: True when the row has at least one True entry in the matrix.
# Flattened to 1-D so the column is assigned a plain vector rather than an
# (n, 1) np.matrix.
df['max_f1'] = mat1.tocsr().max(axis=1).toarray().ravel()
df['max_f2'] = mat2.tocsr().max(axis=1).toarray().ravel()
df['max_f3'] = mat3.tocsr().max(axis=1).toarray().ravel()

# Temporary train/test row slices, used only to count non-zeros per column.
train_mat1 = mat1[df_train_index.tolist()]
test_mat1 = mat1[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]

# Keep a column only if it is set in more than `limit` train rows AND in at
# least one test row.
limit = 10
mat1 = mat1.tocsc()[:, np.where((train_mat1.getnnz(axis=0) > limit) & (test_mat1.getnnz(axis=0) > 0))[0]].tocsr()
mat2 = mat2.tocsc()[:, np.where((train_mat2.getnnz(axis=0) > limit) & (test_mat2.getnnz(axis=0) > 0))[0]].tocsr()
mat3 = mat3.tocsc()[:, np.where((train_mat3.getnnz(axis=0) > limit) & (test_mat3.getnnz(axis=0) > 0))[0]].tocsr()

(609018, 2053602) (609018, 2812610) (609018, 1057788)


In [3]:
# Shapes after the column filter: same number of rows, far fewer columns.
print(mat1.shape, mat2.shape, mat3.shape)

(609018, 204775) (609018, 20268) (609018, 10296)


In [5]:
# Debug peek: densify the first row and mask it with itself, which leaves only
# its True entries (one per non-zero in that row).
# NOTE(review): needs `mat1`, which a later cell deletes; the execution counter
# shows this cell was run out of order.
a = mat1[0].todense()
a[a]

matrix([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True]], dtype=bool)

In [4]:
# Dense array loaded from pca_cat10.npy, indexed like df so it can be
# concatenated column-wise.
# NOTE(review): assumes one row per row of df, in the same order -- confirm.
data_svd = pd.DataFrame(np.load(data_dir + 'pca_cat10.npy'), index=df.index)

In [5]:
# Label the ten components svd_description_1 .. svd_description_10 and append
# them to the feature frame; the standalone frame is then dropped.
data_svd.columns = ['svd_description_' + str(component) for component in range(1, 11)]
df = pd.concat([df, data_svd], axis=1)
del data_svd

In [6]:
# Re-slice the column-filtered matrices into labelled (train) and unlabelled
# (test) rows, stored as int8.
train_mat1 = mat1[df_train_index.tolist()].astype(np.int8)
test_mat1 = mat1[df_test_index.tolist()].astype(np.int8)
train_mat2 = mat2[df_train_index.tolist()].astype(np.int8)
test_mat2 = mat2[df_test_index.tolist()].astype(np.int8)
train_mat3 = mat3[df_train_index.tolist()].astype(np.int8)
test_mat3 = mat3[df_test_index.tolist()].astype(np.int8)

# Only the train/test slices are used from here on; release the full matrices.
del mat1,mat2,mat3
gc.collect()

3

In [7]:
# Labelled rows become the training frame and unlabelled rows the test frame.
# Both are re-indexed from 0 so positional fold indices can be used as labels.
X = df[df.target.notnull()].reset_index(drop=True)
x_te = df[df.target.isnull()].reset_index(drop=True)

In [10]:
# Inspect the available feature names.
df.columns

Index(['uid', 'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days', 'sum_values_f1_max',
       'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
       'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
       'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
       'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
       'mean_day_cntr', 'nuniq_keys_f1_cat0', 'nuniq_keys_f2_cat0',
       'nuniq_keys_f3_cat0', 'nuniq_keys_f1_cat1', 'nuniq_keys_f2_cat1',
       'nuniq_keys_f3_cat1', 'nuniq_keys_f1_cat2', 'nuniq_keys_f2_cat2',
       'nuniq_keys_f3_cat2', 'nuniq_keys_f1_cat3', 'nuniq_keys_f2_cat3',
       'nuniq_keys_f3_cat3', 'nuniq_keys_f1_cat4', 'nuniq_keys_f2_cat4',
       'nuniq_keys_f3_cat4', 'nuniq_keys_f1_cat5', 'nuniq_keys_f2_cat5',
       'nuniq_keys_f3_cat5', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
       'nuniq_key

In [11]:
from sklearn.preprocessing import minmax_scale
# Attach the min-max scaled predictions stored in test_nn_base_model.npy to the
# test frame as column 'nnet'. They are keyed through the id file, so they are
# aligned by uid rather than by position.
# NOTE(review): not idempotent -- re-running adds suffixed duplicate columns.
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub = sample_sub.assign(target=minmax_scale(np.load(results_dir + 'test_nn_base_model.npy')))
sample_sub.columns = ['uid','nnet']
x_te = pd.merge(x_te, sample_sub, how='left', on='uid')

In [12]:
# Same pattern for test_baseline_sparse_10folds.npy, attached as column 'lgbm1'.
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub = sample_sub.assign(target=minmax_scale(np.load(results_dir + 'test_baseline_sparse_10folds.npy')))
sample_sub.columns = ['uid','lgbm1']
x_te = pd.merge(x_te, sample_sub, how='left', on='uid')

In [13]:
# Same pattern for test_all_in_focal_loss.npy, attached as column 'nnet2'.
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub = sample_sub.assign(target=minmax_scale(np.load(results_dir + 'test_all_in_focal_loss.npy')))
sample_sub.columns = ['uid','nnet2']
x_te = pd.merge(x_te, sample_sub, how='left', on='uid')

In [14]:
# Remaining stacked test predictions, attached in this order.
# Each entry is (prediction file under results_dir, resulting column name).
for pred_file, feature_name in [
    ('test_focal_loss_m3.npy', 'nnet3'),
    ('test_focal_loss_m1.npy', 'nnet1'),
    ('test_ftrl.npy', 'ftrl'),
    ('test_nn_advanced_model.npy', 'nnet4'),
    ('test_nn_advanced_model_catpca.npy', 'nnet5'),
    ('test_nn_advanced_model_3br.npy', 'nnet6'),
]:
    # The id file is re-read each time and the min-max scaled predictions are
    # attached to it, so the merge aligns them with x_te by uid.
    sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
    sample_sub['target'] = minmax_scale(np.load(results_dir + pred_file))
    sample_sub.columns = ['uid', feature_name]
    x_te = x_te.merge(sample_sub, on='uid', how='left')

In [15]:
# Train-side counterparts of the stacked predictions, min-max scaled and
# assigned positionally.
# NOTE(review): assumes every saved train array is in the row order of X.
# Each entry is (column name, prediction file under results_dir).
for feature_name, pred_file in [
    ('lgbm1', 'train_baseline_sparse_10folds.npy'),
    ('nnet', 'train_nn_base_model.npy'),
    ('nnet2', 'train_all_in_focal_loss.npy'),
    ('nnet1', 'train_focal_loss_m1.npy'),
    ('nnet3', 'train_focal_loss_m3.npy'),
    ('nnet4', 'train_nn_advanced_model.npy'),
    ('nnet5', 'train_nn_advanced_model_catpca.npy'),
    ('nnet6', 'train_nn_advanced_model_3br.npy'),
    ('ftrl', 'train_ftrl.npy'),
]:
    X[feature_name] = minmax_scale(np.load(results_dir + pred_file))

In [16]:
# Pairwise correlation between the stacked predictions on the test rows.
x_te[['nnet','lgbm1','nnet2','nnet1','nnet3','nnet4','nnet5','nnet6','ftrl']].corr()

Unnamed: 0,nnet,lgbm1,nnet2,nnet1,nnet3,nnet4,nnet5,nnet6,ftrl
nnet,1.0,0.681589,0.75349,0.488332,0.330658,0.615233,0.632314,0.620995,0.38929
lgbm1,0.681589,1.0,0.593723,0.536187,0.342257,0.661794,0.685686,0.730914,0.318644
nnet2,0.75349,0.593723,1.0,0.582909,0.353475,0.615273,0.622576,0.501894,0.329572
nnet1,0.488332,0.536187,0.582909,1.0,0.488963,0.697361,0.685702,0.429881,0.260184
nnet3,0.330658,0.342257,0.353475,0.488963,1.0,0.541269,0.47747,0.262547,0.199443
nnet4,0.615233,0.661794,0.615273,0.697361,0.541269,1.0,0.942206,0.624456,0.31154
nnet5,0.632314,0.685686,0.622576,0.685702,0.47747,0.942206,1.0,0.644069,0.313207
nnet6,0.620995,0.730914,0.501894,0.429881,0.262547,0.624456,0.644069,1.0,0.323725
ftrl,0.38929,0.318644,0.329572,0.260184,0.199443,0.31154,0.313207,0.323725,1.0


In [17]:
# Same correlation matrix on the train rows, with the target included.
X[['lgbm1','nnet','nnet2','nnet1','nnet3','nnet4','nnet5','nnet6','ftrl','target']].corr()

Unnamed: 0,lgbm1,nnet,nnet2,nnet1,nnet3,nnet4,nnet5,nnet6,ftrl,target
lgbm1,1.0,0.66497,0.570985,0.494446,0.295207,0.639325,0.649844,0.749904,0.574672,0.170409
nnet,0.66497,1.0,0.659126,0.460005,0.289962,0.617057,0.621315,0.640988,0.616108,0.130658
nnet2,0.570985,0.659126,1.0,0.548694,0.312146,0.627571,0.621176,0.527951,0.531092,0.111062
nnet1,0.494446,0.460005,0.548694,1.0,0.487426,0.694869,0.676564,0.438476,0.461213,0.09123
nnet3,0.295207,0.289962,0.312146,0.487426,1.0,0.419134,0.409703,0.274057,0.316298,0.049012
nnet4,0.639325,0.617057,0.627571,0.694869,0.419134,1.0,0.923874,0.603938,0.566767,0.120073
nnet5,0.649844,0.621315,0.621176,0.676564,0.409703,0.923874,1.0,0.606204,0.565838,0.122909
nnet6,0.749904,0.640988,0.527951,0.438476,0.274057,0.603938,0.606204,1.0,0.603479,0.141818
ftrl,0.574672,0.616108,0.531092,0.461213,0.316298,0.566767,0.565838,0.603479,1.0,0.102134
target,0.170409,0.130658,0.111062,0.09123,0.049012,0.120073,0.122909,0.141818,0.102134,1.0


In [18]:
# Train-set ROC AUC of every stacked prediction column, one value per line.
stacked_columns = ['lgbm1','nnet','nnet2','nnet1','nnet3','nnet4','nnet5','ftrl','nnet6']
for f in stacked_columns:
    column_auc = roc_auc_score(X.target, X[f])
    print(column_auc)

0.682562603224
0.648537598744
0.649222662589
0.623698784984
0.574520607881
0.647936920552
0.648281500429
0.628310959446
0.658849449887


In [19]:
# Ridge predictions for train and test, assigned positionally and unscaled.
# NOTE(review): the other test predictions are aligned via mlboot_test.tsv and
# a merge on uid, while these are written straight into x_te by position --
# confirm the test_ridge*.npy files are stored in x_te row order.
for ridge_col, train_file, test_file in [
    ('ridge1', 'train_ridge1.npy', 'test_ridge1.npy'),
    ('ridge2', 'train_ridge2.npy', 'test_ridge2.npy'),
    ('ridge3', 'train_ridge3.npy', 'test_ridge3.npy'),
]:
    X[ridge_col] = np.load(results_dir + train_file)
    x_te[ridge_col] = np.load(results_dir + test_file)

In [29]:
# NOTE(review): `X_te` is first assigned inside the cross-validation loop
# further down, so this cell raises NameError on a fresh top-to-bottom run;
# the output shown here comes from out-of-order execution.
X_te

<181024x11850 sparse matrix of type '<class 'numpy.float32'>'
	with 96203785 stored elements in Compressed Sparse Row format>

In [8]:
import scipy.sparse as sp
from sklearn.feature_selection import SelectPercentile
def save_submit(model_name, folds, y_pred):
    """Write test predictions as a header-less, single-column CSV.

    The predictions are paired with the uids of the module-level ``x_te``
    frame, renamed to ``cuid`` and left-joined onto ``mlboot_test.tsv`` so
    the output follows that file's row order.

    Parameters
    ----------
    model_name : str
        Prefix of the output file name.
    folds : int
        Number placed before ``'folds.csv'`` in the output file name.
    y_pred : array-like
        One prediction per row of ``x_te``, in ``x_te`` row order.

    Writes ``results_dir + model_name + '_' + str(folds) + 'folds.csv'``.
    Returns None.
    """
    predictions = x_te[['uid','target']].copy()
    predictions['target'] = y_pred
    predictions.columns = ['cuid','target']

    ordered = pd.read_table(data_dir+'mlboot_test.tsv').merge(predictions, on='cuid', how='left')
    out_path = results_dir + model_name + '_' + str(folds) + 'folds.csv'
    ordered[['target']].to_csv(out_path, header=False, index=False)

    # Release the temporary frames straight away.
    del predictions, ordered
    gc.collect()
    
def mean_encode_test(df, y, test,k,column):
    """Smoothed target-mean encoding of ``test[column]`` learned from ``df``/``y``.

    Every category is encoded as
    ``(category_mean * n + k * prior) / (n + k)``, where ``n`` is the number
    of rows of ``df`` in that category, ``category_mean`` is the mean of ``y``
    over those rows and ``prior`` is the mean of ``y`` overall.

    Categories in ``test`` that never occur in ``df`` receive the prior.
    Previously they came out as NaN because the left join found no statistics
    for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``; it is not modified.
    y : pandas.Series or array-like
        Target values for the rows of ``df``. A Series is aligned on the
        index of ``df``.
    test : pandas.DataFrame
        Frame whose ``column`` values are encoded; it is not modified.
    k : float
        Smoothing strength, i.e. the weight of the prior in pseudo-rows.
    column : str
        Name of the categorical column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test), 1)`` in the row order of ``test``.
    """
    prior = np.mean(y)

    # Work on a fresh two-column frame so the caller's df gets no extra column.
    labelled = df[[column]].assign(target=y)
    grouped = labelled.groupby(column)['target']
    stats = pd.DataFrame({
        'target_mean': grouped.mean(),
        'counts': grouped.size(),
    }).reset_index()

    # A left merge keeps the row order of `test`; unseen categories get NaN.
    merged = test[[column]].merge(stats, on=column, how='left')
    encoded = (merged['target_mean'] * merged['counts'] + k * prior) / (merged['counts'] + k)

    # No statistics available -> fall back to the global prior.
    encoded = encoded.fillna(prior)
    return encoded.values.reshape(-1, 1)

def mean_encode_self(df, y, kf, k, column):
    """Out-of-fold smoothed target-mean encoding of ``df[column]``.

    For every ``(dev_index, val_index)`` pair in ``kf``, the rows at
    ``val_index`` are encoded with statistics computed on the rows at
    ``dev_index`` only:
    ``(category_mean * n + k * prior) / (n + k)``.
    ``prior`` is the mean of ``y`` over all rows; it is not recomputed per
    fold.

    Validation categories missing from the matching dev part receive the
    prior. Previously they came out as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``; it is not modified.
    y : pandas.Series or array-like
        Target values for the rows of ``df``. A Series is aligned on the
        index of ``df``.
    kf : iterable of (dev_index, val_index)
        Positional row indices, e.g. the result of ``KFold.split``.
    k : float
        Smoothing strength, i.e. the weight of the prior in pseudo-rows.
    column : str
        Name of the categorical column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(y), 1)``. Rows that appear in no ``val_index``
        stay 0.
    """
    prior = np.mean(y)

    # Work on a fresh two-column frame so the caller's df gets no extra column.
    labelled = df[[column]].assign(target=y)
    encoded = np.zeros((y.shape[0], 1))

    for dev_index, val_index in kf:
        dev_part = labelled.iloc[dev_index]
        val_part = labelled.iloc[val_index]

        grouped = dev_part.groupby(column)['target']
        stats = pd.DataFrame({
            'target_mean': grouped.mean(),
            'counts': grouped.size(),
        }).reset_index()

        # A left merge keeps the validation row order, so it lines up with
        # val_index.
        merged = val_part[[column]].merge(stats, on=column, how='left')
        fold_values = (merged['target_mean'] * merged['counts'] + k * prior) / (merged['counts'] + k)

        # Category absent from the dev part -> fall back to the global prior.
        fold_values = fold_values.fillna(prior)
        encoded[val_index, :] = fold_values.values.reshape(-1, 1)

    return encoded

def make_agg_features(X, train_index, test_index, test_data):
    """Add the target-encoded column ``most_freq_cat_te`` for one CV fold.

    Statistics come from the ``train_index`` rows of ``X`` only:

    * the ``test_index`` rows of ``X`` and all rows of ``test_data`` are
      encoded with ``mean_encode_test``;
    * the ``train_index`` rows are encoded out-of-fold with
      ``mean_encode_self`` on an inner 5-fold split (seed 2018).

    ``X`` and ``test_data`` are modified in place: the ``*_te`` column is
    created or overwritten.

    Parameters
    ----------
    X : pandas.DataFrame
        Training frame with ``most_freq_cat`` and ``target`` columns.
        ``train_index`` / ``test_index`` are used as ``.loc`` labels, so the
        frame is expected to carry a default 0..n-1 index.
    train_index, test_index : array-like of int
        Rows of the current fold.
    test_data : pandas.DataFrame
        Hold-out frame with a ``most_freq_cat`` column.

    Returns
    -------
    tuple
        ``(X.loc[train_index, :], X.loc[test_index, :], test_data)``.
    """
    te_cols = ['most_freq_cat']
    smoothing = 10.0
    kf = KFold(n_splits = 5, random_state=2018, shuffle=True)

    # Snapshot the training part once. The encoders only need the categorical
    # column and the target, so one copy serves all three calls.
    train_part = X.loc[train_index,:].copy()
    train_target = X.loc[train_index,'target'].copy()

    for c in te_cols:
        # The encoders return (n, 1) arrays; flatten them so a plain 1-D
        # vector is assigned to the single target column.
        X.loc[test_index,c + '_te'] = np.ravel(
            mean_encode_test(train_part, train_target, X.loc[test_index,:].copy(), smoothing, c))
        test_data.loc[:,c + '_te'] = np.ravel(
            mean_encode_test(train_part, train_target, test_data.copy(), smoothing, c))
        X.loc[train_index,c + '_te'] = np.ravel(
            mean_encode_self(train_part, train_target, kf.split(train_part), smoothing, c))
    return X.loc[train_index,:], X.loc[test_index,:], test_data
    
# Dense feature columns fed to the model, in a fixed order.
# NOTE(review): the '.1' / '.2' suffixed names look like de-duplicated copies
# of a repeated column header -- verify against the preprocessing output.
train_cols = (
    ['num_times_cat_eq_%d' % cat for cat in range(6)]
    + ['records', 'max_days', 'min_days']
    + ['%s_%s_%s' % (stat, src, agg)
       for agg in ('max', 'mean')
       for src in ('f1', 'f2', 'f3')
       for stat in ('sum_values', 'num_keys')]
    + ['max_day_cntr', 'mean_day_cntr']
    + ['nuniq_keys_%s_cat%d' % (src, cat)
       for cat in range(6)
       for src in ('f1', 'f2', 'f3')]
    + ['nuniq_keys_f1', 'nuniq_keys_f1.1', 'nuniq_keys_f1.2']
    + ['sumval_keys_%s_cat%d' % (src, cat)
       for cat in range(6)
       for src in ('f1', 'f2', 'f3')]
    + ['sumval_keys_f1', 'sumval_keys_f1.1', 'sumval_keys_f1.2']
    + ['most_freq_cat_te', 'diff_num_cats', 'unique_days']
    + ['max_%s' % src for src in ('f1', 'f2', 'f3')]
    + ['svd_description_%d' % component for component in range(1, 11)]
)


# LightGBM hyper-parameters passed to lgb.train in the cross-validation loop.
parameters = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=128,
    max_depth=10,
    min_data=50,
    # lambda_l2=15.5,
    min_sum_hessian_in_leaf=0.5,
    # lambda_l1=0.2,
    is_unbalance=True,
    learning_rate=0.01,
    feature_fraction=0.7,
    verbose=0,
)

# Outer cross-validation splitter and the accumulators filled by the loop.
kf = KFold(n_splits=10, shuffle=True, random_state=239)
ifold = 0

# Running sum of weighted test predictions (becomes an array after fold 0).
y_pred = 0
# Out-of-fold predictions for the training rows, NaN until a fold fills them.
y_oof = X[['uid','target']].assign(target=np.nan)
# Validation AUC of each fold.
scores = []

# The three sparse blocks side by side, one CSR matrix each for train and test.
train_mat = sp.hstack([train_mat1,train_mat2,train_mat3], format='csr')
test_mat = sp.hstack([test_mat1,test_mat2,test_mat3], format='csr')

# Cross-validated LightGBM training.
# Per fold: target-encode, select sparse columns on the training part only,
# train with early stopping, store out-of-fold predictions and add the
# weighted test predictions to y_pred.

# Each fold contributes an equal share to the averaged test prediction.
# This is derived from the splitter instead of a hard-coded 0.1.
fold_weight = 1.0 / kf.get_n_splits()

for ifold, (train_index, test_index) in enumerate(kf.split(X)):
    print('fold', ifold)

    y_tr = X.loc[train_index,'target'].values
    y_va = X.loc[test_index,'target'].values

    # Adds / refreshes 'most_freq_cat_te' on X and x_te (in place) for this fold.
    X_tr,X_va,X_te = make_agg_features(X,train_index,test_index,x_te)
    X_tr = X_tr[train_cols]
    X_va = X_va[train_cols]
    X_te = X_te[train_cols]

    # Keep the top 5% of sparse columns. The selector is fitted on the training
    # part of the fold only and then applied to validation and test.
    ssp = SelectPercentile(percentile=5)
    ssp.fit(train_mat[train_index], y_tr)
    sp_train_mat = ssp.transform(train_mat[train_index])
    sp_val_mat = ssp.transform(train_mat[test_index])
    sp_test_mat = ssp.transform(test_mat)

    print('prepare train')
    X_tr = sp.hstack([
        X_tr.astype(np.float32), sp_train_mat.astype(np.float32)
    ]).tocsr()
    print(X_tr.shape)
    print('prepare valid')
    X_va = sp.hstack([
        X_va.astype(np.float32), sp_val_mat.astype(np.float32)
    ]).tocsr()
    print('prepare test')
    X_te = sp.hstack([
        X_te.astype(np.float32), sp_test_mat.astype(np.float32)
    ]).tocsr()

    # Create the LightGBM data containers
    tr_data = lgb.Dataset(X_tr, label=y_tr) #, categorical_feature=cate_cols
    va_data = lgb.Dataset(X_va, label=y_va) #, categorical_feature=cate_cols

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgb.train in LightGBM 4.0 (replaced by callbacks); the
    # call is kept as written for the LightGBM version this notebook ran on.
    model = lgb.train(parameters,
                      tr_data,
                      valid_sets=[tr_data,va_data],
                      num_boost_round=8000,
                      early_stopping_rounds=300,
                      verbose_eval=100)

    # Pass the iteration by keyword: from LightGBM 3.0 on, the second
    # positional argument of Booster.predict is `start_iteration`, so a
    # positional best_iteration would select the wrong trees.
    yhat = model.predict(X_va, num_iteration=model.best_iteration)
    fold_auc = roc_auc_score(y_va,yhat)
    scores.append(fold_auc)
    print(ifold,fold_auc)
    y_oof.loc[test_index,'target'] = yhat

    print('prepare test')

    ytst = model.predict(X_te, num_iteration=model.best_iteration)
    y_pred += ytst * fold_weight

    # X_te is deliberately left alive; only the per-fold train/valid objects go.
    del X_tr,X_va,tr_data,va_data, sp_train_mat, sp_val_mat, sp_test_mat
    gc.collect()

    # Checkpoint after every fold. y_pred is the running weighted sum here, so
    # it is on the final scale only after the last fold.
    save_submit('lgb_q', ifold, y_pred)

fold 0
prepare train
(385194, 11848)
prepare valid
prepare test
Training until validation scores don't improve for 300 rounds.
[100]	training's auc: 0.737146	valid_1's auc: 0.662712
[200]	training's auc: 0.766301	valid_1's auc: 0.671903
[300]	training's auc: 0.787981	valid_1's auc: 0.676771
[400]	training's auc: 0.804843	valid_1's auc: 0.679156
[500]	training's auc: 0.818574	valid_1's auc: 0.679994
[600]	training's auc: 0.830146	valid_1's auc: 0.680625
[700]	training's auc: 0.839768	valid_1's auc: 0.681031
[800]	training's auc: 0.84748	valid_1's auc: 0.680824
[900]	training's auc: 0.85395	valid_1's auc: 0.680532
[1000]	training's auc: 0.859728	valid_1's auc: 0.680158
Early stopping, best iteration is:
[714]	training's auc: 0.841015	valid_1's auc: 0.681113
0 0.681113011371
prepare test
fold 1
prepare train
(385194, 11848)
prepare valid
prepare test
Training until validation scores don't improve for 300 rounds.
[100]	training's auc: 0.737615	valid_1's auc: 0.65384
[200]	training's auc: 0

In [9]:
# Per-fold validation AUCs, followed by their mean and standard deviation.
print(scores)
print(np.mean(scores), np.std(scores))

[0.68111301137079538, 0.66729670632609595, 0.65443438758439576, 0.67125924885411026, 0.6657308180834548, 0.67178026807655611, 0.66406003069547004, 0.67543207952522499, 0.659967836079704, 0.66880812975959825]
0.667988251636 0.00722814195148


In [10]:
# Name used for every artefact written by this cell.
model_name = 'binary_lgbm'

# Out-of-fold predictions, in the row order of X.
np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)

# Key the accumulated test predictions by 'cuid' and left-join them onto the
# test id file, so the saved array and CSV follow that file's row order.
sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv').merge(sub, on='cuid', how='left')

np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
# Sanity check: every id in the test file must have received a prediction.
print('isnull?',sample_sub.target.isnull().any())
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.574572
1,ac4b8244f3ae82df511b002257473c11,0.582298
2,483d8b91e49522c8a5bbe37f3872c749,0.643829
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.411411
4,fdbfba9842ff0bf86d600eb334c7c42b,0.399608


In [11]:
# AUC of the pooled out-of-fold predictions over all training rows.
roc_auc_score(X.target.values, y_oof.target.values)

0.66792997872031157