In [1]:
data_dir = './data/mlboot_dataset/'
model_name = 'xgb_b1'
results_dir = './results/'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp
import gc

In [2]:
df = pd.read_csv(data_dir + 'preprocessed_new.csv') 
q = pd.read_csv(data_dir + 'sessions.csv')
df = df.merge(q, on='uid', how='left')
del q
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil()
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil()
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil()
print(mat1.shape, mat2.shape, mat3.shape)

df['max_f1'] = mat1.tocsr().max(axis=1).todense()
df['max_f2'] = mat2.tocsr().max(axis=1).todense()
df['max_f3'] = mat3.tocsr().max(axis=1).todense()

train_mat1 = mat1[df_train_index.tolist()]
test_mat1 = mat1[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]

limit = 11
mat1 = mat1.tocsc()[:, np.where((train_mat1.getnnz(axis=0) > limit) & (test_mat1.getnnz(axis=0) > 0))[0]].tocsr()
mat2 = mat2.tocsc()[:, np.where((train_mat2.getnnz(axis=0) > limit) & (test_mat2.getnnz(axis=0) > 0))[0]].tocsr()
mat3 = mat3.tocsc()[:, np.where((train_mat3.getnnz(axis=0) > limit) & (test_mat3.getnnz(axis=0) > 0))[0]].tocsr()

(609018, 2053602) (609018, 2812610) (609018, 1057788)


In [3]:
print(mat1.shape, mat2.shape, mat3.shape)

(609018, 195733) (609018, 20268) (609018, 9415)


In [4]:
data_svd = pd.DataFrame(np.load(data_dir + 'pca_cat10.npy'), index=df.index)
data_svd.columns = ['svd_description_'+str(i+1) for i in range(10)]
df = pd.concat([df, data_svd], axis=1)    
del data_svd

In [5]:
data_svd = pd.DataFrame(np.load(data_dir + 'bin_pca_dim10.npy'), index=df.index)
data_svd.columns = ['svd_title_'+str(i+1) for i in range(10)]
df = pd.concat([df, data_svd], axis=1)    
del data_svd

In [6]:
train_mat1 = mat1[df_train_index.tolist()]
test_mat1 = mat1[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]

del mat1,mat2,mat3
gc.collect()

6

In [7]:
X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [8]:
df.columns

Index(['uid', 'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days',
       ...
       'svd_title_1', 'svd_title_2', 'svd_title_3', 'svd_title_4',
       'svd_title_5', 'svd_title_6', 'svd_title_7', 'svd_title_8',
       'svd_title_9', 'svd_title_10'],
      dtype='object', length=107)

In [9]:
from sklearn.preprocessing import minmax_scale

In [10]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_baseline_sparse_10folds.npy'))
sample_sub.columns = ['uid','lgbm1']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [11]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_all_in_focal_loss.npy'))
sample_sub.columns = ['uid','nnet2']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [12]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_focal_loss_m3.npy'))
sample_sub.columns = ['uid','nnet3']
x_te = x_te.merge(sample_sub, on='uid', how='left')
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_focal_loss_m1.npy'))
sample_sub.columns = ['uid','nnet1']
x_te = x_te.merge(sample_sub, on='uid', how='left')
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_ftrl_50.npy'))
sample_sub.columns = ['uid','ftrl_50']
x_te = x_te.merge(sample_sub, on='uid', how='left')
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_ftrl_70.npy'))
sample_sub.columns = ['uid','ftrl']
x_te = x_te.merge(sample_sub, on='uid', how='left')
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_advanced_model.npy'))
sample_sub.columns = ['uid','nnet4']
x_te = x_te.merge(sample_sub, on='uid', how='left')
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_advanced_model_catpca.npy'))
sample_sub.columns = ['uid','nnet5']
x_te = x_te.merge(sample_sub, on='uid', how='left')
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_advanced_model_3br.npy'))
sample_sub.columns = ['uid','nnet6']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [13]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_advanced_model_3br_wcc.npy'))
sample_sub.columns = ['uid','nnet7']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [14]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_advanced_model_3br_tanh.npy'))
sample_sub.columns = ['uid','nnet8']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [15]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_binary_lgbm.npy'))
sample_sub.columns = ['uid','lgbmb']
x_te = x_te.merge(sample_sub, on='uid', how='left')

sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_advanced_model_3br_v4.npy'))
sample_sub.columns = ['uid','nnet10']
x_te = x_te.merge(sample_sub, on='uid', how='left')

sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_advanced_model_3br_v5.npy'))
sample_sub.columns = ['uid','nnet11']
x_te = x_te.merge(sample_sub, on='uid', how='left')

sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_xgb_single.npy'))
sample_sub.columns = ['uid','xgb_single']
x_te = x_te.merge(sample_sub, on='uid', how='left')

sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_3br_bin_v2.npy'))
sample_sub.columns = ['uid','nnet12']
x_te = x_te.merge(sample_sub, on='uid', how='left')

sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_advanced_model_3br_bin.npy'))
sample_sub.columns = ['uid','nnet13']
x_te = x_te.merge(sample_sub, on='uid', how='left')


In [16]:
X['lgbm1'] = minmax_scale(np.load(results_dir + 'train_baseline_sparse_10folds.npy'))
X['lgbmb'] = minmax_scale(np.load(results_dir + 'train_binary_lgbm.npy'))
X['nnet2'] = minmax_scale(np.load(results_dir + 'train_all_in_focal_loss.npy'))
X['nnet1'] = minmax_scale(np.load(results_dir + 'train_focal_loss_m1.npy'))
X['nnet3'] = minmax_scale(np.load(results_dir + 'train_focal_loss_m3.npy'))
X['nnet4'] = minmax_scale(np.load(results_dir + 'train_nn_advanced_model.npy'))
X['nnet5'] = minmax_scale(np.load(results_dir + 'train_nn_advanced_model_catpca.npy'))
X['nnet6'] = minmax_scale(np.load(results_dir + 'train_nn_advanced_model_3br.npy'))
X['nnet7'] = minmax_scale(np.load(results_dir + 'train_nn_advanced_model_3br_wcc.npy'))
X['nnet8'] = minmax_scale(np.load(results_dir + 'train_nn_advanced_model_3br_tanh.npy'))
X['ftrl_50']  = minmax_scale(np.load(results_dir + 'train_ftrl_50.npy'))
X['ftrl']  = minmax_scale(np.load(results_dir + 'train_ftrl_70.npy'))
X['nnet10'] = minmax_scale(np.load(results_dir + 'train_nn_advanced_model_3br_v4.npy'))
X['nnet11'] = minmax_scale(np.load(results_dir + 'train_nn_advanced_model_3br_v5.npy'))
X['xgb_single']  = minmax_scale(np.load(results_dir + 'train_xgb_single.npy'))
X['nnet12'] = minmax_scale(np.load(results_dir + 'train_nn_3br_bin_v2.npy'))
X['nnet13'] = minmax_scale(np.load(results_dir + 'train_nn_advanced_model_3br_bin.npy'))
#

In [17]:
for f in ['lgbm1','nnet2','nnet1','nnet3','nnet4','nnet5','ftrl','nnet6','nnet8','ftrl_50','nnet11','xgb_single','lgbmb','nnet12','nnet13']:
    print(f,roc_auc_score(X.target, X[f]))

lgbm1 0.682562603224
nnet2 0.649222662589
nnet1 0.623698784984
nnet3 0.574520607881
nnet4 0.647936920552
nnet5 0.648281500429
ftrl 0.622924063126
nnet6 0.658849449887
nnet8 0.657409030233
ftrl_50 0.623440994459
nnet11 0.660053076044
xgb_single 0.660175309563
lgbmb 0.66792997872
nnet12 0.67274153339
nnet13 0.67264163497


In [18]:
X[['nnet12','nnet13']].corr()

Unnamed: 0,nnet12,nnet13
nnet12,1.0,0.815829
nnet13,0.815829,1.0


In [19]:
x_te[['nnet12','nnet13']].corr()

Unnamed: 0,nnet12,nnet13
nnet12,1.0,0.872521
nnet13,0.872521,1.0


In [20]:
train_mat = sp.hstack([train_mat1,train_mat2,train_mat3]).astype(np.bool).astype(np.int8).tocsr()
test_mat = sp.hstack([test_mat1,test_mat2,test_mat3]).astype(np.bool).astype(np.int8).tocsr()

In [41]:
import xgboost as xgb
import scipy.sparse as sp
from sklearn.feature_selection import SelectPercentile
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from vowpalwabbit.sklearn_vw import VWClassifier,VWRegressor

def mean_encode_test(df, y, test,k,column):
    mean_0 = np.zeros((test.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)  
    y0s = df[['target',column]].groupby(column).agg(np.mean).reset_index()
    y0s.columns = [column,'target_mean']
    vc = df[column].value_counts().reset_index()
    vc.columns = [column,'counts']
    test = test.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
    test['mean_target'] = (test.target_mean * test.counts + k * m0)/(test.counts + k)
    mean_0 = np.array(test['mean_target']).reshape(-1,1)
    return mean_0    

def mean_encode_self(df, y, kf, k, column):
    mean_0 = np.zeros((y.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)
    for dev_index, val_index in kf: 
        dev_X, val_X = df.iloc[dev_index,:], df.iloc[val_index,:]
        y0s = dev_X[['target',column]].groupby(column).agg(np.mean).reset_index()
        y0s.columns = [column,'target_mean']
        vc = dev_X[column].value_counts().reset_index()
        vc.columns = [column,'counts']
        val_X = val_X.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
        val_X['mean_target'] = (val_X.target_mean * val_X.counts + k * m0)/(val_X.counts + k)
        mean_0[val_index,:] = np.array(val_X['mean_target']).reshape(-1,1)       
    return mean_0

def make_agg_features(X, train_index, test_index, test_data):
    te_cols = ['most_freq_cat']
    kf = KFold(n_splits = 5, random_state=2018, shuffle=True)
    for c in te_cols:
        X.loc[test_index,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), X.loc[test_index,:].copy(), 10.0, c)
        test_data.loc[:,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), test_data.copy(), 10.0, c)
        X.loc[train_index,c + '_te'] = mean_encode_self(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), kf.split(X.loc[train_index,:]), 10.0, c)
    return X.loc[train_index,:], X.loc[test_index,:], test_data
    
train_cols = ['sess_keys_mean','sess_keys_max','diff_key1_mean','diff_key1_max','diff_key2_mean',
              'diff_key2_max','diff_key3_mean','diff_key3_max','quot_key1_mean','quot_key1_max',
              'quot_key2_mean','quot_key2_max','quot_key3_mean','quot_key3_max',
              'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
              'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
              'records', 'max_days', 'min_days', 'sum_values_f1_max',
              'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
              'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
              'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
              'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
              'mean_day_cntr', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
              'nuniq_keys_f1.2', 'sumval_keys_f1', 'sumval_keys_f1.1',
              'sumval_keys_f1.2', 'most_freq_cat_te', 'diff_num_cats', 'unique_days','max_f1','max_f2','max_f3',
              'svd_description_1','svd_description_2','svd_description_3','svd_description_4','svd_description_5',
              'svd_description_6','svd_description_7','svd_description_8','svd_description_9','svd_description_10'] + ['svd_title_'+str(i+1) for i in range(10)]
    
kf = KFold(n_splits=10, shuffle=True, random_state=239)

ifold = 0

y_pred = 0
y_oof = X[['uid','target']].copy()
y_oof['target'] = np.nan

scores = []

for train_index,test_index in kf.split(X):
    print('fold', ifold)
       
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    
    X_tr,X_va,X_te = make_agg_features(X,train_index,test_index,x_te)
    X_tr = X_tr[train_cols].fillna(0)
    X_va = X_va[train_cols].fillna(0)
    X_te = X_te[train_cols].fillna(0)
    
    scaler = MinMaxScaler(feature_range=(-1,1))
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)
    X_te = scaler.transform(X_te)
    
    yy = y_tr
    ssp = SelectPercentile(percentile=0.7)  
    ssp.fit(train_mat[train_index], yy)
    sp_train_mat = ssp.transform(train_mat[train_index])
    sp_val_mat = ssp.transform(train_mat[test_index])
    sp_test_mat = ssp.transform(test_mat)   
    
    print('prepare train')
    X_tr = sp.hstack([
        X_tr, sp_train_mat
    ]).tocsr()
    print(X_tr.shape)
    print('prepare valid')
    X_va = sp.hstack([
        X_va, sp_val_mat
    ]).tocsr()    
    print('prepare test')
    X_te = sp.hstack([
        X_te, sp_test_mat
    ]).tocsr()     
    
    model = VWRegressor(power_t=0.99, ftrl=True, l1=15, l2=0.1)
    model.fit(X_tr,y_tr)
    
    yhat = model.predict(X_va)
    scores.append(roc_auc_score(y_va,yhat))
    print(ifold,roc_auc_score(y_va,yhat))
    y_oof.loc[test_index,'target'] = yhat
   
    ytst = model.predict(X_te)
    print(ytst)
    print(minmax_scale(ytst))
    y_pred += minmax_scale(ytst)*0.1
    
    del X_tr,X_va,X_te, sp_train_mat, sp_val_mat, sp_test_mat
    gc.collect()    
    
    ifold += 1 
#model = VWRegressor(power_t=0.99, ftrl=True, l1=15, l2=0.1)
#0.678657863271
#0.654927206221

#0.8 perc
# 0 0.681140205086
# 1 0.664159676552
# 2 0.661224538511
# 3 0.671735521827

fold 0
prepare train
(385194, 1647)
prepare valid
prepare test
0 0.68135971202
[ 0.07684783  0.08619268  0.04895015 ...,  0.04038424  0.          0.        ]
[ 0.13111016  0.14705343  0.0835139  ...,  0.0688996   0.          0.        ]
fold 1
prepare train
(385194, 1647)
prepare valid
prepare test
1 0.663118094943
[ 0.05970712  0.0895149   0.06341106 ...,  0.02558964  0.          0.        ]
[ 0.09207724  0.13804524  0.09778926 ...,  0.03946303  0.          0.        ]
fold 2
prepare train
(385194, 1647)
prepare valid
prepare test
2 0.661822823645
[ 0.07713342  0.09433205  0.0546173  ...,  0.03627067  0.          0.        ]
[ 0.11766539  0.14390155  0.08331753 ...,  0.05533014  0.          0.        ]
fold 3
prepare train
(385194, 1647)
prepare valid
prepare test
3 0.672444359835
[ 0.07481077  0.0947302   0.05549809 ...,  0.03726765  0.          0.        ]
[ 0.11658251  0.14762425  0.0864863  ...,  0.05807662  0.          0.        ]
fold 4
prepare train
(385195, 1647)
prepare valid

In [42]:
print(scores)
print(np.mean(scores), np.std(scores))

[0.68135971202016565, 0.66311809494326535, 0.66182282364461864, 0.67244435983496054, 0.66102971682157374, 0.66680777408355585, 0.66662234052982638, 0.67539423377967267, 0.67020303763930711, 0.66838887992879226]
0.668719097323 0.00605505760655


In [43]:
roc_auc_score(X.target.values, y_oof.target.values)

0.66823684230830271

In [44]:
model_name = 'vw_07perc'

np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.340757
1,ac4b8244f3ae82df511b002257473c11,0.091455
2,483d8b91e49522c8a5bbe37f3872c749,0.137531
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.063049
4,fdbfba9842ff0bf86d600eb334c7c42b,0.071593
