In [1]:
data_dir = './data/mlboot_dataset/'
model_name = 'xgb_single'
results_dir = './results/'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp
import gc

In [2]:
q = pd.read_csv(data_dir + 'sessions.csv')
q.columns

Index(['uid', 'sess_keys_mean', 'sess_keys_max', 'diff_key1_mean',
       'diff_key1_max', 'diff_key2_mean', 'diff_key2_max', 'diff_key3_mean',
       'diff_key3_max', 'quot_key1_mean', 'quot_key1_max', 'quot_key2_mean',
       'quot_key2_max', 'quot_key3_mean', 'quot_key3_max'],
      dtype='object')

In [3]:
df = pd.read_csv(data_dir + 'preprocessed_new.csv') 
df = df.merge(q, on='uid', how='left')
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

In [4]:
data_svd = pd.DataFrame(np.load(data_dir + 'pca_cat10.npy'), index=df.index)

In [5]:
data_svd.columns = ['svd_description_'+str(i+1) for i in range(10)]
df = pd.concat([df, data_svd], axis=1)    
del data_svd

In [6]:
X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [7]:
df.columns

Index(['uid', 'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days', 'sum_values_f1_max',
       'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
       'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
       'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
       'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
       'mean_day_cntr', 'nuniq_keys_f1_cat0', 'nuniq_keys_f2_cat0',
       'nuniq_keys_f3_cat0', 'nuniq_keys_f1_cat1', 'nuniq_keys_f2_cat1',
       'nuniq_keys_f3_cat1', 'nuniq_keys_f1_cat2', 'nuniq_keys_f2_cat2',
       'nuniq_keys_f3_cat2', 'nuniq_keys_f1_cat3', 'nuniq_keys_f2_cat3',
       'nuniq_keys_f3_cat3', 'nuniq_keys_f1_cat4', 'nuniq_keys_f2_cat4',
       'nuniq_keys_f3_cat4', 'nuniq_keys_f1_cat5', 'nuniq_keys_f2_cat5',
       'nuniq_keys_f3_cat5', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
       'nuniq_key

In [8]:
mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil()
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil()
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil()

mat = sp.hstack([mat1,mat2,mat3]).tolil()
del mat1,mat2,mat3

train_mat = mat[df_train_index.tolist()]
test_mat = mat[df_test_index.tolist()]
mat = mat.tocsc()[:, np.where((train_mat.getnnz(axis=0) > 1) & (test_mat.getnnz(axis=0) > 0))[0]].tocsr()
train_mat = mat[df_train_index.tolist()]
test_mat = mat[df_test_index.tolist()]
del mat
gc.collect()

0

In [9]:
import xgboost as xgb
import scipy.sparse as sp
from sklearn.feature_selection import SelectPercentile
def save_submit(model_name, folds, y_pred):
    global x_te
    sub = x_te[['uid','target']].copy()
    sub['target'] = y_pred
    sub.columns = ['cuid','target']
    sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
    sample_sub = sample_sub.merge(sub, on='cuid', how='left')
    sample_sub[['target']].to_csv(results_dir + model_name + '_' + str(folds) + 'folds.csv', header=False, index=False)
    del sub,sample_sub
    gc.collect()
    
def mean_encode_test(df, y, test,k,column):
    mean_0 = np.zeros((test.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)  
    y0s = df[['target',column]].groupby(column).agg(np.mean).reset_index()
    y0s.columns = [column,'target_mean']
    vc = df[column].value_counts().reset_index()
    vc.columns = [column,'counts']
    test = test.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
    test['mean_target'] = (test.target_mean * test.counts + k * m0)/(test.counts + k)
    mean_0 = np.array(test['mean_target']).reshape(-1,1)
    return mean_0    

def mean_encode_self(df, y, kf, k, column):
    mean_0 = np.zeros((y.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)
    for dev_index, val_index in kf: 
        dev_X, val_X = df.iloc[dev_index,:], df.iloc[val_index,:]
        y0s = dev_X[['target',column]].groupby(column).agg(np.mean).reset_index()
        y0s.columns = [column,'target_mean']
        vc = dev_X[column].value_counts().reset_index()
        vc.columns = [column,'counts']
        val_X = val_X.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
        val_X['mean_target'] = (val_X.target_mean * val_X.counts + k * m0)/(val_X.counts + k)
        mean_0[val_index,:] = np.array(val_X['mean_target']).reshape(-1,1)       
    return mean_0

def make_agg_features(X, train_index, test_index, test_data):
    te_cols = ['most_freq_cat']
    kf = KFold(n_splits = 5, random_state=2018, shuffle=True)
    for c in te_cols:
        X.loc[test_index,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), X.loc[test_index,:].copy(), 10.0, c)
        test_data.loc[:,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), test_data.copy(), 10.0, c)
        X.loc[train_index,c + '_te'] = mean_encode_self(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), kf.split(X.loc[train_index,:]), 10.0, c)
    return X.loc[train_index,:], X.loc[test_index,:], test_data
    
train_cols = ['num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days', 'sum_values_f1_max',
       'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
       'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
       'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
       'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
       'mean_day_cntr', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
       'nuniq_keys_f1.2', 'sumval_keys_f1', 'sumval_keys_f1.1',
       'sumval_keys_f1.2', 'most_freq_cat_te', 'diff_num_cats', 'unique_days',
       'svd_description_1',
       'svd_description_2', 'svd_description_3', 'svd_description_4',
       'svd_description_5', 'svd_description_6', 'svd_description_7',
       'svd_description_8', 'svd_description_9', 'svd_description_10',
       'sess_keys_mean', 'sess_keys_max', 'diff_key1_mean',
       'diff_key1_max', 'diff_key2_mean', 'diff_key2_max', 'diff_key3_mean',
       'diff_key3_max', 'quot_key1_mean', 'quot_key1_max', 'quot_key2_mean',
       'quot_key2_max', 'quot_key3_mean', 'quot_key3_max'] 

# Train the model
parameters = {
    'booster' : 'gbtree',
    'n_estimators':20000,
    'max_depth':8,
    'objective':"binary:logistic",
    'eval_metric':'auc',
    'learning_rate':0.01, 
    'subsample':.6,
    'min_child_weight':20,
    'colsample_bytree':.6,
    'scale_pos_weight': 19,
    'gamma':10,
    #'reg_alpha':1,
    'reg_lambda':1.3,
}

kf = KFold(n_splits=10, shuffle=True, random_state=57)

ifold = 0

y_pred = 0
y_oof = X[['uid','target']].copy()
y_oof['target'] = np.nan

scores = []

for train_index,test_index in kf.split(X):
    print('fold', ifold)
       
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    X_tr,X_va,X_te = make_agg_features(X,train_index,test_index,x_te)
    X_tr = X_tr[train_cols].fillna(0).values
    X_va = X_va[train_cols].fillna(0).values
    X_te = X_te[train_cols].fillna(0).values
    
    yy = y_tr
    ssp = SelectPercentile(percentile=0.1)  
    ssp.fit(train_mat[train_index], yy)
    sp_train_mat = ssp.transform(train_mat[train_index])
    sp_val_mat = ssp.transform(train_mat[test_index])
    sp_test_mat = ssp.transform(test_mat)   
    
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_va = scaler.transform(X_va)
    X_te = scaler.transform(X_te)
    del scaler
    
    from sklearn.preprocessing import MaxAbsScaler 
    scaler = MaxAbsScaler()
    scaler.fit(sp_train_mat)
    sp_train_mat = scaler.transform(sp_train_mat)
    sp_val_mat = scaler.transform(sp_val_mat)
    sp_test_mat = scaler.transform(sp_test_mat)
    del scaler
    
    print('prepare train')
    X_tr = sp.hstack([
        X_tr, sp_train_mat
    ]).tocsr()
    print(X_tr.shape)
    print('prepare valid')
    X_va = sp.hstack([
        X_va, sp_val_mat
    ]).tocsr()    
    print('prepare test')
    X_te = sp.hstack([
        X_te, sp_test_mat
    ]).tocsr()     

    # Create the LightGBM data containers
    tr_data = xgb.DMatrix(X_tr, label=y_tr) #, categorical_feature=cate_cols
    va_data = xgb.DMatrix(X_va, label=y_va) #, categorical_feature=cate_cols
    te_data = xgb.DMatrix(X_te, label=y_va)
    model = xgb.train(parameters,
                      tr_data,
                      evals=[(tr_data,'train'),(va_data,'valid')],
                      num_boost_round=8000,
                      early_stopping_rounds=300,
                      #maximize = True,
                      verbose_eval=100)
    
    yhat = model.predict(va_data, model.best_iteration)
    scores.append(roc_auc_score(y_va,yhat))
    print(ifold,roc_auc_score(y_va,yhat))
    y_oof.loc[test_index,'target'] = yhat

    print('prepare test')
    
    ytst = model.predict(te_data, model.best_iteration)
    y_pred += ytst*0.1
    
    del X_tr,X_va,tr_data,va_data,te_data, sp_train_mat, sp_val_mat, sp_test_mat
    gc.collect()    
    
    save_submit('xgb_q', ifold, y_pred)

    ifold += 1    

fold 0


 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385194, 549)
prepare valid
prepare test
[0]	train-auc:0.627956	valid-auc:0.616687
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.70732	valid-auc:0.657253
[200]	train-auc:0.725722	valid-auc:0.663112
[300]	train-auc:0.739502	valid-auc:0.666423
[400]	train-auc:0.752148	valid-auc:0.668144
[500]	train-auc:0.763095	valid-auc:0.668874
[600]	train-auc:0.773226	valid-auc:0.669371
[700]	train-auc:0.783079	valid-auc:0.66953
[800]	train-auc:0.792942	valid-auc:0.669552
[900]	train-auc:0.80229	valid-auc:0.669572
[1000]	train-auc:0.810355	valid-auc:0.668641
Stopping. Best iteration:
[719]	train-auc:0.784708	valid-auc:0.669625

0 0.66856490453
prepare test
fold 1


 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385194, 549)
prepare valid
prepare test
[0]	train-auc:0.628919	valid-auc:0.597608
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.709549	valid-auc:0.652978
[200]	train-auc:0.726721	valid-auc:0.65636
[300]	train-auc:0.740327	valid-auc:0.658916
[400]	train-auc:0.752343	valid-auc:0.660214
[500]	train-auc:0.763245	valid-auc:0.661415
[600]	train-auc:0.772642	valid-auc:0.66212
[700]	train-auc:0.782141	valid-auc:0.662092
[800]	train-auc:0.790808	valid-auc:0.661524
Stopping. Best iteration:
[591]	train-auc:0.772044	valid-auc:0.662207

1 0.66127623624
prepare test
fold 2


 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385194, 549)
prepare valid
prepare test
[0]	train-auc:0.633095	valid-auc:0.596459
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.71005	valid-auc:0.650817
[200]	train-auc:0.726267	valid-auc:0.65568
[300]	train-auc:0.740448	valid-auc:0.659237
[400]	train-auc:0.751559	valid-auc:0.660934
[500]	train-auc:0.762045	valid-auc:0.662123
[600]	train-auc:0.771435	valid-auc:0.662705
[700]	train-auc:0.780723	valid-auc:0.662621
[800]	train-auc:0.789558	valid-auc:0.662649
[900]	train-auc:0.798871	valid-auc:0.662962
[1000]	train-auc:0.807476	valid-auc:0.66295
[1100]	train-auc:0.81567	valid-auc:0.662787
[1200]	train-auc:0.823532	valid-auc:0.662777
[1300]	train-auc:0.831008	valid-auc:0.662467
Stopping. Best iteration:
[1052]	train-auc:0.811744	valid-auc:0.663127

2 0.662259214769
prepare test
fold 3
prepare train
(385194, 549)
prepare valid
prepare test
[0]	train-auc:0.626597	

 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385195, 549)
prepare valid
prepare test
[0]	train-auc:0.629023	valid-auc:0.601385
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.707925	valid-auc:0.650114
[200]	train-auc:0.725719	valid-auc:0.656352
[300]	train-auc:0.739868	valid-auc:0.660121
[400]	train-auc:0.75152	valid-auc:0.66205
[500]	train-auc:0.762022	valid-auc:0.663311
[600]	train-auc:0.772581	valid-auc:0.663908
[700]	train-auc:0.781562	valid-auc:0.664428
[800]	train-auc:0.790174	valid-auc:0.664299
[900]	train-auc:0.799345	valid-auc:0.664399
[1000]	train-auc:0.807641	valid-auc:0.664303
[1100]	train-auc:0.815259	valid-auc:0.664217
[1200]	train-auc:0.822953	valid-auc:0.663954
Stopping. Best iteration:
[938]	train-auc:0.802545	valid-auc:0.664521

4 0.663855671264
prepare test
fold 5


 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385195, 549)
prepare valid
prepare test
[0]	train-auc:0.625638	valid-auc:0.597447
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.708468	valid-auc:0.645182
[200]	train-auc:0.726016	valid-auc:0.650689
[300]	train-auc:0.740722	valid-auc:0.654253
[400]	train-auc:0.752183	valid-auc:0.655519
[500]	train-auc:0.762724	valid-auc:0.655993
[600]	train-auc:0.772277	valid-auc:0.656493
[700]	train-auc:0.781829	valid-auc:0.656805
[800]	train-auc:0.790412	valid-auc:0.657124
[900]	train-auc:0.799463	valid-auc:0.657647
[1000]	train-auc:0.807477	valid-auc:0.657746
[1100]	train-auc:0.815696	valid-auc:0.657635
[1200]	train-auc:0.823165	valid-auc:0.657715
[1300]	train-auc:0.830632	valid-auc:0.657724
[1400]	train-auc:0.837969	valid-auc:0.657398
[1500]	train-auc:0.844861	valid-auc:0.657391
Stopping. Best iteration:
[1240]	train-auc:0.825765	valid-auc:0.657846

5 0.657244825977
prep

 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385195, 549)
prepare valid
prepare test
[0]	train-auc:0.629122	valid-auc:0.599823
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.708462	valid-auc:0.646134
[200]	train-auc:0.724618	valid-auc:0.650373
[300]	train-auc:0.738033	valid-auc:0.653101
[400]	train-auc:0.750197	valid-auc:0.65321
[500]	train-auc:0.761352	valid-auc:0.653691
[600]	train-auc:0.771208	valid-auc:0.654054
[700]	train-auc:0.781341	valid-auc:0.653581
[800]	train-auc:0.790554	valid-auc:0.653509
Stopping. Best iteration:
[587]	train-auc:0.770206	valid-auc:0.654191

6 0.65308670113
prepare test
fold 7


 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385195, 549)
prepare valid
prepare test
[0]	train-auc:0.626857	valid-auc:0.601101
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.709078	valid-auc:0.651507
[200]	train-auc:0.726122	valid-auc:0.655005
[300]	train-auc:0.740516	valid-auc:0.656642
[400]	train-auc:0.75221	valid-auc:0.657758
[500]	train-auc:0.762817	valid-auc:0.658006
[600]	train-auc:0.772715	valid-auc:0.657526
[700]	train-auc:0.781911	valid-auc:0.65742
Stopping. Best iteration:
[486]	train-auc:0.761416	valid-auc:0.658286

7 0.657140235711
prepare test
fold 8


 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385195, 549)
prepare valid
prepare test
[0]	train-auc:0.620871	valid-auc:0.599407
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.710835	valid-auc:0.644776
[200]	train-auc:0.727599	valid-auc:0.65029
[300]	train-auc:0.742158	valid-auc:0.653613
[400]	train-auc:0.753837	valid-auc:0.655626
[500]	train-auc:0.764016	valid-auc:0.656377
[600]	train-auc:0.773578	valid-auc:0.656871
[700]	train-auc:0.783443	valid-auc:0.656982
[800]	train-auc:0.792781	valid-auc:0.656985
[900]	train-auc:0.801366	valid-auc:0.656652
Stopping. Best iteration:
[667]	train-auc:0.780224	valid-auc:0.657159

8 0.656130640596
prepare test
fold 9


 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

prepare train
(385195, 549)
prepare valid
prepare test
[0]	train-auc:0.628217	valid-auc:0.59643
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.708365	valid-auc:0.647282
[200]	train-auc:0.725236	valid-auc:0.650899
[300]	train-auc:0.739217	valid-auc:0.653428
[400]	train-auc:0.751049	valid-auc:0.654934
[500]	train-auc:0.762159	valid-auc:0.655467
[600]	train-auc:0.772668	valid-auc:0.656048
[700]	train-auc:0.782288	valid-auc:0.656321
[800]	train-auc:0.791913	valid-auc:0.656779
[900]	train-auc:0.800736	valid-auc:0.65704
[1000]	train-auc:0.809258	valid-auc:0.65702
[1100]	train-auc:0.817282	valid-auc:0.657406
[1200]	train-auc:0.824347	valid-auc:0.657097
[1300]	train-auc:0.831991	valid-auc:0.656888
[1400]	train-auc:0.83926	valid-auc:0.65646
Stopping. Best iteration:
[1144]	train-auc:0.820536	valid-auc:0.657538

9 0.656430491826
prepare test


In [10]:
print(scores)
print(np.mean(scores), np.std(scores))

[0.66856490453023998, 0.66127623623984477, 0.66225921476859728, 0.66731323545769516, 0.6638556712644641, 0.65724482597689549, 0.65308670113001788, 0.65714023571133806, 0.65613064059608972, 0.65643049182648083]
0.66033021575 0.0048849347694


In [11]:
np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,-0.305083
1,ac4b8244f3ae82df511b002257473c11,0.256976
2,483d8b91e49522c8a5bbe37f3872c749,0.507784
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,-0.433846
4,fdbfba9842ff0bf86d600eb334c7c42b,-0.528173
