In [1]:
data_dir = './data/mlboot_dataset/'
model_name = 'j1'
results_dir = './results/'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp
import gc

In [2]:
df = pd.read_csv(data_dir + 'preprocessed_new.csv') 
q = pd.read_csv(data_dir + 'sessions.csv')
df = df.merge(q, on='uid', how='left')
del q
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil()
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil()
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil()
print(mat1.shape, mat2.shape, mat3.shape)

(609018, 2053602) (609018, 20275) (609018, 1057788)


In [3]:
df['max_f1'] = mat1.tocsr().max(axis=1).todense()
df['max_f2'] = mat2.tocsr().max(axis=1).todense()
df['max_f3'] = mat3.tocsr().max(axis=1).todense()

train_mat1 = mat1[df_train_index.tolist()]
test_mat1 = mat1[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]

limit = 11
mat1 = mat1.tocsc()[:, np.where((train_mat1.getnnz(axis=0) > limit) & (test_mat1.getnnz(axis=0) > 0))[0]].tocsr()
mat2 = mat2.tocsc()[:, np.where((train_mat2.getnnz(axis=0) > limit) & (test_mat2.getnnz(axis=0) > 0))[0]].tocsr()
mat3 = mat3.tocsc()[:, np.where((train_mat3.getnnz(axis=0) > limit) & (test_mat3.getnnz(axis=0) > 0))[0]].tocsr()

In [4]:
print(mat1.shape, mat2.shape, mat3.shape)

(609018, 195734) (609018, 20268) (609018, 9415)


In [5]:
data_svd = pd.DataFrame(np.load(data_dir + 'pca_cat10.npy'), index=df.index)
data_svd.columns = ['svd_description_'+str(i+1) for i in range(10)]
df = pd.concat([df, data_svd], axis=1)    
del data_svd

In [6]:
data_svd = pd.DataFrame(np.load(data_dir + 'bin_pca_dim10.npy'), index=df.index)
data_svd.columns = ['svd_title_'+str(i+1) for i in range(10)]
df = pd.concat([df, data_svd], axis=1)    
del data_svd

In [7]:
train_mat1 = mat1[df_train_index.tolist()]
test_mat1 = mat1[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]

del mat1,mat2,mat3
gc.collect()

6

In [8]:
X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [9]:
df.columns

Index(['uid', 'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
       'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
       'records', 'max_days', 'min_days',
       ...
       'svd_title_1', 'svd_title_2', 'svd_title_3', 'svd_title_4',
       'svd_title_5', 'svd_title_6', 'svd_title_7', 'svd_title_8',
       'svd_title_9', 'svd_title_10'],
      dtype='object', length=107)

In [10]:
from sklearn.preprocessing import minmax_scale

In [11]:
meta = pd.read_csv(data_dir + 'train_meta.csv')
X = X.merge(meta, on='uid', how='left')

meta = pd.read_csv(data_dir + 'test_meta.csv')
x_te = x_te.merge(meta, on='uid', how='left')

In [12]:
train_mat = sp.hstack([train_mat1,train_mat2,train_mat3]).tocsr()
test_mat = sp.hstack([test_mat1,test_mat2,test_mat3]).tocsr()

In [17]:
mat = test_mat.sum(axis=0)
ixs = np.asarray(mat)[0].argsort()[-750:][::-1]
train_mat = train_mat[:,ixs]
test_mat = test_mat[:,ixs]

In [18]:
df.diff_num_cats.value_counts()

1    478079
2    120192
3     10459
4       285
5         3
Name: diff_num_cats, dtype: int64

In [19]:
df.most_freq_cat.value_counts()

5    329708
2    166519
1     47177
0     38653
3     22501
4      4460
Name: most_freq_cat, dtype: int64

In [20]:
import xgboost as xgb
import scipy.sparse as sp
from sklearn.feature_selection import SelectPercentile
def save_submit(model_name, folds, y_pred):
    global x_te
    sub = x_te[['uid','target']].copy()
    sub['target'] = y_pred
    sub.columns = ['cuid','target']
    sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
    sample_sub = sample_sub.merge(sub, on='cuid', how='left')
    sample_sub[['target']].to_csv(results_dir + model_name + '_' + str(folds) + 'folds.csv', header=False, index=False)
    del sub,sample_sub
    gc.collect()
    
def mean_encode_test(df, y, test,k,column):
    mean_0 = np.zeros((test.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)  
    y0s = df[['target',column]].groupby(column).agg(np.mean).reset_index()
    y0s.columns = [column,'target_mean']
    vc = df[column].value_counts().reset_index()
    vc.columns = [column,'counts']
    test = test.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
    test['mean_target'] = (test.target_mean * test.counts + k * m0)/(test.counts + k)
    mean_0 = np.array(test['mean_target']).reshape(-1,1)
    return mean_0    

def mean_encode_self(df, y, kf, k, column):
    mean_0 = np.zeros((y.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)
    for dev_index, val_index in kf: 
        dev_X, val_X = df.iloc[dev_index,:], df.iloc[val_index,:]
        y0s = dev_X[['target',column]].groupby(column).agg(np.mean).reset_index()
        y0s.columns = [column,'target_mean']
        vc = dev_X[column].value_counts().reset_index()
        vc.columns = [column,'counts']
        val_X = val_X.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
        val_X['mean_target'] = (val_X.target_mean * val_X.counts + k * m0)/(val_X.counts + k)
        mean_0[val_index,:] = np.array(val_X['mean_target']).reshape(-1,1)       
    return mean_0

def make_agg_features(X, train_index, test_index, test_data):
    te_cols = ['most_freq_cat','diff_num_cats']
    kf = KFold(n_splits = 5, random_state=2018, shuffle=True)
    for c in te_cols:
        X.loc[test_index,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), X.loc[test_index,:].copy(), 10.0, c)
        test_data.loc[:,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), test_data.copy(), 10.0, c)
        X.loc[train_index,c + '_te'] = mean_encode_self(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), kf.split(X.loc[train_index,:]), 10.0, c)
    return X.loc[train_index,:], X.loc[test_index,:], test_data
    
train_cols = ['sess_keys_mean','sess_keys_max','diff_key1_mean','diff_key1_max','diff_key2_mean',
              'diff_key2_max','diff_key3_mean','diff_key3_max','quot_key1_mean','quot_key1_max',
              'quot_key2_mean','quot_key2_max','quot_key3_mean','quot_key3_max',
              'num_times_cat_eq_0', 'num_times_cat_eq_2', 'num_times_cat_eq_5',
              'records', 'max_days', 'min_days', 'sum_values_f1_max',
              'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
              'sum_values_f1_mean',
              'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
              'max_day_cntr',
              'mean_day_cntr', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
              'nuniq_keys_f1.2', 'sumval_keys_f1', 'sumval_keys_f1.1',
              'sumval_keys_f1.2', 'most_freq_cat_te', 'unique_days','max_f1','max_f2',
              'svd_description_1','svd_description_2','svd_description_3',
              'svd_description_4','svd_description_5','svd_description_6',
              'svd_description_7','svd_description_8','svd_description_9',
              'most_freq_cat_te'] + ['svd_title_'+str(i+1) for i in range(9)] + meta.drop(['uid'], axis=1).columns.tolist()

# Train the model
parameters = {
    'booster' : 'gbtree',
    'n_estimators':20000,
    'max_depth':4,
    'objective':"binary:logistic",
    'eval_metric':'auc',
    'learning_rate':0.004, 
    'subsample':.6,
    'min_child_weight':10,
    'colsample_bytree':.6,
    'scale_pos_weight': 19,
    'gamma':1,
    'reg_lambda' : 41.3,
}

kf = KFold(n_splits=10, shuffle=True, random_state=239)

ifold = 0

y_pred = 0
y_oof = X[['uid','target']].copy()
y_oof['target'] = np.nan

scores = []

for train_index,test_index in kf.split(X):
    print('fold', ifold)
       
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    X_tr,X_va,X_te = make_agg_features(X,train_index,test_index,x_te)
    X_tr = X_tr[train_cols].fillna(0).values
    X_va = X_va[train_cols].fillna(0).values
    X_te = X_te[train_cols].fillna(0).values
    
    #yy = y_tr
    #ssp = SelectPercentile(percentile=0.1)  
    #ssp.fit(train_mat[train_index], yy)
    #sp_train_mat = ssp.transform(train_mat[train_index])
    #sp_val_mat = ssp.transform(train_mat[test_index])
    #sp_test_mat = ssp.transform(test_mat)   
    
    #from sklearn.preprocessing import StandardScaler
    #scaler = StandardScaler()
    #scaler.fit(X_tr)
    #X_tr = scaler.transform(X_tr)
    #X_va = scaler.transform(X_va)
    #X_te = scaler.transform(X_te)
    #del scaler
    
    sp_train_mat = train_mat[train_index]
    sp_test_mat = test_mat
    sp_val_mat = train_mat[test_index]
    
    from sklearn.preprocessing import MaxAbsScaler 
    scaler = MaxAbsScaler()
    scaler.fit(sp_train_mat)
    sp_train_mat = scaler.transform(sp_train_mat)
    sp_val_mat = scaler.transform(sp_val_mat)
    sp_test_mat = scaler.transform(sp_test_mat)
    del scaler
    
    X_tr = sp.hstack([
        X_tr, sp_train_mat
    ]).tocsr()
    print(X_tr.shape)
    X_va = sp.hstack([
        X_va, sp_val_mat
    ]).tocsr()    
    X_te = sp.hstack([
        X_te, sp_test_mat
    ]).tocsr()     

    # Create the LightGBM data containers
    tr_data = xgb.DMatrix(X_tr, label=y_tr) #, categorical_feature=cate_cols
    va_data = xgb.DMatrix(X_va, label=y_va) #, categorical_feature=cate_cols
    te_data = xgb.DMatrix(X_te, label=y_va)
    model = xgb.train(parameters,
                      tr_data,
                      evals=[(tr_data,'train'),(va_data,'valid')],
                      num_boost_round=8000,
                      early_stopping_rounds=300,
                      #maximize = True,
                      verbose_eval=100)
    
    yhat = model.predict(va_data, model.best_iteration)
    scores.append(roc_auc_score(y_va,yhat))
    print(ifold,roc_auc_score(y_va,yhat))
    y_oof.loc[test_index,'target'] = yhat
   
    ytst = model.predict(te_data, model.best_iteration)
    print(ytst)
    print(minmax_scale(ytst))
    y_pred += minmax_scale(ytst)*0.1
    
    del X_tr,X_va,tr_data,va_data,te_data, sp_train_mat, sp_val_mat, sp_test_mat
    gc.collect()    
    
    save_submit('xgb_q', ifold, y_pred)

    ifold += 1    

fold 0
(385194, 345)
[0]	train-auc:0.689925	valid-auc:0.701331
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.697817	valid-auc:0.708615
[200]	train-auc:0.698516	valid-auc:0.708747
[300]	train-auc:0.699302	valid-auc:0.708698
Stopping. Best iteration:
[13]	train-auc:0.696628	valid-auc:0.709409

0 0.708711190124
[ 0.6627304   0.26411811  0.01822019 ...,  0.09749877  0.04546159
 -0.6171779 ]
[ 0.73890769  0.57077128  0.46705046 ...,  0.50049049  0.47854099
  0.19903675]
fold 1
(385194, 345)
[0]	train-auc:0.693322	valid-auc:0.680544
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.700037	valid-auc:0.687988
[200]	train-auc:0.700823	valid-auc:0.688118
[300]	train-auc:0.701623	valid-auc:0.688279
[400]	train-auc:0.702404	valid-auc:0.688349
[500]	train-auc:0.703187	valid-

[1000]	train-auc:0.706755	valid-auc:0.695369
[1100]	train-auc:0.707606	valid-auc:0.695418
[1200]	train-auc:0.708499	valid-auc:0.695383
[1300]	train-auc:0.709373	valid-auc:0.695371
Stopping. Best iteration:
[1061]	train-auc:0.707269	valid-auc:0.695438

8 0.695348593841
[ 1.01451731  0.38997546  0.03274715 ...,  0.0912283  -0.06479736
 -1.0743525 ]
[ 0.71518785  0.5774399   0.49865022 ...,  0.51154876  0.47713596
  0.25447014]
fold 9
(385195, 345)
[0]	train-auc:0.691836	valid-auc:0.69
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 300 rounds.
[100]	train-auc:0.698889	valid-auc:0.695608
[200]	train-auc:0.699688	valid-auc:0.696183
[300]	train-auc:0.700487	valid-auc:0.696538
[400]	train-auc:0.701281	valid-auc:0.696862
[500]	train-auc:0.702058	valid-auc:0.696989
[600]	train-auc:0.702873	valid-auc:0.697149
[700]	train-auc:0.703735	valid-auc:0.697296
[800]	train-auc:0.704634	valid-auc:0.697382
[900]	train-auc:

In [21]:
print(scores)
print(np.mean(scores), np.std(scores))
roc_auc_score(X.target.values, y_oof.target.values)

[0.7087111901240899, 0.6886407508208614, 0.68382332890308606, 0.70283890397526994, 0.69181324583274528, 0.6951145215154142, 0.6912338421917712, 0.70541713065910994, 0.69534859384084058, 0.6973542896287005]
0.696029579749 0.00736706252798


0.69487836601595887

In [22]:
model_name = 'xgb_j1'
np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.828121
1,ac4b8244f3ae82df511b002257473c11,0.520657
2,483d8b91e49522c8a5bbe37f3872c749,0.702572
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.409549
4,fdbfba9842ff0bf86d600eb334c7c42b,0.421917


In [24]:
print(scores)
print(np.mean(scores), np.std(scores))
roc_auc_score(X.target.values, y_oof.target.values)

[0.70266956283189674, 0.68534367448460842, 0.67659762699023662, 0.69533404351538208, 0.68507270994204705, 0.68726859777613714, 0.68473580622851282, 0.70047125013335787, 0.68772433900540597, 0.69305728895477903]
0.689827489986 0.0075736412952


0.68967107272739692

In [57]:
model_name = 'lgb_e1'
import xgboost as xgb
import scipy.sparse as sp
from sklearn.feature_selection import SelectPercentile
def save_submit(model_name, folds, y_pred):
    global x_te
    sub = x_te[['uid','target']].copy()
    sub['target'] = y_pred
    sub.columns = ['cuid','target']
    sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
    sample_sub = sample_sub.merge(sub, on='cuid', how='left')
    sample_sub[['target']].to_csv(results_dir + model_name + '_' + str(folds) + 'folds.csv', header=False, index=False)
    del sub,sample_sub
    gc.collect()
    
def mean_encode_test(df, y, test,k,column):
    mean_0 = np.zeros((test.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)  
    y0s = df[['target',column]].groupby(column).agg(np.mean).reset_index()
    y0s.columns = [column,'target_mean']
    vc = df[column].value_counts().reset_index()
    vc.columns = [column,'counts']
    test = test.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
    test['mean_target'] = (test.target_mean * test.counts + k * m0)/(test.counts + k)
    mean_0 = np.array(test['mean_target']).reshape(-1,1)
    return mean_0    

def mean_encode_self(df, y, kf, k, column):
    mean_0 = np.zeros((y.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)
    for dev_index, val_index in kf: 
        dev_X, val_X = df.iloc[dev_index,:], df.iloc[val_index,:]
        y0s = dev_X[['target',column]].groupby(column).agg(np.mean).reset_index()
        y0s.columns = [column,'target_mean']
        vc = dev_X[column].value_counts().reset_index()
        vc.columns = [column,'counts']
        val_X = val_X.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
        val_X['mean_target'] = (val_X.target_mean * val_X.counts + k * m0)/(val_X.counts + k)
        mean_0[val_index,:] = np.array(val_X['mean_target']).reshape(-1,1)       
    return mean_0

def make_agg_features(X, train_index, test_index, test_data):
    te_cols = ['most_freq_cat']
    kf = KFold(n_splits = 5, random_state=2018, shuffle=True)
    for c in te_cols:
        X.loc[test_index,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), X.loc[test_index,:].copy(), 10.0, c)
        test_data.loc[:,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), test_data.copy(), 10.0, c)
        X.loc[train_index,c + '_te'] = mean_encode_self(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), kf.split(X.loc[train_index,:]), 10.0, c)
    return X.loc[train_index,:], X.loc[test_index,:], test_data
    
train_cols = ['sess_keys_mean','sess_keys_max','diff_key1_mean','diff_key1_max','diff_key2_mean',
              'diff_key2_max','diff_key3_mean','diff_key3_max','quot_key1_mean','quot_key1_max',
              'quot_key2_mean','quot_key2_max','quot_key3_mean','quot_key3_max',
              'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
              'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
              'records', 'max_days', 'min_days', 'sum_values_f1_max',
              'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
              'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
              'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
              'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
              'mean_day_cntr', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
              'nuniq_keys_f1.2', 'sumval_keys_f1', 'sumval_keys_f1.1',
              'sumval_keys_f1.2', 'most_freq_cat_te', 'diff_num_cats', 'unique_days','max_f1','max_f2','max_f3',
              'svd_description_1','svd_description_2','svd_description_3','svd_description_4','svd_description_5',
              'svd_description_6','svd_description_7','svd_description_8','svd_description_9','svd_description_10',
              'nnet4','nnet5','nnet6','nnet7','nnet10','nnet11','xgb_single','nnet12','nnet13','lgbm1','nnet3','nnet1',
              'vw','ftrl_50','ftrl','vw2','sgd1',
              'nnet8','lgbmb','most_freq_cat_te'] + ['svd_title_'+str(i+1) for i in range(10)]

#,'most_freq_cat_te','nnet4','nnet5','nnet6','nnet7','nnet10','nnet11','nnet8',

# Train the model
parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 16,
    'max_depth' : 4,
    #'min_data' : 30,
    #'lambda_l2' : 15.5,
    #'min_sum_hessian_in_leaf' : 0.5,
    'lambda_l1' : 5.2,
    'is_unbalance': True,
    'learning_rate': 0.005,
    'feature_fraction': 0.7,
    'verbose': 0
}

kf = KFold(n_splits=10, shuffle=True, random_state=239)

ifold = 0

y_pred = 0
y_oof = X[['uid','target']].copy()
y_oof['target'] = np.nan

scores = []

train_mat = sp.hstack([train_mat1,train_mat2,train_mat3]).tocsr().astype(np.bool).astype(np.float32)
test_mat = sp.hstack([test_mat1,test_mat2,test_mat3]).tocsr().astype(np.bool).astype(np.float32)

for train_index,test_index in kf.split(X):
    print('fold', ifold)
       
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    X_tr,X_va,X_te = make_agg_features(X,train_index,test_index,x_te)
    X_tr = X_tr[train_cols]
    X_va = X_va[train_cols]
    X_te = X_te[train_cols]
    
    yy = y_tr
    ssp = SelectPercentile(percentile=0.1)  
    ssp.fit(train_mat[train_index], yy)
    sp_train_mat = ssp.transform(train_mat[train_index])
    sp_val_mat = ssp.transform(train_mat[test_index])
    sp_test_mat = ssp.transform(test_mat)   
    
    print('prepare train')
    X_tr = sp.hstack([
        X_tr, sp_train_mat
    ]).tocsr()
    print(X_tr.shape)
    print('prepare valid')
    X_va = sp.hstack([
        X_va, sp_val_mat
    ]).tocsr()    
    print('prepare test')
    X_te = sp.hstack([
        X_te, sp_test_mat
    ]).tocsr()     

    # Create the LightGBM data containers
    tr_data = lgb.Dataset(X_tr, label=y_tr) #, categorical_feature=cate_cols
    va_data = lgb.Dataset(X_va, label=y_va) #, categorical_feature=cate_cols

    model = lgb.train(parameters,
                      tr_data,
                      valid_sets=[tr_data,va_data],
                      num_boost_round=8000,
                      early_stopping_rounds=300,
                      verbose_eval=100)
    
    yhat = model.predict(X_va, model.best_iteration)
    scores.append(roc_auc_score(y_va,yhat))
    print(ifold,roc_auc_score(y_va,yhat))
    y_oof.loc[test_index,'target'] = yhat

    print('prepare test')
    
    ytst = model.predict(X_te, model.best_iteration)
    print(ytst)
    y_pred += minmax_scale(ytst)*0.1
    
    del X_tr,X_va,tr_data,va_data, sp_train_mat, sp_val_mat, sp_test_mat
    gc.collect()    
    
    save_submit('lgb_q', ifold, y_pred)

    ifold += 1     
print(scores)
print(np.mean(scores), np.std(scores))    

np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)
sample_sub.head()

fold 0
prepare train
(385194, 302)
prepare valid
prepare test
Training until validation scores don't improve for 300 rounds.
[100]	training's auc: 0.686688	valid_1's auc: 0.697801
[200]	training's auc: 0.688095	valid_1's auc: 0.699308
[300]	training's auc: 0.688965	valid_1's auc: 0.699675
[400]	training's auc: 0.689871	valid_1's auc: 0.69988
[500]	training's auc: 0.690854	valid_1's auc: 0.699993
[600]	training's auc: 0.691808	valid_1's auc: 0.700054
[700]	training's auc: 0.692743	valid_1's auc: 0.700072
[800]	training's auc: 0.693726	valid_1's auc: 0.700011
[900]	training's auc: 0.694608	valid_1's auc: 0.699987
Early stopping, best iteration is:
[648]	training's auc: 0.692249	valid_1's auc: 0.700121
0 0.700120848549
prepare test
[ 0.68733739  0.66464567  0.56854747 ...,  0.49842045  0.53676299
  0.28347787]
fold 1


KeyboardInterrupt: 

In [24]:
model_name = 'lgb_b2'
import xgboost as xgb
import scipy.sparse as sp
from sklearn.feature_selection import SelectPercentile
def save_submit(model_name, folds, y_pred):
    global x_te
    sub = x_te[['uid','target']].copy()
    sub['target'] = y_pred
    sub.columns = ['cuid','target']
    sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
    sample_sub = sample_sub.merge(sub, on='cuid', how='left')
    sample_sub[['target']].to_csv(results_dir + model_name + '_' + str(folds) + 'folds.csv', header=False, index=False)
    del sub,sample_sub
    gc.collect()
    
def mean_encode_test(df, y, test,k,column):
    mean_0 = np.zeros((test.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)  
    y0s = df[['target',column]].groupby(column).agg(np.mean).reset_index()
    y0s.columns = [column,'target_mean']
    vc = df[column].value_counts().reset_index()
    vc.columns = [column,'counts']
    test = test.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
    test['mean_target'] = (test.target_mean * test.counts + k * m0)/(test.counts + k)
    mean_0 = np.array(test['mean_target']).reshape(-1,1)
    return mean_0    

def mean_encode_self(df, y, kf, k, column):
    mean_0 = np.zeros((y.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)
    for dev_index, val_index in kf: 
        dev_X, val_X = df.iloc[dev_index,:], df.iloc[val_index,:]
        y0s = dev_X[['target',column]].groupby(column).agg(np.mean).reset_index()
        y0s.columns = [column,'target_mean']
        vc = dev_X[column].value_counts().reset_index()
        vc.columns = [column,'counts']
        val_X = val_X.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
        val_X['mean_target'] = (val_X.target_mean * val_X.counts + k * m0)/(val_X.counts + k)
        mean_0[val_index,:] = np.array(val_X['mean_target']).reshape(-1,1)       
    return mean_0

def make_agg_features(X, train_index, test_index, test_data):
    te_cols = ['most_freq_cat']
    kf = KFold(n_splits = 5, random_state=2018, shuffle=True)
    for c in te_cols:
        X.loc[test_index,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), X.loc[test_index,:].copy(), 10.0, c)
        test_data.loc[:,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), test_data.copy(), 10.0, c)
        X.loc[train_index,c + '_te'] = mean_encode_self(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), kf.split(X.loc[train_index,:]), 10.0, c)
    return X.loc[train_index,:], X.loc[test_index,:], test_data
    
train_cols = ['sess_keys_mean','sess_keys_max','diff_key1_mean','diff_key1_max','diff_key2_mean',
              'diff_key2_max','diff_key3_mean','diff_key3_max','quot_key1_mean','quot_key1_max',
              'quot_key2_mean','quot_key2_max','quot_key3_mean','quot_key3_max',
              'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
              'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
              'records', 'max_days', 'min_days', 'sum_values_f1_max',
              'num_keys_f1_max', 'sum_values_f2_max', 'num_keys_f2_max',
              'sum_values_f3_max', 'num_keys_f3_max', 'sum_values_f1_mean',
              'num_keys_f1_mean', 'sum_values_f2_mean', 'num_keys_f2_mean',
              'sum_values_f3_mean', 'num_keys_f3_mean', 'max_day_cntr',
              'mean_day_cntr', 'nuniq_keys_f1', 'nuniq_keys_f1.1',
              'nuniq_keys_f1.2', 'sumval_keys_f1', 'sumval_keys_f1.1',
              'sumval_keys_f1.2', 'most_freq_cat_te', 'diff_num_cats', 'unique_days','max_f1','max_f2','max_f3',
              'svd_description_1','svd_description_2','svd_description_3','svd_description_4','svd_description_5',
              'svd_description_6','svd_description_7','svd_description_8','svd_description_9','svd_description_10',
              'nnet4','nnet5','nnet6','nnet7','nnet10','nnet11','xgb_single','nnet12','nnet13','lgbm1','nnet3','nnet1',
              'nnet8','lgbmb','most_freq_cat_te'] + ['svd_title_'+str(i+1) for i in range(10)]

# Train the model
parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 16,
    'max_depth' : 4,
    #'min_data' : 100,
    #'lambda_l2' : 15.5,
    'min_sum_hessian_in_leaf' : 0.2,
    'lambda_l1' : 6.2,
    'is_unbalance': True,
    'learning_rate': 0.001,
    'feature_fraction': 0.7,
    'verbose': 0
}

kf = KFold(n_splits=10, shuffle=True, random_state=13)

ifold = 0

y_pred = 0
y_oof = X[['uid','target']].copy()
y_oof['target'] = np.nan

scores = []

train_mat = sp.hstack([train_mat1,train_mat2,train_mat3]).tocsr()
test_mat = sp.hstack([test_mat1,test_mat2,test_mat3]).tocsr()
mat_spca = np.load(data_dir + 'spca_dim100.npy')
train_mat_spca = mat_spca[df_train_index.tolist()]
test_mat_spca = mat_spca[df_test_index.tolist()]
del mat_spca

for train_index,test_index in kf.split(X):
    print('fold', ifold)
       
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    X_tr,X_va,X_te = make_agg_features(X,train_index,test_index,x_te)
    X_tr = X_tr[train_cols]
    X_va = X_va[train_cols]
    X_te = X_te[train_cols]
    
    yy = y_tr
    ssp = SelectPercentile(percentile=0.1)  
    ssp.fit(train_mat[train_index], yy)
    sp_train_mat = ssp.transform(train_mat[train_index])
    sp_val_mat = ssp.transform(train_mat[test_index])
    sp_test_mat = ssp.transform(test_mat)   
    
    print('prepare train')
    X_tr = sp.hstack([
        X_tr, sp_train_mat, train_mat_spca[train_index]
    ]).tocsr()
    print(X_tr.shape)
    print('prepare valid')
    X_va = sp.hstack([
        X_va, sp_val_mat, train_mat_spca[test_index]
    ]).tocsr()    
    print('prepare test')
    X_te = sp.hstack([
        X_te, sp_test_mat, test_mat_spca
    ]).tocsr()     

    # Create the LightGBM data containers
    tr_data = lgb.Dataset(X_tr, label=y_tr) #, categorical_feature=cate_cols
    va_data = lgb.Dataset(X_va, label=y_va) #, categorical_feature=cate_cols

    model = lgb.train(parameters,
                      tr_data,
                      valid_sets=[tr_data,va_data],
                      num_boost_round=8000,
                      early_stopping_rounds=300,
                      verbose_eval=100)
    
    yhat = model.predict(X_va, model.best_iteration)
    scores.append(roc_auc_score(y_va,yhat))
    print(ifold,roc_auc_score(y_va,yhat))
    y_oof.loc[test_index,'target'] = yhat

    print('prepare test')
    
    ytst = model.predict(X_te, model.best_iteration)
    y_pred += ytst*0.1
    
    del X_tr,X_va,tr_data,va_data, sp_train_mat, sp_val_mat, sp_test_mat
    gc.collect()    
    
    save_submit('lgb_q', ifold, y_pred)

    ifold += 1     
print(scores)
print(np.mean(scores), np.std(scores))    

np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)
sample_sub.head()

fold 0
prepare train
(385194, 410)
prepare valid
prepare test
Training until validation scores don't improve for 300 rounds.
[100]	training's auc: 0.688782	valid_1's auc: 0.680434
[200]	training's auc: 0.689369	valid_1's auc: 0.680718
[300]	training's auc: 0.689748	valid_1's auc: 0.681187
[400]	training's auc: 0.690082	valid_1's auc: 0.681541
[500]	training's auc: 0.690511	valid_1's auc: 0.681837
[600]	training's auc: 0.690828	valid_1's auc: 0.682075
[700]	training's auc: 0.691088	valid_1's auc: 0.682264
[800]	training's auc: 0.691347	valid_1's auc: 0.682386
[900]	training's auc: 0.691583	valid_1's auc: 0.682475
[1000]	training's auc: 0.691845	valid_1's auc: 0.682563
[1100]	training's auc: 0.692089	valid_1's auc: 0.682652
[1200]	training's auc: 0.692325	valid_1's auc: 0.68269
[1300]	training's auc: 0.692538	valid_1's auc: 0.682712
[1400]	training's auc: 0.692752	valid_1's auc: 0.682727
[1500]	training's auc: 0.692951	valid_1's auc: 0.68275
[1600]	training's auc: 0.693164	valid_1's auc:

[2500]	training's auc: 0.694809	valid_1's auc: 0.685585
[2600]	training's auc: 0.695072	valid_1's auc: 0.685611
[2700]	training's auc: 0.695337	valid_1's auc: 0.685642
[2800]	training's auc: 0.6956	valid_1's auc: 0.685645
[2900]	training's auc: 0.695853	valid_1's auc: 0.685642
[3000]	training's auc: 0.696117	valid_1's auc: 0.685645
[3100]	training's auc: 0.696391	valid_1's auc: 0.685631
[3200]	training's auc: 0.696669	valid_1's auc: 0.685628
Early stopping, best iteration is:
[2959]	training's auc: 0.696009	valid_1's auc: 0.68565
4 0.685650472164
prepare test
fold 5
prepare train
(385195, 410)
prepare valid
prepare test
Training until validation scores don't improve for 300 rounds.
[100]	training's auc: 0.68827	valid_1's auc: 0.68582
[200]	training's auc: 0.688599	valid_1's auc: 0.686177
[300]	training's auc: 0.689037	valid_1's auc: 0.686457
[400]	training's auc: 0.689382	valid_1's auc: 0.686671
[500]	training's auc: 0.689775	valid_1's auc: 0.68678
[600]	training's auc: 0.690088	valid_

Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.727645
1,ac4b8244f3ae82df511b002257473c11,0.578061
2,483d8b91e49522c8a5bbe37f3872c749,0.673377
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.476768
4,fdbfba9842ff0bf86d600eb334c7c42b,0.444018


In [25]:
sample_sub.head()

Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.727645
1,ac4b8244f3ae82df511b002257473c11,0.578061
2,483d8b91e49522c8a5bbe37f3872c749,0.673377
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.476768
4,fdbfba9842ff0bf86d600eb334c7c42b,0.444018


In [26]:
X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_xgb_b1.npy'))
sample_sub.columns = ['uid','xgb_sess4']
x_te = x_te.merge(sample_sub, on='uid', how='left')

sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_lgb_b1.npy'))
sample_sub.columns = ['uid','lgb_sess6']
x_te = x_te.merge(sample_sub, on='uid', how='left')

sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_lgb_b2.npy'))
sample_sub.columns = ['uid','lgb_sess7']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [27]:
X['xgb_sess4'] = minmax_scale(np.load(results_dir + 'train_xgb_b1.npy'))
X['lgb_sess6'] = minmax_scale(np.load(results_dir + 'train_lgb_b1.npy'))
X['lgb_sess7'] = minmax_scale(np.load(results_dir + 'train_lgb_b2.npy'))

In [35]:
meta = ['nnet12','nnet13','lgbm1','lgbmb','nnet1','nnet3','xgb_single'] #
for f in meta:
    print(f,roc_auc_score(X.target, X[f]))

nnet12 0.67274153339
nnet13 0.67264163497
lgbm1 0.682562603224
lgbmb 0.66792997872
nnet1 0.623698784984
nnet3 0.574520607881
xgb_single 0.660175309563


In [36]:
X[meta].corr()

Unnamed: 0,nnet12,nnet13,lgbm1,lgbmb,nnet1,nnet3,xgb_single
nnet12,1.0,0.815829,0.736898,0.844497,0.612611,0.445006,0.825464
nnet13,0.815829,1.0,0.853592,0.710129,0.463131,0.288266,0.687111
lgbm1,0.736898,0.853592,1.0,0.752149,0.494446,0.295207,0.698921
lgbmb,0.844497,0.710129,0.752149,1.0,0.66878,0.501951,0.907113
nnet1,0.612611,0.463131,0.494446,0.66878,1.0,0.487426,0.661838
nnet3,0.445006,0.288266,0.295207,0.501951,0.487426,1.0,0.490909
xgb_single,0.825464,0.687111,0.698921,0.907113,0.661838,0.490909,1.0


In [37]:
x_te[meta].corr()

Unnamed: 0,nnet12,nnet13,lgbm1,lgbmb,nnet1,nnet3,xgb_single
nnet12,1.0,0.872521,0.793141,0.837708,0.643211,0.436315,0.821308
nnet13,0.872521,1.0,0.871866,0.713111,0.511538,0.316164,0.68175
lgbm1,0.793141,0.871866,1.0,0.778259,0.536187,0.342257,0.72694
lgbmb,0.837708,0.713111,0.778259,1.0,0.641589,0.466649,0.933161
nnet1,0.643211,0.511538,0.536187,0.641589,1.0,0.488963,0.639809
nnet3,0.436315,0.316164,0.342257,0.466649,0.488963,1.0,0.456628
xgb_single,0.821308,0.68175,0.72694,0.933161,0.639809,0.456628,1.0


In [33]:
import xgboost as xgb
import scipy.sparse as sp
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectPercentile
def save_submit(model_name, folds, y_pred):
    global x_te
    sub = x_te[['uid','target']].copy()
    sub['target'] = y_pred
    sub.columns = ['cuid','target']
    sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
    sample_sub = sample_sub.merge(sub, on='cuid', how='left')
    sample_sub[['target']].to_csv(results_dir + model_name + '_' + str(folds) + 'folds.csv', header=False, index=False)
    del sub,sample_sub
    gc.collect()
    
def mean_encode_test(df, y, test,k,column):
    mean_0 = np.zeros((test.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)  
    y0s = df[['target',column]].groupby(column).agg(np.mean).reset_index()
    y0s.columns = [column,'target_mean']
    vc = df[column].value_counts().reset_index()
    vc.columns = [column,'counts']
    test = test.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
    test['mean_target'] = (test.target_mean * test.counts + k * m0)/(test.counts + k)
    mean_0 = np.array(test['mean_target']).reshape(-1,1)
    return mean_0    

def mean_encode_self(df, y, kf, k, column):
    mean_0 = np.zeros((y.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)
    for dev_index, val_index in kf: 
        dev_X, val_X = df.iloc[dev_index,:], df.iloc[val_index,:]
        y0s = dev_X[['target',column]].groupby(column).agg(np.mean).reset_index()
        y0s.columns = [column,'target_mean']
        vc = dev_X[column].value_counts().reset_index()
        vc.columns = [column,'counts']
        val_X = val_X.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
        val_X['mean_target'] = (val_X.target_mean * val_X.counts + k * m0)/(val_X.counts + k)
        mean_0[val_index,:] = np.array(val_X['mean_target']).reshape(-1,1)       
    return mean_0

def make_agg_features(X, train_index, test_index, test_data):
    te_cols = ['most_freq_cat']
    kf = KFold(n_splits = 5, random_state=2018, shuffle=True)
    for c in te_cols:
        X.loc[test_index,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), X.loc[test_index,:].copy(), 10.0, c)
        test_data.loc[:,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), test_data.copy(), 10.0, c)
        X.loc[train_index,c + '_te'] = mean_encode_self(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), kf.split(X.loc[train_index,:]), 10.0, c)
    return X.loc[train_index,:], X.loc[test_index,:], test_data
    
train_cols = meta

# Train the model
parameters = {
    'booster' : 'gbtree',
    'n_estimators':20000,
    'max_depth':4,
    'objective':"binary:logistic",
    'eval_metric':'auc',
    'learning_rate':0.005, 
    'subsample':.6,
    'min_child_weight':10,
    'colsample_bytree':.6,
    'scale_pos_weight': 19,
    'gamma':1,
    #'reg_alpha':1,
    'reg_lambda':1.3,
}

kf = KFold(n_splits=10, shuffle=True, random_state=239)

ifold = 0

y_pred = 0
y_oof = X[['uid','target']].copy()
y_oof['target'] = np.nan

scores = []

for train_index,test_index in kf.split(X):
    print('fold', ifold)
       
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    X_tr,X_va,X_te = make_agg_features(X,train_index,test_index,x_te)

    
    X_tr = X_tr[train_cols].fillna(0).values
    X_va = X_va[train_cols].fillna(0).values
    X_te = X_te[train_cols].fillna(0).values
    
       
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_tr)
    X_tr = scaler.transform(X_tr)
    X_va = scaler.transform(X_va)
    X_te = scaler.transform(X_te)
    del scaler

    y_hat_med = pd.DataFrame(X_va).mean(axis=1)
    #print(ifold,'median:',roc_auc_score(y_va,y_hat_med))
    
    
    # Create the LightGBM data containers
    model = Ridge(alpha=20)
    model.fit(X_tr,y_tr)
    
    yhat = model.predict(X_va)
    scores.append(roc_auc_score(y_va,yhat))
    print(ifold,roc_auc_score(y_va,yhat))
    y_oof.loc[test_index,'target'] = yhat

    print('prepare test')
    
    #
    ytst = pd.DataFrame(X_te).mean(axis=1)
    ytst = model.predict(X_te)
    print(minmax_scale(ytst))
    y_pred += minmax_scale(ytst)*0.1
    
    del X_tr,X_va,X_te
    gc.collect()    
    
    save_submit('xgb_q', ifold, y_pred)

    ifold += 1    

fold 0
0 0.698668673061
prepare test
[ 0.14689715  0.14794698  0.10039477 ...,  0.07865642  0.0795201
  0.04324515]
fold 1
1 0.680282222997
prepare test
[ 0.14906741  0.14968694  0.10206368 ...,  0.08114234  0.08153055
  0.04605674]
fold 2
2 0.672018064269
prepare test
[ 0.14804478  0.14921921  0.10076937 ...,  0.07930363  0.07985425
  0.04391066]
fold 3
3 0.688242992587
prepare test
[ 0.14820545  0.1487872   0.10010541 ...,  0.07902909  0.07945207
  0.04392936]
fold 4
4 0.681608416947
prepare test
[ 0.14675651  0.14805475  0.10015005 ...,  0.07874264  0.07952918
  0.0439385 ]
fold 5
5 0.683953307708
prepare test
[ 0.14593103  0.14723824  0.09932246 ...,  0.07791339  0.07879705
  0.04330436]
fold 6
6 0.678530035578
prepare test
[ 0.15262749  0.15368207  0.10530478 ...,  0.08421273  0.08437358
  0.04908429]
fold 7
7 0.694977085688
prepare test
[ 0.14813504  0.14974     0.10206602 ...,  0.08021212  0.08106084
  0.04498688]
fold 8
8 0.683650882628
prepare test
[ 0.14763976  0.14892431  0.

In [34]:
print(scores)
print(np.mean(scores), np.std(scores))
roc_auc_score(X.target.values, y_oof.target.values)

[0.69866867306089697, 0.68028222299680186, 0.6720180642689908, 0.6882429925869018, 0.6816084169472465, 0.68395330770756191, 0.67853003557793867, 0.69497708568816041, 0.68365088262772289, 0.69192788633314306]
0.68538595678 0.00768417125345


0.68522457782906487

In [28]:
model_name = 'ridge_e1'
np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.246518
1,ac4b8244f3ae82df511b002257473c11,0.106445
2,483d8b91e49522c8a5bbe37f3872c749,0.165956
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.074222
4,fdbfba9842ff0bf86d600eb334c7c42b,0.064321


In [44]:
s = pd.DataFrame()
s['s7463'] = minmax_scale(pd.read_csv(results_dir + 'ridge_b1.csv', header=None, names=['v']).v.values)
s['s7412'] = minmax_scale(pd.read_csv(results_dir + 'baseline_sparse_10folds.csv', header=None, names=['v']).v.values)

In [45]:
s.head()

Unnamed: 0,s7463,s7412
0,0.811052,0.184876
1,0.570673,0.096343
2,0.726611,0.151491
3,0.429136,0.055863
4,0.389015,0.051491


In [40]:
len(sample_sub)

181024