In [3]:
data_dir = './data/mlboot_dataset/'
model_name = 'nt5'
results_dir = './results/'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp
import gc

In [2]:
df = pd.read_csv(data_dir + 'preprocessed.csv') 
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil()
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil()
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil()
print(mat1.shape, mat2.shape, mat3.shape)

df['max_f1'] = mat1.tocsr().max(axis=1).todense()
df['max_f2'] = mat2.tocsr().max(axis=1).todense()
df['max_f3'] = mat3.tocsr().max(axis=1).todense()

limit = 4
mat1 = mat1.tocsc()[:, np.where(mat1.getnnz(axis=0) > limit)[0]].tocsr()
mat2 = mat2.tocsc()[:, np.where(mat2.getnnz(axis=0) > limit)[0]].tocsr()
mat3 = mat3.tocsc()[:, np.where(mat3.getnnz(axis=0) > limit)[0]].tocsr()
print(mat1.shape, mat2.shape, mat3.shape)

df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

train_mat1 = mat1[df_train_index.tolist()]
test_mat1 = mat1[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]

del mat1,mat2,mat3
gc.collect()

(609018, 2053602) (609018, 2812610) (609018, 1057788)
(609018, 598456) (609018, 20275) (609018, 92738)


21

In [4]:
df['most_freq_cat'] = np.argmax(df[[u'num_times_cat_eq_0', u'num_times_cat_eq_1',
       u'num_times_cat_eq_2', u'num_times_cat_eq_3', u'num_times_cat_eq_4',
       u'num_times_cat_eq_5']].fillna(0).values, axis=1)

In [5]:
X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [6]:
from sklearn.linear_model import Ridge,SGDRegressor
kf = KFold(n_splits=10, shuffle=True, random_state=239)

ifold = 0

y_ridge_pred1 = 0
y_ridge_oof1 = X[['uid','target']].copy()
y_ridge_oof1['target'] = np.nan

for train_index,test_index in kf.split(X):
    print('fold', ifold)
    
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    X_tr = train_mat2[train_index]
    X_va = train_mat2[test_index]
    X_te = test_mat2
    
    model = Ridge(max_iter=12)
    model.fit(X_tr,y_tr)
    
    yhat = model.predict(X_va)
    print(ifold,roc_auc_score(y_va,yhat))
    y_ridge_oof1.loc[test_index,'target'] = yhat

    ytst = model.predict(X_te)
    y_ridge_pred1 += ytst*0.1
    
    ifold += 1
X['ridge2'] = y_ridge_oof1.target.values
x_te['ridge2'] = y_ridge_pred1

fold 0




0 0.626105242626
fold 1
1 0.609323199151
fold 2
2 0.608542823781
fold 3
3 0.616122329507
fold 4
4 0.61329483061
fold 5
5 0.626851899506
fold 6
6 0.619582901644
fold 7
7 0.617508361161
fold 8
8 0.624439639139
fold 9
9 0.615391426295


In [7]:
X['diff_num_cats'] = (X['num_times_cat_eq_0']>0).astype(np.int32)+(X['num_times_cat_eq_1']>0).astype(np.int32)+\
(X['num_times_cat_eq_2']>0).astype(np.int32)+(X['num_times_cat_eq_3']>0).astype(np.int32)+\
(X['num_times_cat_eq_4']>0).astype(np.int32)+(X['num_times_cat_eq_5']>0).astype(np.int32)

In [8]:
x_te['diff_num_cats'] = (x_te['num_times_cat_eq_0']>0).astype(np.int32)+(x_te['num_times_cat_eq_1']>0).astype(np.int32)+\
(x_te['num_times_cat_eq_2']>0).astype(np.int32)+(x_te['num_times_cat_eq_3']>0).astype(np.int32)+\
(x_te['num_times_cat_eq_4']>0).astype(np.int32)+(x_te['num_times_cat_eq_5']>0).astype(np.int32)

In [9]:
from sklearn.preprocessing import minmax_scale
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_nn_base_model.npy'))
sample_sub.columns = ['uid','nnet']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [10]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_baseline_sparse_10folds.npy'))
sample_sub.columns = ['uid','lgbm1']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [11]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_all_in_focal_loss.npy'))
sample_sub.columns = ['uid','nnet2']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [12]:
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_focal_loss_m3.npy'))
sample_sub.columns = ['uid','nnet3']
x_te = x_te.merge(sample_sub, on='uid', how='left')
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sample_sub['target'] = minmax_scale(np.load(results_dir + 'test_focal_loss_m1.npy'))
sample_sub.columns = ['uid','nnet1']
x_te = x_te.merge(sample_sub, on='uid', how='left')

In [13]:
X['lgbm1'] = minmax_scale(np.load(results_dir + 'train_baseline_sparse_10folds.npy'))
X['nnet'] = minmax_scale(np.load(results_dir + 'train_nn_base_model.npy'))
X['nnet2'] = minmax_scale(np.load(results_dir + 'train_all_in_focal_loss.npy'))
X['nnet1'] = minmax_scale(np.load(results_dir + 'train_focal_loss_m1.npy'))
X['nnet3'] = minmax_scale(np.load(results_dir + 'train_focal_loss_m3.npy'))

In [14]:
x_te[['nnet','lgbm1','nnet2','nnet1','nnet3']].corr()

Unnamed: 0,nnet,lgbm1,nnet2,nnet1,nnet3
nnet,1.0,0.681589,0.75349,0.488332,0.330658
lgbm1,0.681589,1.0,0.593723,0.536187,0.342257
nnet2,0.75349,0.593723,1.0,0.582909,0.353475
nnet1,0.488332,0.536187,0.582909,1.0,0.488963
nnet3,0.330658,0.342257,0.353475,0.488963,1.0


In [15]:
X[['lgbm1','nnet','nnet2','nnet1','nnet3','target']].corr()

Unnamed: 0,lgbm1,nnet,nnet2,nnet1,nnet3,target
lgbm1,1.0,0.66497,0.570985,0.494446,0.295207,0.170409
nnet,0.66497,1.0,0.659126,0.460005,0.289962,0.130658
nnet2,0.570985,0.659126,1.0,0.548694,0.312146,0.111062
nnet1,0.494446,0.460005,0.548694,1.0,0.487426,0.09123
nnet3,0.295207,0.289962,0.312146,0.487426,1.0,0.049012
target,0.170409,0.130658,0.111062,0.09123,0.049012,1.0


In [16]:
for f in ['lgbm1','nnet','nnet2','nnet1','nnet3']:
    print(roc_auc_score(X.target, X[f]))

0.682562603224
0.648537598744
0.649222662589
0.623698784984
0.574520607881


In [17]:
X['ridge1'] = np.load(results_dir + 'train_ridge1.npy')
x_te['ridge1'] = np.load(results_dir + 'test_ridge1.npy')
X['ridge3'] = np.load(results_dir + 'train_ridge3.npy')
x_te['ridge3'] = np.load(results_dir + 'test_ridge3.npy')

In [18]:
def save_submit(model_name, folds, y_pred):
    global x_te
    sub = x_te[['uid','target']].copy()
    sub['target'] = y_pred
    sub.columns = ['cuid','target']
    sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
    sample_sub = sample_sub.merge(sub, on='cuid', how='left')
    sample_sub[['target']].to_csv(results_dir + model_name + '_' + str(folds) + 'folds.csv', header=False, index=False)
    del sub,sample_sub
    gc.collect()
    
def mean_encode_test(df, y, test,k,column):
    mean_0 = np.zeros((test.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)  
    y0s = df[['target',column]].groupby(column).agg(np.mean).reset_index()
    y0s.columns = [column,'target_mean']
    vc = df[column].value_counts().reset_index()
    vc.columns = [column,'counts']
    test = test.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
    test['mean_target'] = (test.target_mean * test.counts + k * m0)/(test.counts + k)
    mean_0 = np.array(test['mean_target']).reshape(-1,1)
    return mean_0    

def mean_encode_self(df, y, kf, k, column):
    mean_0 = np.zeros((y.shape[0],1))
    df['target'] = y
    m0 = np.mean(y)
    for dev_index, val_index in kf: 
        dev_X, val_X = df.iloc[dev_index,:], df.iloc[val_index,:]
        y0s = dev_X[['target',column]].groupby(column).agg(np.mean).reset_index()
        y0s.columns = [column,'target_mean']
        vc = dev_X[column].value_counts().reset_index()
        vc.columns = [column,'counts']
        val_X = val_X.merge(y0s, on = column,how= 'left').merge(vc, on = column,how= 'left')
        val_X['mean_target'] = (val_X.target_mean * val_X.counts + k * m0)/(val_X.counts + k)
        mean_0[val_index,:] = np.array(val_X['mean_target']).reshape(-1,1)       
    return mean_0

def make_agg_features(X, train_index, test_index, test_data):
    te_cols = ['most_freq_cat']
    kf = KFold(n_splits = 5, random_state=2018, shuffle=True)
    for c in te_cols:
        X.loc[test_index,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), X.loc[test_index,:].copy(), 10.0, c)
        test_data.loc[:,c + '_te'] = mean_encode_test(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), test_data.copy(), 10.0, c)
        X.loc[train_index,c + '_te'] = mean_encode_self(X.loc[train_index,:].copy(), X.loc[train_index,'target'].copy(), kf.split(X.loc[train_index,:]), 10.0, c)
    return X.loc[train_index,:], X.loc[test_index,:], test_data
    
train_cols = [u'num_keys_f1', 'max_f1', 'max_f2', 'max_f3', 'diff_num_cats',
       u'sum_values_f1', u'num_keys_f2', u'sum_values_f2', u'num_keys_f3',
       u'sum_values_f3', u'num_times_cat_eq_0', u'num_times_cat_eq_1',
       u'num_times_cat_eq_2', u'num_times_cat_eq_3', u'num_times_cat_eq_4',
       u'num_times_cat_eq_5', u'records', u'max_days', u'min_days',
       u'sum_values_f1_std', u'num_keys_f1_std', u'sum_values_f2_std',
       u'num_keys_f2_std', u'sum_values_f3_std', u'num_keys_f3_std',
       u'sum_values_f1_max', u'num_keys_f1_max', u'sum_values_f2_max',
       u'num_keys_f2_max', u'sum_values_f3_max', u'num_keys_f3_max',
       u'sum_values_f1_mean', u'num_keys_f1_mean', u'sum_values_f2_mean',
       u'num_keys_f2_mean', u'sum_values_f3_mean', u'num_keys_f3_mean', 
       'nnet','lgbm1','nnet2','nnet1','nnet3','most_freq_cat_te',
       'ridge1','ridge3','ridge2']


# Train the model
parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 16,
    'max_depth' : 4,
    'min_data' : 30,
    #'lambda_l2' : 1.5,
    'min_sum_hessian_in_leaf' : 0.1,
    #'lambda_l1' : 0.2,
    'is_unbalance': True,
    'learning_rate': 0.01,
    'feature_fraction': 0.7,
    'verbose': 0
}

kf = KFold(n_splits=10, shuffle=True, random_state=239)

ifold = 0

y_pred = 0
y_oof = X[['uid','target']].copy()
y_oof['target'] = np.nan

scores = []

for train_index,test_index in kf.split(X):
    print('fold', ifold)
       
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    X_tr,X_va,X_te = make_agg_features(X,train_index,test_index,x_te)
    X_tr = X_tr[train_cols]
    X_va = X_va[train_cols]
    X_te = X_te[train_cols]
 

    print(ifold,'lgbm',roc_auc_score(y_va,X.loc[test_index, :].lgbm1))
    print(ifold,'nn',roc_auc_score(y_va,X.loc[test_index, :].nnet))
    print(ifold,'nn2',roc_auc_score(y_va,X.loc[test_index, :].nnet2))
    print(ifold,'nn1',roc_auc_score(y_va,X.loc[test_index, :].nnet1))
    print(ifold,'nn3',roc_auc_score(y_va,X.loc[test_index, :].nnet3))
       
    # Create the LightGBM data containers
    tr_data = lgb.Dataset(X_tr, label=y_tr) #, categorical_feature=cate_cols
    va_data = lgb.Dataset(X_va, label=y_va) #, categorical_feature=cate_cols

    model = lgb.train(parameters,
                      tr_data,
                      valid_sets=[tr_data,va_data],
                      num_boost_round=8000,
                      early_stopping_rounds=300,
                      verbose_eval=100)
    
    yhat = model.predict(X_va, model.best_iteration)
    scores.append(roc_auc_score(y_va,yhat))
    print(ifold,roc_auc_score(y_va,yhat))
    y_oof.loc[test_index,'target'] = yhat

    print('prepare test')
    
    ytst = model.predict(X_te, model.best_iteration)
    y_pred += ytst*0.1
    
    del X_tr,X_va,tr_data,va_data
    gc.collect()    
    
    save_submit('lgb_q', ifold, y_pred)

    ifold += 1    

fold 0
0 lgbm 0.696430153644
0 nn 0.664470244549
0 nn2 0.669799660145
0 nn1 0.62858143074
0 nn3 0.578370888983
Training until validation scores don't improve for 300 rounds.
[100]	training's auc: 0.690582	valid_1's auc: 0.702004
[200]	training's auc: 0.692581	valid_1's auc: 0.702801
[300]	training's auc: 0.694626	valid_1's auc: 0.703148
[400]	training's auc: 0.696758	valid_1's auc: 0.70314
[500]	training's auc: 0.698755	valid_1's auc: 0.702994
[600]	training's auc: 0.700608	valid_1's auc: 0.702776
Early stopping, best iteration is:
[318]	training's auc: 0.694971	valid_1's auc: 0.703193
0 0.703193226062
prepare test
fold 1
1 lgbm 0.679540369801
1 nn 0.642941694898
1 nn2 0.642705688137
1 nn1 0.614755310425
1 nn3 0.570227112295
Training until validation scores don't improve for 300 rounds.
[100]	training's auc: 0.692755	valid_1's auc: 0.681383
[200]	training's auc: 0.694785	valid_1's auc: 0.682428
[300]	training's auc: 0.696682	valid_1's auc: 0.682943
[400]	training's auc: 0.698707	valid_

In [19]:
print(scores)
print(np.mean(scores), np.std(scores))

[0.7031932260616095, 0.684053238540697, 0.67510996191806116, 0.69493612123727277, 0.68399008057849842, 0.68934470712390361, 0.68524272622684657, 0.69450546947435277, 0.68884208729959029, 0.69130655857097356]
0.689052417703 0.00727359537567


In [34]:
print(score)
print(np.mean(score), np.std(score))

[0.70101884559, 0.683251990687, 0.674342696529, 0.69439042338, 0.683629739895, 0.688567379887, 0.683618554598, 0.693767584936, 0.687560690599, 0.690429041143]
0.688057694724 0.00705220902941


In [43]:
model_name = 'nn_all_in_one3'
np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

In [44]:
sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.689239
1,ac4b8244f3ae82df511b002257473c11,0.61793
2,483d8b91e49522c8a5bbe37f3872c749,0.701102
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.505842
4,fdbfba9842ff0bf86d600eb334c7c42b,0.510307


In [45]:
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)

In [46]:
from sklearn.preprocessing import minmax_scale
a = pd.DataFrame()
a['sol0'] = minmax_scale(pd.read_csv(results_dir + 'nn_base_model.csv', header=None)[0].values)
a['sol1'] = minmax_scale(pd.read_csv(results_dir + 'new_try.csv', header=None)[0].values)
a['sol2'] = minmax_scale(pd.read_csv(results_dir + 'baseline_sparse_10folds.csv', header=None)[0].values)
a['sol3'] = minmax_scale(pd.read_csv(results_dir + 'nn_all_in_one.csv', header=None)[0].values)
a['sol4'] = minmax_scale(pd.read_csv(results_dir + 'nn_all_in_one2.csv', header=None)[0].values)
a.corr()

Unnamed: 0,sol0,sol1,sol2,sol3,sol4
sol0,1.0,0.719971,0.681589,0.71623,0.70922
sol1,0.719971,1.0,0.826783,0.982878,0.975515
sol2,0.681589,0.826783,1.0,0.813299,0.819038
sol3,0.71623,0.982878,0.813299,1.0,0.992501
sol4,0.70922,0.975515,0.819038,0.992501,1.0
