In [1]:
model_name = 'baseline_sparse'
data_dir = './data/mlboot_dataset/'
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp

In [8]:
df = pd.read_csv(data_dir + 'preprocessed.csv') 
q = pd.read_csv(data_dir + 'sessions.csv')
df = df.merge(q, on='uid', how='left')
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')
X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [2]:
df = pd.read_csv(data_dir + 'preprocessed.csv') 
q = pd.read_csv(data_dir + 'sessions.csv')
df = df.merge(q, on='uid', how='left')
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')
df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

In [3]:
df = pd.read_csv(data_dir + 'preprocessed.csv') 
q = pd.read_csv(data_dir + 'sessions.csv')
df = df.merge(q, on='uid', how='left')
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid','target']
df = df.merge(y, on='uid', how='left')

mat1 = sp.load_npz(data_dir+'dmat1.npz').tolil().astype(np.bool).astype(np.int8)
mat2 = sp.load_npz(data_dir+'dmat2.npz').tolil().astype(np.bool).astype(np.int8)
mat3 = sp.load_npz(data_dir+'dmat3.npz').tolil().astype(np.bool).astype(np.int8)

In [4]:
df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

X = df.loc[~df.target.isnull(),:].reset_index(drop=True)
x_te = df.loc[df.target.isnull(),:].reset_index(drop=True)

In [5]:
%%time
train_mat1 = mat1[df_train_index.tolist()]

CPU times: user 6.32 s, sys: 52.2 ms, total: 6.37 s
Wall time: 5.8 s


In [6]:
test_mat1 = mat1[df_test_index.tolist()]
train_mat2 = mat2[df_train_index.tolist()]
test_mat2 = mat2[df_test_index.tolist()]
train_mat3 = mat3[df_train_index.tolist()]
test_mat3 = mat3[df_test_index.tolist()]

In [9]:
train_cols = [u'num_keys_f1',
       u'sum_values_f1', u'num_keys_f2', u'sum_values_f2', u'num_keys_f3',
       u'sum_values_f3', u'num_times_cat_eq_0', u'num_times_cat_eq_1',
       u'num_times_cat_eq_2', u'num_times_cat_eq_3', u'num_times_cat_eq_4',
       u'num_times_cat_eq_5', u'records', u'max_days', u'min_days',
       u'sum_values_f1_std', u'num_keys_f1_std', u'sum_values_f2_std',
       u'num_keys_f2_std', u'sum_values_f3_std', u'num_keys_f3_std',
       u'sum_values_f1_max', u'num_keys_f1_max', u'sum_values_f2_max',
       u'num_keys_f2_max', u'sum_values_f3_max', u'num_keys_f3_max',
       u'sum_values_f1_mean', u'num_keys_f1_mean', u'sum_values_f2_mean',
       u'num_keys_f2_mean', u'sum_values_f3_mean', u'num_keys_f3_mean',
       'sess_keys_mean','sess_keys_max','diff_key1_mean','diff_key1_max','diff_key2_mean',
       'diff_key2_max','diff_key3_mean','diff_key3_max','quot_key1_mean','quot_key1_max',
       'quot_key2_mean','quot_key2_max','quot_key3_mean','quot_key3_max']

# Train the model
parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 128,
    'max_depth' : 12,
    #'min_data' : 30,
    #'lambda_l2' : 15,
    'min_sum_hessian_in_leaf' : 0.3,
    'lambda_l1' : 2.5,
    #'max_drop' : 1,
    'learning_rate': 0.01,
    'feature_fraction': 0.7,
    'verbose': 0
}

kf = KFold(n_splits=5, shuffle=True, random_state=1239)

ifold = 0

y_pred = 0
y_oof = X[['uid','target']].copy()
y_oof['target'] = np.nan

scores = []

for train_index,test_index in kf.split(X):       
    print('fold', ifold)
    
    y_tr,y_va = X.loc[train_index,'target'].values,X.loc[test_index,'target'].values
    X_tr,X_va = X.loc[train_index, train_cols].values,X.loc[test_index, train_cols].values
    X_te = x_te[train_cols]
    
    mat = test_mat1.sum(axis=0)
    ixs1 = np.asarray(mat)[0].argsort()[-4500:][::-1]
    mat = test_mat2.sum(axis=0)
    ixs2 = np.asarray(mat)[0].argsort()[-4500:][::-1]
    mat = test_mat3.sum(axis=0)
    ixs3 = np.asarray(mat)[0].argsort()[-4500:][::-1]
    
    print('prepare train')
    X_tr = sp.hstack([
        X_tr, train_mat1[:,ixs1][train_index], train_mat2[:,ixs2][train_index], train_mat3[:,ixs3][train_index]
    ]).tocsr()
    print('prepare valid')
    X_va = sp.hstack([
        X_va, train_mat1[:,ixs1][test_index], train_mat2[:,ixs2][test_index], train_mat3[:,ixs3][test_index]
    ]).tocsr()
    
    # Create the LightGBM data containers
    tr_data = lgb.Dataset(X_tr, label=y_tr) #, categorical_feature=cate_cols
    va_data = lgb.Dataset(X_va, label=y_va) #, categorical_feature=cate_cols

    model = lgb.train(parameters,
                      tr_data,
                      valid_sets=va_data,
                      num_boost_round=8000,
                      early_stopping_rounds=200,
                      verbose_eval=50)
    
    yhat = model.predict(X_va, model.best_iteration)
    print(ifold,roc_auc_score(y_va,yhat))
    scores.append(roc_auc_score(y_va,yhat))
    y_oof.loc[test_index,'target'] = yhat

    print('prepare test')
    X_te = sp.hstack([
        X_te, test_mat1[:,ixs1], test_mat2[:,ixs2], test_mat3[:,ixs3]
    ]).tocsr()   
    
    ytst = model.predict(X_te, model.best_iteration)
    y_pred += ytst*0.1
    
    ifold += 1

fold 0
prepare train
prepare valid
Training until validation scores don't improve for 200 rounds.
[50]	valid_0's auc: 0.633039
[100]	valid_0's auc: 0.637693
[150]	valid_0's auc: 0.642734
[200]	valid_0's auc: 0.647132
[250]	valid_0's auc: 0.651836
[300]	valid_0's auc: 0.656059
[350]	valid_0's auc: 0.659587
[400]	valid_0's auc: 0.662764
[450]	valid_0's auc: 0.665176
[500]	valid_0's auc: 0.667065
[550]	valid_0's auc: 0.668941
[600]	valid_0's auc: 0.670284
[650]	valid_0's auc: 0.671323
[700]	valid_0's auc: 0.672324
[750]	valid_0's auc: 0.67303
[800]	valid_0's auc: 0.67356
[850]	valid_0's auc: 0.67389
[900]	valid_0's auc: 0.674288
[950]	valid_0's auc: 0.674353
[1000]	valid_0's auc: 0.674518
[1050]	valid_0's auc: 0.674673
[1100]	valid_0's auc: 0.674904
[1150]	valid_0's auc: 0.675045
[1200]	valid_0's auc: 0.674805
[1250]	valid_0's auc: 0.67454
[1300]	valid_0's auc: 0.674536
Early stopping, best iteration is:
[1147]	valid_0's auc: 0.67506
0 0.675060116656
prepare test
fold 1
prepare train
prep

In [10]:
print(scores,np.mean(scores))
roc_auc_score(X.target.values, y_oof.target.values)

[0.67506011665592647, 0.68253779577975482, 0.68325450130564791, 0.67548015393408023, 0.67650227323519529] 0.678566968182


0.67856009029978326

In [11]:
model_name = 'sparse_5folds_timp_bin'

results_dir = './results/'
np.save(results_dir + 'train_' + model_name +'.npy', y_oof.target.values)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')

In [12]:
sub = x_te[['uid','target']].copy()
sub['target'] = y_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
print('isnull?',sample_sub.target.isnull().any())
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.065947
1,ac4b8244f3ae82df511b002257473c11,0.031363
2,483d8b91e49522c8a5bbe37f3872c749,0.047066
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.018885
4,fdbfba9842ff0bf86d600eb334c7c42b,0.015534


In [13]:
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)