In [1]:
# Input/output locations and the tag used when naming saved prediction files.
data_dir, results_dir = './data/mlboot_dataset/', './results/'
model_name = 'ftrl'

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import scipy.sparse as sp

In [None]:
import gc

# Load the preprocessed per-user feature table and attach the training labels.
# The left merge keeps every row of `df`; rows whose uid has no label end up
# with target == NaN and are treated as the test set below.
df = pd.read_csv(data_dir + 'preprocessed_new.csv')
y = pd.read_table(data_dir + 'mlboot_train_answers.tsv')
y.columns = ['uid', 'target']
df = df.merge(y, on='uid', how='left')

# Positional row labels of the labelled (train) and unlabelled (test) rows.
df_train_index = df[~df.target.isnull()].index
df_test_index = df[df.target.isnull()].index

# Load the three sparse matrices straight into CSR: the row-wise max below
# needs CSR anyway, so the former CSR -> LIL -> CSR round trip was pure overhead.
mat1 = sp.load_npz(data_dir + 'dmat1.npz').tocsr()
mat2 = sp.load_npz(data_dir + 'dmat2.npz').tocsr()
mat3 = sp.load_npz(data_dir + 'dmat3.npz').tocsr()
print(mat1.shape, mat2.shape, mat3.shape)

# Per-row maximum of each matrix as a plain 1-D array (instead of an (n, 1)
# np.matrix) so the column assignment is unambiguous.
# NOTE(review): assumes matrix row i corresponds to df row i -- confirm against
# the preprocessing step that produced both files.
df['max_f1'] = mat1.max(axis=1).toarray().ravel()
df['max_f2'] = mat2.max(axis=1).toarray().ravel()
df['max_f3'] = mat3.max(axis=1).toarray().ravel()

In [None]:
# Reload the three sparse blocks (kept here so this cell can be re-run on its
# own after the `del` below) and stack them column-wise into one matrix.
# No LIL conversion: sp.hstack converts its blocks itself, so `.tolil()` only
# added time and memory.
mat1 = sp.load_npz(data_dir + 'dmat1.npz')
mat2 = sp.load_npz(data_dir + 'dmat2.npz')
mat3 = sp.load_npz(data_dir + 'dmat3.npz')
print(mat1.shape, mat2.shape, mat3.shape)

# Build the result directly as CSR, the format the following cells slice rows from.
mat = sp.hstack([mat1, mat2, mat3], format='csr')

# The individual blocks are no longer needed; release them before continuing.
del mat1, mat2, mat3
gc.collect()

In [7]:
# Temporary train/test row views, used only to count non-zeros per column.
train_rows, test_rows = df_train_index.tolist(), df_test_index.tolist()
mat_csr = mat.tocsr()
train_mat, test_mat = mat_csr[train_rows], mat_csr[test_rows]
del mat_csr

# Keep a column only if it is non-zero in more than 4 training rows
# and in at least one test row.
keep_mask = (train_mat.getnnz(axis=0) > 4) & (test_mat.getnnz(axis=0) > 0)
keep_cols = np.flatnonzero(keep_mask)
mat = mat.tocsc()[:, keep_cols].tocsr()

In [8]:
from sklearn.preprocessing import MaxAbsScaler

# Divide every column by its maximum absolute value.
# The scaler is fitted on the full matrix (train and test rows together).
print('scaling matrix')
scaler_mat = MaxAbsScaler().fit(mat)
mat = scaler_mat.transform(mat)
mat.shape

scaling matrix


(609018, 338171)

In [9]:
# Rows without a label form the test frame; all others are training rows.
# Both frames get a fresh 0..n-1 index.
is_unlabelled = df.target.isnull()
X = df.loc[~is_unlabelled, :].reset_index(drop=True)
x_te = df.loc[is_unlabelled, :].reset_index(drop=True)

In [10]:
%%time
# Final train/test row split of the filtered, scaled sparse matrix.
train_mat, test_mat = (
    mat[row_index.tolist()] for row_index in (df_train_index, df_test_index)
)
# The combined matrix is no longer needed once it has been split.
del mat
gc.collect()

CPU times: user 10 s, sys: 2.1 s, total: 12.1 s
Wall time: 11.6 s


# Data preprocessing

In [11]:
# Dense feature columns fed to the model, grouped by the naming pattern they share.
train_cols = [
    # num_times_cat_eq_<k>
    'num_times_cat_eq_0', 'num_times_cat_eq_1', 'num_times_cat_eq_2',
    'num_times_cat_eq_3', 'num_times_cat_eq_4', 'num_times_cat_eq_5',
    # record / day statistics
    'records', 'max_days', 'min_days',
    # *_max aggregates for f1..f3
    'sum_values_f1_max', 'num_keys_f1_max',
    'sum_values_f2_max', 'num_keys_f2_max',
    'sum_values_f3_max', 'num_keys_f3_max',
    # *_mean aggregates for f1..f3
    'sum_values_f1_mean', 'num_keys_f1_mean',
    'sum_values_f2_mean', 'num_keys_f2_mean',
    'sum_values_f3_mean', 'num_keys_f3_mean',
    # day counters
    'max_day_cntr', 'mean_day_cntr',
    # nuniq_keys_* / sumval_keys_* (the '.1' / '.2' suffixes are part of the column names)
    'nuniq_keys_f1', 'nuniq_keys_f1.1', 'nuniq_keys_f1.2',
    'sumval_keys_f1', 'sumval_keys_f1.1', 'sumval_keys_f1.2',
    # remaining per-user features
    'most_freq_cat_te', 'diff_num_cats', 'unique_days',
    # row-wise maxima of the three sparse matrices
    'max_f1', 'max_f2', 'max_f3',
]

In [11]:
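# NOTE: disabled experiment, kept for reference only -- MinMax scaling of the
# dense features to [-1, 1], fitted on the training rows. Nothing in this cell runs.
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler(feature_range=(-1,1))
#scaler.fit(X[train_cols].fillna(0).values)
#X[train_cols] = scaler.transform(X[train_cols].fillna(0).values)
#x_te[train_cols] = scaler.transform(x_te[train_cols].fillna(0).values)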

In [22]:
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from wordbatch.models import FM_FTRL

# 5-fold cross-validation of an FM_FTRL model trained on
# [standardised dense features | top 50% of sparse columns by univariate score].
# Produces: `pred` (out-of-fold predictions), `test_pred` (fold-averaged test
# predictions) and `fold_auc` (last validation AUC of every fold).
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=239)

# Bind the label vector BEFORE sizing `pred`. On a fresh kernel `y` is still the
# two-column answers DataFrame, so sizing `pred` from it first would allocate an
# (n, 2) buffer and the per-fold assignment below would fail.
y = X.target.values
pred = np.zeros(y.shape[0])
test_pred = 0
ifold = 0

fold_auc = []

cpu_cores = 4  # NOTE(review): unused -- the model below is hard-coded to threads=8

# The dense feature matrices do not depend on the fold, so build them once.
# reindex() is used instead of .loc[:, train_cols]: the recorded run emitted pandas'
# "missing label" deprecation warning, i.e. at least one name in train_cols is absent
# from X and/or x_te, and .loc raises KeyError for that on current pandas. reindex keeps
# the old behaviour: an absent column becomes NaN and is then zero-filled.
X_dense = X.reindex(columns=train_cols).fillna(0).values
X_te_dense = x_te.reindex(columns=train_cols).fillna(0).values

for trn_inx, val_inx in kf.split(y):
    print("Training fold {}".format(ifold))
    y_tr,y_va = y[trn_inx],y[val_inx]

    # Univariate feature selection on the sparse block, fitted on the training fold only.
    ssp = SelectPercentile(percentile=50)
    ssp.fit(train_mat[trn_inx], y_tr)
    sp_train_mat = ssp.transform(train_mat[trn_inx])
    sp_val_mat = ssp.transform(train_mat[val_inx])
    sp_test_mat = ssp.transform(test_mat)
    print('shape: ',sp_train_mat.shape)
    print('max: ',sp_train_mat.max())

    # Standardise the dense block with statistics from the training fold only.
    scaler = StandardScaler()
    scaler.fit(X_dense[trn_inx])
    X_tr = scaler.transform(X_dense[trn_inx])
    X_va = scaler.transform(X_dense[val_inx])
    X_te = scaler.transform(X_te_dense)
    del scaler

    # Final design matrices: dense columns first, selected sparse columns after.
    trn_seq = sp.hstack([X_tr, sp_train_mat])
    val_seq = sp.hstack([X_va, sp_val_mat])
    te_seq = sp.hstack([X_te, sp_test_mat])

    model = FM_FTRL(alpha=0.02, beta=0.01, L1=0.000001, L2=0.001, D=trn_seq.shape[1], alpha_fm=0.03, L2_fm=0.005, init_fm=0.2,
                           D_fm=30, e_noise=0.0000002, iters=1, inv_link="sigmoid", threads=8)
    score = 0
    # fit() is called repeatedly on the same model with iters=1 so the validation AUC
    # can be printed after every pass.
    # NOTE(review): presumably each call continues from the current weights (the
    # recorded AUC rises pass after pass) -- confirm against the wordbatch docs.
    for i in range(8):
        model.fit(trn_seq, y_tr)
        yhat = model.predict(val_seq).ravel()
        if (np.isnan(yhat).any()):
            # NaN predictions: stop this fold, keeping the previous pass's values in `pred`.
            # NOTE(review): the test prediction below is still taken from this model.
            print(':(')
            break
        pred[val_inx] = yhat
        score = roc_auc_score(y_va,yhat)
        print('fold',ifold,'iter', i, score)

    fold_auc.append(score)
    print()

    # Average the test predictions over the folds.
    test_pred += model.predict(te_seq).ravel()/n_folds
    ifold += 1
    gc.collect()

Training fold 0


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
  f = msb / msw


shape:  (342395, 169085)
max:  1.0
fold 0 iter 0 0.607181482412
fold 0 iter 1 0.617548988682
fold 0 iter 2 0.621804739426
fold 0 iter 3 0.624775643456
fold 0 iter 4 0.626644994816
fold 0 iter 5 0.628065890224
fold 0 iter 6 0.62890166848
fold 0 iter 7 0.629204467762

Training fold 1




shape:  (342395, 169085)
max:  1.0
fold 1 iter 0 0.602940944401
fold 1 iter 1 0.6126685984
fold 1 iter 2 0.617153740534
fold 1 iter 3 0.619596839297
fold 1 iter 4 0.620714582813
fold 1 iter 5 0.621305818021
fold 1 iter 6 0.621621273638
fold 1 iter 7 0.621722144189

Training fold 2




shape:  (342395, 169085)
max:  1.0
fold 2 iter 0 0.601715169635
fold 2 iter 1 0.610345264805
fold 2 iter 2 0.616377940924
fold 2 iter 3 0.619029754229
fold 2 iter 4 0.621185803333
fold 2 iter 5 0.622429485686
fold 2 iter 6 0.623116771011
fold 2 iter 7 0.623495045833

Training fold 3




shape:  (342395, 169085)
max:  1.0
fold 3 iter 0 0.597541118321
fold 3 iter 1 0.609626264505
fold 3 iter 2 0.613562951472
fold 3 iter 3 0.61954632591
fold 3 iter 4 0.622082554766
fold 3 iter 5 0.62314257485
fold 3 iter 6 0.623716432692
fold 3 iter 7 0.624068297572

Training fold 4
shape:  (342396, 169085)
max:  1.0
fold 4 iter 0 0.60187012667
fold 4 iter 1 0.609038969182
fold 4 iter 2 0.613371585915
fold 4 iter 3 0.614885685839
fold 4 iter 4 0.617332940977
fold 4 iter 5 0.618385382772
fold 4 iter 6 0.618876148522
fold 4 iter 7 0.619433833752



In [23]:
# Per-fold AUCs, their mean and standard deviation, then the AUC of the
# combined out-of-fold predictions (displayed as the cell result).
print(fold_auc)
auc_mean, auc_std = np.mean(fold_auc), np.std(fold_auc)
print(auc_mean, auc_std)
roc_auc_score(X.target.values, pred)

[0.62920446776181305, 0.62172214418872063, 0.62349504583268056, 0.62406829757229965, 0.61943383375201833]
0.623584757822 0.00324106460659


0.62344099445898693

In [24]:
# Tag this run, persist its out-of-fold predictions, and load the test-set id
# list that fixes the row order of the submission.
model_name = 'ftrl_50'
oof_path = results_dir + 'train_' + model_name + '.npy'
np.save(oof_path, pred)
sample_sub = pd.read_csv(data_dir + 'mlboot_test.tsv', sep='\t')

In [25]:
# Pair every test uid with its fold-averaged prediction, using the key name
# ('cuid') that the sample submission file uses.
sub = x_te[['uid','target']].copy()
sub['target'] = test_pred
sub.columns = ['cuid','target']

# Align the predictions to the row order of the sample submission.
# Any 'target' column left by a previous run of this cell is dropped first, so the
# cell can be re-executed without producing target_x / target_y columns.
sample_sub = sample_sub.drop(columns=['target'], errors='ignore').merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)

# A True here would mean some submission ids received no prediction.
print('isnull?',sample_sub.target.isnull().any())
sample_sub.head()

isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.008976
1,ac4b8244f3ae82df511b002257473c11,0.077496
2,483d8b91e49522c8a5bbe37f3872c749,0.075154
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.061373
4,fdbfba9842ff0bf86d600eb334c7c42b,0.052356


In [27]:
# Write the bare prediction column (no header, no index) as the submission file.
# The file name is derived from `model_name` rather than a hard-coded tag so it
# always matches the .npy files saved for the same run.
sample_sub[['target']].to_csv(results_dir + model_name + '.csv', header=False, index=False)

In [28]:
# Show the first five rows of the merged submission frame.
sample_sub.head(n=5)

Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.008976
1,ac4b8244f3ae82df511b002257473c11,0.077496
2,483d8b91e49522c8a5bbe37f3872c749,0.075154
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.061373
4,fdbfba9842ff0bf86d600eb334c7c42b,0.052356


In [29]:
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from wordbatch.models import FM_FTRL

# Second run of the 5-fold FM_FTRL cross-validation: top 70% of sparse columns,
# 10 passes per fold and a different shuffle seed. Afterwards the out-of-fold and
# test predictions are saved under the 'ftrl_70' tag.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=72391)

# Bind the label vector before sizing `pred`, so this cell does not depend on
# whatever `y` an earlier cell happened to leave behind.
y = X.target.values
pred = np.zeros(y.shape[0])
test_pred = 0
ifold = 0

fold_auc = []

cpu_cores = 4  # NOTE(review): unused -- the model below is hard-coded to threads=8

# The dense feature matrices do not depend on the fold, so build them once.
# reindex() is used instead of .loc[:, train_cols]: the recorded run emitted pandas'
# "missing label" deprecation warning, i.e. at least one name in train_cols is absent
# from X and/or x_te, and .loc raises KeyError for that on current pandas. reindex keeps
# the old behaviour: an absent column becomes NaN and is then zero-filled.
X_dense = X.reindex(columns=train_cols).fillna(0).values
X_te_dense = x_te.reindex(columns=train_cols).fillna(0).values

for trn_inx, val_inx in kf.split(y):
    print("Training fold {}".format(ifold))
    y_tr,y_va = y[trn_inx],y[val_inx]

    # Univariate feature selection on the sparse block, fitted on the training fold only.
    ssp = SelectPercentile(percentile=70)
    ssp.fit(train_mat[trn_inx], y_tr)
    sp_train_mat = ssp.transform(train_mat[trn_inx])
    sp_val_mat = ssp.transform(train_mat[val_inx])
    sp_test_mat = ssp.transform(test_mat)
    print('shape: ',sp_train_mat.shape)
    print('max: ',sp_train_mat.max())

    # Standardise the dense block with statistics from the training fold only.
    scaler = StandardScaler()
    scaler.fit(X_dense[trn_inx])
    X_tr = scaler.transform(X_dense[trn_inx])
    X_va = scaler.transform(X_dense[val_inx])
    X_te = scaler.transform(X_te_dense)
    del scaler

    # Final design matrices: dense columns first, selected sparse columns after.
    trn_seq = sp.hstack([X_tr, sp_train_mat])
    val_seq = sp.hstack([X_va, sp_val_mat])
    te_seq = sp.hstack([X_te, sp_test_mat])

    model = FM_FTRL(alpha=0.02, beta=0.01, L1=0.000001, L2=0.001, D=trn_seq.shape[1], alpha_fm=0.03, L2_fm=0.005, init_fm=0.2,
                           D_fm=30, e_noise=0.0000002, iters=1, inv_link="sigmoid", threads=8)
    score = 0
    # fit() is called repeatedly on the same model with iters=1 so the validation AUC
    # can be printed after every pass.
    # NOTE(review): presumably each call continues from the current weights -- confirm
    # against the wordbatch docs.
    for i in range(10):
        model.fit(trn_seq, y_tr)
        yhat = model.predict(val_seq).ravel()
        if (np.isnan(yhat).any()):
            # NaN predictions: stop this fold, keeping the previous pass's values in `pred`.
            # NOTE(review): the test prediction below is still taken from this model.
            print(':(')
            break
        pred[val_inx] = yhat
        score = roc_auc_score(y_va,yhat)
        print('fold',ifold,'iter', i, score)

    fold_auc.append(score)
    print()

    # Average the test predictions over the folds.
    test_pred += model.predict(te_seq).ravel()/n_folds
    ifold += 1
    gc.collect()

# Cross-validation summary: per-fold AUCs, mean/std, and AUC of the combined OOF predictions.
print(fold_auc)
print(np.mean(fold_auc), np.std(fold_auc))
print(roc_auc_score(X.target.values, pred))

# Persist the out-of-fold predictions and the test predictions (aligned to the row
# order of the sample submission) under this run's tag.
model_name = 'ftrl_70'
np.save(results_dir + 'train_' + model_name +'.npy', pred)
sample_sub = pd.read_table(data_dir+'mlboot_test.tsv')
sub = x_te[['uid','target']].copy()
sub['target'] = test_pred
sub.columns = ['cuid','target']
sample_sub = sample_sub.merge(sub, on='cuid', how='left')
np.save(results_dir + 'test_' + model_name +'.npy', sample_sub.target.values)
# A True here would mean some submission ids received no prediction.
print('isnull?',sample_sub.target.isnull().any())
sample_sub.head()

Training fold 0


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
  f = msb / msw


shape:  (342395, 236719)
max:  1.0
fold 0 iter 0 0.60708655343
fold 0 iter 1 0.61514083869
fold 0 iter 2 0.620654780756
fold 0 iter 3 0.621784961249
fold 0 iter 4 0.62293854355
fold 0 iter 5 0.622965365468
fold 0 iter 6 0.622733741602
fold 0 iter 7 0.622693453097
fold 0 iter 8 0.622752656956
fold 0 iter 9 0.622734462614

Training fold 1




shape:  (342395, 236719)
max:  1.0
fold 1 iter 0 0.601960894121
fold 1 iter 1 0.607867647592
fold 1 iter 2 0.611509375535
fold 1 iter 3 0.615610949415
fold 1 iter 4 0.619295114914
fold 1 iter 5 0.621707880845
fold 1 iter 6 0.62299603291
fold 1 iter 7 0.623655557878
fold 1 iter 8 0.62419410423
fold 1 iter 9 0.624259715906

Training fold 2




shape:  (342395, 236719)
max:  1.0
fold 2 iter 0 0.599201625644
fold 2 iter 1 0.610638004024
fold 2 iter 2 0.615236038079
fold 2 iter 3 0.617607207965
fold 2 iter 4 0.619292308995
fold 2 iter 5 0.620113070531
fold 2 iter 6 0.620598181196
fold 2 iter 7 0.620722290214
fold 2 iter 8 0.62063000989
fold 2 iter 9 0.620417132893

Training fold 3




shape:  (342395, 236719)
max:  1.0
fold 3 iter 0 0.60197179579
fold 3 iter 1 0.612349586578
fold 3 iter 2 0.617617231583
fold 3 iter 3 0.619023337766
fold 3 iter 4 0.621242602005
fold 3 iter 5 0.62236564823
fold 3 iter 6 0.623048100806
fold 3 iter 7 0.623421666298
fold 3 iter 8 0.623494053203
fold 3 iter 9 0.623387431209

Training fold 4
shape:  (342396, 236719)
max:  1.0
fold 4 iter 0 0.601985008291
fold 4 iter 1 0.612500645325
fold 4 iter 2 0.617495885733
fold 4 iter 3 0.620389048213
fold 4 iter 4 0.621996475522
fold 4 iter 5 0.622954766686
fold 4 iter 6 0.623550029009
fold 4 iter 7 0.624005064749
fold 4 iter 8 0.624154916586
fold 4 iter 9 0.624106561798

[0.62273446261364562, 0.62425971590589535, 0.62041713289317291, 0.6233874312088854, 0.62410656179811974]
0.622981060884 0.00139293828024
0.622924063126
isnull? False


Unnamed: 0,cuid,target
0,888b238b4d14c03173baa375a739f6bc,0.007802
1,ac4b8244f3ae82df511b002257473c11,0.075282
2,483d8b91e49522c8a5bbe37f3872c749,0.074491
3,4c7ec46a0e88a7e1e1cedd2d526d5d61,0.061084
4,fdbfba9842ff0bf86d600eb334c7c42b,0.054119
