In [1]:
import pickle
import numpy as np
import xgboost as xgb
import glob
import pandas as pd

fl = list(glob.glob('../features/*.pkl'))
for l in sorted(fl):
    print("'{}'".format(l))
    
# no other feat7 

'../features/cnn_1_aug1_feat.pkl'
'../features/cnn_1_aug_1_fold8_feat.pkl'
'../features/cnn_1_aug_rescale_preprocess_feat.pkl'
'../features/cnn_1_aug_skimage_denoise_feat.pkl'
'../features/cnn_1_aug_skimage_preprocess_feat.pkl'
'../features/cnn_1_feat.pkl'
'../features/cnn_2_aug1_feat.pkl'
'../features/cnn_2_aug_1_fold8_feat.pkl'
'../features/cnn_2_aug_denoise_preprocess_feat.pkl'
'../features/cnn_2_aug_rescale_preprocess_feat.pkl'
'../features/cnn_2_aug_skimage_preprocess_feat.pkl'
'../features/cnn_2_feat.pkl'
'../features/cnn_3_aug1_feat.pkl'
'../features/cnn_3_aug_1_fold8_feat.pkl'
'../features/cnn_3_aug_denoise_preprocess_feat.pkl'
'../features/cnn_3_aug_rescale_preprocess_feat.pkl'
'../features/cnn_3_aug_skimage_preprocess_feat.pkl'
'../features/cnn_4_aug1_feat.pkl'
'../features/cnn_4_aug1_feat_add_early.pkl'
'../features/cnn_4_aug_1_fold8_feat.pkl'
'../features/cnn_4_aug_denoise_preprocess_feat.pkl'
'../features/cnn_4_aug_rescale_preprocess_feat.pkl'
'../features/cnn_4_aug_skimag

In [2]:
def load_x_y():
    train_x,test_x=[],[]
    for f in fl:
        with open(f,'rb') as fin:
            a,b = pickle.load(fin)
            train_x.append(a)
            test_x.append(b)
    train_x = np.hstack(train_x)
    test_x = np.hstack(test_x)
    train_df = pd.read_json('../input/train.json')
    y = train_df.is_iceberg.values
    return train_x,test_x,y

train_x,test_x,train_y = load_x_y()
print(train_x.shape,test_x.shape,train_y.shape)

(1604, 81) (8424, 81) (1604,)


In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
def cv_test(k_cnt=3,rnd=42,save_flag=False,verbose_cnt=False):
    kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd*2)
    w_test_pred = None
    r_val_loss = 0
    test_pred = None
    val_loss = 0
    val_loss_list = []
    for train_index, test_index in kf.split(train_x,train_y):
        X_train, X_test = train_x[train_index], train_x[test_index]
        y_train, y_test = train_y[train_index], train_y[test_index]
        params = {
                'colsample_bytree': 0.75,
                'colsample_bylevel':0.95,
                'gamma':0.2,
                'subsample': 0.9,
                'eta': 0.07,
                'max_depth': 3,
                'eval_metric':'logloss',
                'objective':'binary:logistic',
                'scale_pos_weight': 0.85,
                
                }
        
        # def mat
        d_train = xgb.DMatrix(X_train, y_train)
        d_valid = xgb.DMatrix(X_test, y_test)
        d_test = xgb.DMatrix(test_x)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        # train model
        m = xgb.train(params, d_train, 500, watchlist, 
                        early_stopping_rounds=20,
                        verbose_eval=verbose_cnt
                        )
        
        # cal val loss
        curr_val_loss = log_loss(y_test,m.predict(d_valid))
        val_loss += curr_val_loss/k_cnt
        val_loss_list.append(curr_val_loss)
        curr_r_val_loss = 1.0 / curr_val_loss
        r_val_loss += curr_r_val_loss
        
        if test_pred is None:
            test_pred = m.predict(d_test)
            w_test_pred = m.predict(d_test)*curr_r_val_loss
        else:
            curr_pred = m.predict(d_test)
            test_pred += curr_pred
            w_test_pred = w_test_pred + curr_pred*curr_r_val_loss

    # avg
    test_pred = test_pred / k_cnt
    w_test_pred = w_test_pred / r_val_loss
    
    # train log loss
    print('local average valid loss',val_loss,'val loss std',np.std(val_loss_list))
    if save_flag:
        test_df=pd.read_json('../input/test.json')
        submission = pd.DataFrame()
        submission['id']=test_df['id']
        submission['is_iceberg']=test_pred
        print(submission.head())
        submission.to_csv('../results/all_xgb_sub_fold_{}_rnd_{}.csv'.format(k_cnt,rnd), index=False)
        
        submission = pd.DataFrame()
        submission['id']=test_df['id']
        submission['is_iceberg']=w_test_pred
        print(submission.head())
        submission.to_csv('../results/weighted_all_xgb_sub_fold_{}_rnd_{}.csv'.format(k_cnt,rnd), index=False)

    
print('def done')

def done


In [29]:
# find new rnd seed
for i in range(50):
    rnd = i*6
    print(rnd)
    cv_test(3,rnd,False)

0
local average valid loss 0.145866677952 val loss std 0.0337133401228
6
local average valid loss 0.151000133049 val loss std 0.0176700491043
12
local average valid loss 0.140975174603 val loss std 0.0197822648936
18
local average valid loss 0.147715408352 val loss std 0.0112063094976
24
local average valid loss 0.144434320449 val loss std 0.0155391819745
30
local average valid loss 0.148187370884 val loss std 0.0192206774681
36
local average valid loss 0.152259416391 val loss std 0.0339691591859
42
local average valid loss 0.140732382995 val loss std 0.0117262843428
48
local average valid loss 0.150008122301 val loss std 0.0293353223621
54
local average valid loss 0.139361062583 val loss std 0.0154661089693
60
local average valid loss 0.138519939534 val loss std 0.0134844661305
66
local average valid loss 0.145642976274 val loss std 0.012359701545
72
local average valid loss 0.148550231702 val loss std 0.0139217767581
78
local average valid loss 0.144192687695 val loss std 0.011069536

In [30]:
# find new rnd seed
for i in range(50):
    rnd = i*6
    print(rnd)
    cv_test(5,rnd,False)

0
local average valid loss 0.139841161703 val loss std 0.0391319834476
6
local average valid loss 0.143956969279 val loss std 0.0159588680688
12
local average valid loss 0.138732632956 val loss std 0.0250747364962
18
local average valid loss 0.143088039897 val loss std 0.00630852957126
24
local average valid loss 0.146124837712 val loss std 0.0082554884584
30
local average valid loss 0.144082709975 val loss std 0.011494115402
36
local average valid loss 0.145391927026 val loss std 0.026440015539
42
local average valid loss 0.137776919879 val loss std 0.0269819378445
48
local average valid loss 0.134990193269 val loss std 0.0392766942859
54
local average valid loss 0.137996396121 val loss std 0.0361318865052
60
local average valid loss 0.145544168721 val loss std 0.0210942061963
66
local average valid loss 0.134508066085 val loss std 0.0151440443714
72
local average valid loss 0.142278985135 val loss std 0.0190848555778
78
local average valid loss 0.144405404486 val loss std 0.027395802

In [4]:
cv_test(3,252,True,500)

[0]	train-logloss:0.636166	valid-logloss:0.639833
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[138]	train-logloss:0.030478	valid-logloss:0.126177

[0]	train-logloss:0.636	valid-logloss:0.639144
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[81]	train-logloss:0.05569	valid-logloss:0.137791

[0]	train-logloss:0.635984	valid-logloss:0.64014
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[101]	train-logloss:0.045349	valid-logloss:0.144128

local average valid loss 0.137414263225 val loss std 0.00780035335958
         id  is_iceberg
0  5941774d    0.026259
1  4023181e    0.736732
2  b20200e4    0.025662


In [5]:
cv_test(5,66,True,500)

[0]	train-logloss:0.636517	valid-logloss:0.637451
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[125]	train-logloss:0.039947	valid-logloss:0.123319

[0]	train-logloss:0.635833	valid-logloss:0.639346
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[82]	train-logloss:0.061432	valid-logloss:0.154324

[0]	train-logloss:0.636394	valid-logloss:0.636728
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[68]	train-logloss:0.073152	valid-logloss:0.134726

[0]	train-logloss:0.635661	valid-logloss:0.640291
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid