In [1]:
import pickle
import numpy as np
import xgboost as xgb
import glob
import pandas as pd

fl = list(glob.glob('../features/*.pkl'))
for l in sorted(fl):
    print("'{}'".format(l))

'../features/cnn_1_aug1_feat.pkl'
'../features/cnn_1_aug_1_fold8_feat.pkl'
'../features/cnn_1_aug_rescale_preprocess_feat.pkl'
'../features/cnn_1_aug_skimage_denoise_feat.pkl'
'../features/cnn_1_aug_skimage_preprocess_feat.pkl'
'../features/cnn_1_feat.pkl'
'../features/cnn_2_aug1_feat.pkl'
'../features/cnn_2_aug_1_fold8_feat.pkl'
'../features/cnn_2_aug_denoise_preprocess_feat.pkl'
'../features/cnn_2_aug_rescale_preprocess_feat.pkl'
'../features/cnn_2_aug_skimage_preprocess_feat.pkl'
'../features/cnn_2_feat.pkl'
'../features/cnn_3_aug1_feat.pkl'
'../features/cnn_3_aug_1_fold8_feat.pkl'
'../features/cnn_3_aug_denoise_preprocess_feat.pkl'
'../features/cnn_3_aug_rescale_preprocess_feat.pkl'
'../features/cnn_3_aug_skimage_preprocess_feat.pkl'
'../features/cnn_4_aug1_feat.pkl'
'../features/cnn_4_aug1_feat_add_early.pkl'
'../features/cnn_4_aug_1_fold8_feat.pkl'
'../features/cnn_4_aug_denoise_preprocess_feat.pkl'
'../features/cnn_4_aug_rescale_preprocess_feat.pkl'
'../features/cnn_4_aug_skimag

In [2]:
def load_x_y():
    train_x,test_x=[],[]
    for f in fl:
        with open(f,'rb') as fin:
            a,b = pickle.load(fin)
            train_x.append(a)
            test_x.append(b)
    train_x = np.hstack(train_x)
    test_x = np.hstack(test_x)
    train_df = pd.read_json('../input/train.json')
    y = train_df.is_iceberg.values
    return train_x,test_x,y

train_x,test_x,train_y = load_x_y()
print(train_x.shape,test_x.shape,train_y.shape)

(1604, 71) (8424, 71) (1604,)


In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
def cv_test(k_cnt=3,rnd=42,save_flag=False):
    kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd*2)
    w_test_pred = None
    r_val_loss = 0
    test_pred = None
    val_loss = 0
    val_loss_list = []
    for train_index, test_index in kf.split(train_x,train_y):
        X_train, X_test = train_x[train_index], train_x[test_index]
        y_train, y_test = train_y[train_index], train_y[test_index]
        params = {
                'colsample_bytree': 0.75,
                'colsample_bylevel':0.95,
                'gamma':0.2,
                'subsample': 0.9,
                'eta': 0.07,
                'max_depth': 3,
                'eval_metric':'logloss',
                'objective':'binary:logistic',
                'scale_pos_weight': 0.85
                }
        
        # def mat
        d_train = xgb.DMatrix(X_train, y_train)
        d_valid = xgb.DMatrix(X_test, y_test)
        d_test = xgb.DMatrix(test_x)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        # train model
        m = xgb.train(params, d_train, 500, watchlist, 
                        early_stopping_rounds=20,
                        verbose_eval=1000)
        
        # cal val loss
        curr_val_loss = log_loss(y_test,m.predict(d_valid))
        val_loss += curr_val_loss/k_cnt
        val_loss_list.append(curr_val_loss)
        curr_r_val_loss = 1.0 / curr_val_loss
        r_val_loss += curr_r_val_loss
        
        if test_pred is None:
            test_pred = m.predict(d_test)
            w_test_pred = m.predict(d_test)*curr_r_val_loss
        else:
            curr_pred = m.predict(d_test)
            test_pred += curr_pred
            w_test_pred = w_test_pred + curr_pred*curr_r_val_loss

    # avg
    test_pred = test_pred / k_cnt
    w_test_pred = w_test_pred / r_val_loss
    
    # train log loss
    print('local average valid loss',val_loss,'val loss std',np.std(val_loss_list))
    if save_flag:
        test_df=pd.read_json('../input/test.json')
        submission = pd.DataFrame()
        submission['id']=test_df['id']
        submission['is_iceberg']=test_pred
        print(submission.head())
        submission.to_csv('../results/all_xgb_sub_fold_{}_rnd_{}.csv'.format(k_cnt,rnd), index=False)
        
        submission = pd.DataFrame()
        submission['id']=test_df['id']
        submission['is_iceberg']=w_test_pred
        print(submission.head())
        submission.to_csv('../results/weighted_all_xgb_sub_fold_{}_rnd_{}.csv'.format(k_cnt,rnd), index=False)

    
print('def done')

def done


In [4]:
cv_test(3,42,True)

[0]	train-logloss:0.63587	valid-logloss:0.639467
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[117]	train-logloss:0.046946	valid-logloss:0.13987

[0]	train-logloss:0.636265	valid-logloss:0.640309
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[79]	train-logloss:0.06692	valid-logloss:0.157984

[0]	train-logloss:0.63679	valid-logloss:0.638617
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[72]	train-logloss:0.073551	valid-logloss:0.152528

local average valid loss 0.151950851041 val loss std 0.0078130230655
         id  is_iceberg
0  5941774d    0.019915
1  4023181e    0.941567
2  b20200e4    0.100444
3

In [5]:
cv_test(5,42,True)

[0]	train-logloss:0.635993	valid-logloss:0.638347
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[112]	train-logloss:0.051964	valid-logloss:0.157172

[0]	train-logloss:0.637715	valid-logloss:0.638521
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[132]	train-logloss:0.048691	valid-logloss:0.121559

[0]	train-logloss:0.636373	valid-logloss:0.639036
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[94]	train-logloss:0.061218	valid-logloss:0.172823

[0]	train-logloss:0.637275	valid-logloss:0.638753
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until vali

In [6]:
cv_test(10,42,True)

[0]	train-logloss:0.635963	valid-logloss:0.638904
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[78]	train-logloss:0.075219	valid-logloss:0.193447

[0]	train-logloss:0.636804	valid-logloss:0.640577
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[94]	train-logloss:0.068414	valid-logloss:0.144808

[0]	train-logloss:0.637214	valid-logloss:0.637049
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[112]	train-logloss:0.057217	valid-logloss:0.12991

[0]	train-logloss:0.637317	valid-logloss:0.640562
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-

In [7]:
cv_test(3,233,True)

[0]	train-logloss:0.637192	valid-logloss:0.638209
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[114]	train-logloss:0.055107	valid-logloss:0.133915

[0]	train-logloss:0.637101	valid-logloss:0.640535
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[98]	train-logloss:0.061476	valid-logloss:0.140654

[0]	train-logloss:0.635434	valid-logloss:0.640158
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[62]	train-logloss:0.069828	valid-logloss:0.186732

local average valid loss 0.154466957912 val loss std 0.0241405814484
         id  is_iceberg
0  5941774d    0.016512
1  4023181e    0.934627
2  b20200e4    0.0572

In [8]:
cv_test(5,233,True)

[0]	train-logloss:0.635812	valid-logloss:0.638773
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[112]	train-logloss:0.054314	valid-logloss:0.159519

[0]	train-logloss:0.63819	valid-logloss:0.636248
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[122]	train-logloss:0.059014	valid-logloss:0.086715

[0]	train-logloss:0.636881	valid-logloss:0.63984
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[127]	train-logloss:0.047951	valid-logloss:0.13932

[0]	train-logloss:0.636386	valid-logloss:0.64048
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-l

In [9]:
cv_test(10,233,True)

[0]	train-logloss:0.636594	valid-logloss:0.63953
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[97]	train-logloss:0.067416	valid-logloss:0.143249

[0]	train-logloss:0.635806	valid-logloss:0.638859
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[191]	train-logloss:0.029864	valid-logloss:0.150104

[0]	train-logloss:0.637441	valid-logloss:0.635033
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
Stopping. Best iteration:
[89]	train-logloss:0.074503	valid-logloss:0.099915

[0]	train-logloss:0.637169	valid-logloss:0.637311
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-