In [1]:
import pickle
import numpy as np
import xgboost as xgb
import glob
import pandas as pd

def load_x_y():
    train_x,test_x=[],[]
    for f in glob.glob('../features/*.pkl'):
        print(f)
        with open(f,'rb') as fin:
            a,b = pickle.load(fin)
            train_x.append(a)
            test_x.append(b)
    train_x = np.hstack(train_x)
    test_x = np.hstack(test_x)
    train_df = pd.read_json('../input/train.json')
    y = train_df.is_iceberg.values
    return train_x,test_x,y

train_x,test_x,train_y = load_x_y()
print(train_x.shape,test_x.shape,train_y.shape)

../features/vgg16_1_feat.pkl
../features/cnn_1_feat.pkl
../features/other_feat.pkl
../features/cnn_1_aug1_feat.pkl
../features/resnet_1_feat.pkl
../features/cnn_2_aug1_feat.pkl
../features/cnn_2_feat.pkl
../features/cnn_2_feat_adjust.pkl
../features/incept_1_feat.pkl
../features/cnn_2_aug2_feat.pkl
(1604, 20) (8424, 20) (1604,)


In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
def cv_test(k_cnt=3,rnd=42,save_flag=False):
    kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd*2)
    test_pred = None
    val_loss = 0
    for train_index, test_index in kf.split(train_x,train_y):
        X_train, X_test = train_x[train_index], train_x[test_index]
        y_train, y_test = train_y[train_index], train_y[test_index]
        params = {
                'colsample_bytree': 0.75,
                'colsample_bylevel':0.95,
                'gamma':0.2,
                'subsample': 0.9,
                'eta': 0.07,
                'max_depth': 3,
                'eval_metric':'logloss',
                'objective':'binary:logistic',
                'scale_pos_weight': 1.0
                }
        
        # def mat
        d_train = xgb.DMatrix(X_train, y_train)
        d_valid = xgb.DMatrix(X_test, y_test)
        d_test = xgb.DMatrix(test_x)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        # train model
        m = xgb.train(params, d_train, 500, watchlist, 
                        early_stopping_rounds=20,
                        verbose_eval=50)
        
        # cal val loss
        val_loss += log_loss(y_test,m.predict(d_valid))/k_cnt
        
        if test_pred is None:
            test_pred = m.predict(d_test)
        else:
            curr_pred = m.predict(d_test)
            test_pred += curr_pred

    # avg
    test_pred = test_pred / k_cnt
    
    # train log loss
    print('local average valid loss',val_loss)
    if save_flag:
        test_df=pd.read_json('../input/test.json')
        submission = pd.DataFrame()
        submission['id']=test_df['id']
        submission['is_iceberg']=test_pred
        print(submission.head())
        submission.to_csv('../results/xgb_sub.csv', index=False)

    
print('def done')

def done


In [12]:
cv_test(5,233,True)

[0]	train-logloss:0.641657	valid-logloss:0.642664
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
[50]	train-logloss:0.143012	valid-logloss:0.166868
[100]	train-logloss:0.102703	valid-logloss:0.148911
Stopping. Best iteration:
[124]	train-logloss:0.087908	valid-logloss:0.143337

[0]	train-logloss:0.641701	valid-logloss:0.641994
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
[50]	train-logloss:0.15182	valid-logloss:0.138679
[100]	train-logloss:0.110653	valid-logloss:0.114605
[150]	train-logloss:0.082439	valid-logloss:0.103618
[200]	train-logloss:0.064478	valid-logloss:0.099232
[250]	train-logloss:0.049489	valid-logloss:0.094989
[300]	train-logloss:0.039072	valid-logloss:0.093102
[350]	train-logloss:0.031946	valid-logloss:0.091654
Stopping. Best iteration:
[353]	train-logloss:0.031409