In [1]:
from multiprocessing import Pool
from tqdm import tqdm
import gc
#
import numpy as np # linear algebra
import pandas as pd # data processing
import datetime as dt
#
from random import choice, sample, shuffle, uniform, seed
from math import exp, expm1, log1p, log10, log2, sqrt, ceil, floor, isfinite, isnan
from itertools import combinations
#import for image processing
#import cv2
from scipy.stats import kurtosis, skew
from scipy.ndimage import laplace, sobel
#evaluation
from sklearn.model_selection import StratifiedShuffleSplit, KFold
from sklearn.metrics import log_loss
import xgboost as xgb
import lightgbm as lgb

###############################################################################
def read_jason(file='', loc='../input/'):
    print('{}{}'.format(loc, file))
    df = pd.read_json('{}{}'.format(loc, file))
    df['inc_angle'] = df['inc_angle'].replace('na', -1).astype(float)
    #print(df['inc_angle'].value_counts())
    
    band1 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df["band_1"]])
    band2 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df["band_2"]])
    df = df.drop(['band_1', 'band_2'], axis=1)
    
    bands = np.stack((band1, band2,  0.5 * (band1 + band2)), axis=-1)  # -1 means add to the last dimension
    del band1, band2
    
    return df, bands

###############################################################################
#forked from
#https://www.kaggle.com/the1owl/planet-understanding-the-amazon-from-space/natural-growth-patterns-fractals-of-nature/notebook
def img_to_stats(paths):
    
    img_id, img = paths[0], paths[1]
    
    #ignored error    
    np.seterr(divide='ignore', invalid='ignore')
    
    bins = 20
    scl_min, scl_max = -50, 50
    opt_poly = True
    #opt_poly = False
    
    try:
        st = []
        st_interv = []
        hist_interv = []
        for i in range(img.shape[2]):
            img_sub = np.squeeze(img[:, :, i])
            
            #median, max and min
            sub_st = []
            sub_st += [np.mean(img_sub), np.std(img_sub), np.max(img_sub), np.median(img_sub), np.min(img_sub)]
            sub_st += [(sub_st[2] - sub_st[3]), (sub_st[2] - sub_st[4]), (sub_st[3] - sub_st[4])] 
            sub_st += [(sub_st[-3] / sub_st[1]), (sub_st[-2] / sub_st[1]), (sub_st[-1] / sub_st[1])] #normalized by stdev
            st += sub_st
            #Laplacian, Sobel, kurtosis and skewness
            st_trans = []
            st_trans += [laplace(img_sub, mode='reflect', cval=0.0).ravel().var()] #blurr
            sobel0 = sobel(img_sub, axis=0, mode='reflect', cval=0.0).ravel().var()
            sobel1 = sobel(img_sub, axis=1, mode='reflect', cval=0.0).ravel().var()
            st_trans += [sobel0, sobel1]
            st_trans += [kurtosis(img_sub.ravel()), skew(img_sub.ravel())]
            
            if opt_poly:
                st_interv.append(sub_st)
                #
                st += [x * y for x, y in combinations(st_trans, 2)]
                st += [x + y for x, y in combinations(st_trans, 2)]
                st += [x - y for x, y in combinations(st_trans, 2)]                
 
            #hist
            #hist = list(cv2.calcHist([img], [i], None, [bins], [0., 1.]).flatten())
            hist = list(np.histogram(img_sub, bins=bins, range=(scl_min, scl_max))[0])
            hist_interv.append(hist)
            st += hist
            st += [hist.index(max(hist))] #only the smallest index w/ max value would be incl
            st += [np.std(hist), np.max(hist), np.median(hist), (np.max(hist) - np.median(hist))]

        if opt_poly:
            for x, y in combinations(st_interv, 2):
                st += [float(x[j]) * float(y[j]) for j in range(len(st_interv[0]))]

            for x, y in combinations(hist_interv, 2):
                hist_diff = [x[j] * y[j] for j in range(len(hist_interv[0]))]
                st += [hist_diff.index(max(hist_diff))] #only the smallest index w/ max value would be incl
                st += [np.std(hist_diff), np.max(hist_diff), np.median(hist_diff), (np.max(hist_diff) - np.median(hist_diff))]
                
        #correction
        nan = -999
        for i in range(len(st)):
            if isnan(st[i]) == True:
                st[i] = nan
                
    except:
        print('except: ')
    
    return [img_id, st]


def extract_img_stats(paths):
    imf_d = {}
    p = Pool(8) #(cpu_count())
    ret = p.map(img_to_stats, paths)   # list of pair of (id, bands) bands is np.array shape (75, 75, 3)
    for i in tqdm(range(len(ret)), miniters=100):
        imf_d[ret[i][0]] = ret[i][1]

    ret = []
    fdata = [imf_d[i] for i, j in paths]
    return np.array(fdata, dtype=np.float32)


def process(df, bands):

    data = extract_img_stats([(k, v) for k, v in zip(df['id'].tolist(), bands)]); gc.collect() #(N, 246)
    data = np.concatenate([data, df['inc_angle'].values[:, np.newaxis]], axis=-1); gc.collect() #(N, 247)

    print(data.shape)
    return data

###############################################################################
if __name__ == '__main__':
    
    np.random.seed(1017)
    target = 'is_iceberg'
    
    #Load data
    train, train_bands = read_jason(file='train.json', loc='data/processed/')
    test, test_bands = read_jason(file='test.json', loc='data/processed/')

    train_X_full = process(df=train, bands=train_bands)
    train_y_full = train[target].values    
    test_X = process(df=test, bands=test_bands)

data/processed/train.json
data/processed/test.json


100%|██████████| 1604/1604 [00:00<00:00, 703509.74it/s]

(1604, 247)



100%|██████████| 8424/8424 [00:00<00:00, 1094030.74it/s]

(8424, 247)



Process ForkPoolWorker-13:
Process ForkPoolWorker-10:
Process ForkPoolWorker-12:
Process ForkPoolWorker-15:
Process ForkPoolWorker-9:
Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Process ForkPoolWorker-16:
Traceback (most recent call last):
Process ForkPoolWorker-8:
Traceback (most recent call last):
Process ForkPoolWorker-7:
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process ForkPoolWorker-14:
Traceback (most recent call last):
Process ForkPoolWorker-11:
Process ForkPoolWorker-6:
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._ar

  File "/usr/lib/python3.5/multiprocessing/queues.py", line 342, in get
    with self._rlock:
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 342, in get
    with self._rlock:
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
  File

In [2]:
# train_X= train_X_full[train_X_full[:,-1]!=-1] #train_X[:,-1]==-1
# train_y= train_y_full[train_X_full[:,-1]!=-1]

train_X= train_X_full
train_y= train_y_full

print(train_X.shape, train_y.shape)

(1604, 247) (1604,)


In [6]:
# This will be the version changed based on my own understanding
def save_blend(preds={}, loc='./'):
    target = 'is_iceberg'
    
    w_total = 0.0
    blend = None
    df_corr = None
    print('\nBlending...')
    for k, v in preds.items():
        if blend is None:
            blend = pd.read_csv('{0}/{1}'.format(loc, k))
            print('load: {0}, w={1}'.format(k, v))
            
            df_corr = pd.DataFrame({'id': blend['id'].tolist()})
            df_corr[k[16:-4]] = blend[target]
            
            w_total += v
            blend[target] = blend[target] * v
                
        else:
            preds_tmp = pd.read_csv('{0}/{1}'.format(loc, k))
            preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
            print('load: {0}, w={1}'.format(k, v))
            df_corr[k[16:-4]] = preds_tmp[target]
            
            w_total += v
            blend[target] += preds_tmp[target] * v
            del preds_tmp
            
    print('\n{}'.format(df_corr.corr()), flush=True)
    #write submission
    blend[target] = blend[target] / w_total
    print('\nPreview: \n{}'.format(blend.head()), flush=True)
    blend.to_csv('{}subm_blend{:03d}_{}.csv'.format(loc, len(preds), tmp), index=False, float_format='%.6f')

def run_lgb(params={}, lgb_train=None, lgb_valid=None, lgb_test=None, test_ids=None, nr_round=2000, min_round=100, file=''):

    print('\nLightGBM: {}'.format(params['boosting'])) 
    model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)
    
    pred = model2.predict(lgb_test, num_iteration=model2.best_iteration)
    #
    subm = pd.DataFrame({'id': test_ids, 'is_iceberg': pred})
    subm.to_csv(file, index=False, float_format='%.6f')
    #   
    df = pd.DataFrame({'feature':model2.feature_name(), 'importances': model2.feature_importance()})
    
    return pred, df


#results
freq = pd.DataFrame()
subms = []

#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)
# seed_list=[]
# final_dict ={}
# final_dict['xgb_re'] = []
# final_dict['lgb_re'] = []
# final_dict['lgb_dart_re'] =[]
for rep in range(20):
    ran_num =  np.random.randint(50000,60000,size=1)[0]
    seed_list.append(ran_num)
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 5
    kf = KFold(n_splits=nr_runs, random_state=split_seed)
    tree_lim =0
#     xgb_re = []
#     lgb_re =[]
#     lgb_dart_re= []
    for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
        print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

        tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

        x1, x2 = train_X[train_index], train_X[test_index]
        y1, y2 = train_y[train_index], train_y[test_index]
        #x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=test_ratio, random_state=split_seed + r)
        print('splitted: {0}, {1}'.format(x1.shape, x2.shape), flush=True)
        test_X_dup = test_X.copy()

        #XGB
        xgb_train = xgb.DMatrix(x1, y1)
        xgb_valid = xgb.DMatrix(x2, y2)
        #
        watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
        params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'seed': 99, 'silent': True}
        params['eta'] = 0.03
        params['max_depth'] = 4
        params['subsample'] = 0.9
        params['eval_metric'] = 'logloss'
        params['colsample_bytree'] = 0.8
        params['colsample_bylevel'] = 0.8
        params['max_delta_step'] = 3
        #params['gamma'] = 5.0
        #params['labmda'] = 1
        params['scale_pos_weight'] = 1.0
        params['seed'] = ran_num + r
        nr_round = 2000
        min_round = 100

        model1 = xgb.train(params, 
                           xgb_train, 
                           nr_round,  
                           watchlist, 
                           verbose_eval=50, 
                           early_stopping_rounds=min_round)

        pred_xgb = model1.predict(xgb.DMatrix(test_X_dup), ntree_limit=model1.best_ntree_limit+tree_lim)

        #
        file = 'gbm/subm_{}_xgb_{:02d}.csv'.format(tmp, r+1)
        subm = pd.DataFrame({'id': test['id'].values, target: pred_xgb})
        subm.to_csv(file, index=False, float_format='%.6f')
        subms.append(file)    

        ##LightGBM
        lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
        lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
        #gbdt
        params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
        params['boosting'] = 'gbdt'
        params['metric'] = 'binary_logloss'
        params['learning_rate'] = 0.03
        params['max_depth'] = 5
        params['num_leaves'] = 9 # higher number of leaves
        params['feature_fraction'] = 0.8 # Controls overfit
        params['bagging_fraction'] = 0.9    
        params['bagging_freq'] = 3
        params['seed'] = ran_num + r
        #
        params['verbose'] = -1

        file = 'gbm/subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
        subms.append(file)
        
        model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)

        ##LightGBM
        #dart
        params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
        params['boosting'] = 'dart'
        params['metric'] = 'binary_logloss'
        params['learning_rate'] = 0.04
        params['max_depth'] = 5
        params['num_leaves'] = 16 # higher number of leaves
        params['feature_fraction'] = 0.8 # Controls overfit
        params['bagging_fraction'] = 0.9    
        params['bagging_freq'] = 3
        params['seed'] = ran_num + r
        #dart
        params['drop_rate'] = 0.1
        params['skip_drop'] = 0.5
        params['max_drop'] = 10
        params['verbose'] = -1 

        file = 'gbm/subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
        subms.append(file)

        model3 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)
        
        
        xgb_re.append(model1.best_score)
        lgb_re.append(model2.best_score['valid_0']['binary_logloss'])
        lgb_dart_re.append(model3.best_score['valid_0']['binary_logloss'])
    
#     final_dict['xgb_re'].append(np.mean(xgb_re))
#     final_dict['lgb_re'].append(np.mean(lgb_re))
#     final_dict['lgb_dart_re'].append(np.mean(lgb_dart_re))
    
    #blending
    preds = {k: 1.0 for k in subms}
    save_blend(preds=preds)

The seed we are using is: 52430

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f3d9766d990>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.676033	valid-logloss:0.6784
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.3117	valid-logloss:0.369105
[100]	train-logloss:0.19938	valid-logloss:0.285958
[150]	train-logloss:0.142229	valid-logloss:0.247775
[200]	train-logloss:0.109054	valid-logloss:0.23082
[250]	train-logloss:0.085045	valid-logloss:0.219496
[300]	train-logloss:0.066774	valid-logloss:0.213611
[350]	train-logloss:0.053188	valid-logloss:0.208668
[400]	train-logloss:0.043344	valid-logloss:0.204417
[450]	train-logloss:0.03581	valid-logloss:0.203433
[500]	train-logloss:0.029243	valid-logloss:0.201247
[550]	train-logloss:0.024603	valid-logloss:0.200056
[600]	train-logloss:0.021024	valid-logloss:0.199269
[650]	train-logloss:0.018007	valid-logloss:0

[400]	train-logloss:0.043451	valid-logloss:0.220679
[450]	train-logloss:0.035314	valid-logloss:0.221228
Stopping. Best iteration:
[391]	train-logloss:0.045244	valid-logloss:0.220419

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.3492
[100]	valid_0's binary_logloss: 0.271451
[150]	valid_0's binary_logloss: 0.247388
[200]	valid_0's binary_logloss: 0.235436
[250]	valid_0's binary_logloss: 0.227201
[300]	valid_0's binary_logloss: 0.224166
[350]	valid_0's binary_logloss: 0.220172
[400]	valid_0's binary_logloss: 0.222745
[450]	valid_0's binary_logloss: 0.221706
Early stopping, best iteration is:
[351]	valid_0's binary_logloss: 0.21985
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.41134
[100]	valid_0's binary_logloss: 0.345168
[150]	valid_0's binary_logloss: 0.319845
[200]	valid_0's binary_logloss: 0.295326
[250]	valid_0's binary_logloss: 0.273092
[300]	valid_0's binary_logloss: 0.259206
[350]	

[100]	valid_0's binary_logloss: 0.299271
[150]	valid_0's binary_logloss: 0.272888
[200]	valid_0's binary_logloss: 0.263016
[250]	valid_0's binary_logloss: 0.248523
[300]	valid_0's binary_logloss: 0.230488
[350]	valid_0's binary_logloss: 0.217983
[400]	valid_0's binary_logloss: 0.206163
[450]	valid_0's binary_logloss: 0.204279
[500]	valid_0's binary_logloss: 0.192931
[550]	valid_0's binary_logloss: 0.19095
[600]	valid_0's binary_logloss: 0.188891
[650]	valid_0's binary_logloss: 0.1837
[700]	valid_0's binary_logloss: 0.178843
[750]	valid_0's binary_logloss: 0.177112
[800]	valid_0's binary_logloss: 0.178473
Early stopping, best iteration is:
[742]	valid_0's binary_logloss: 0.176007

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f3d9766dc60>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.676516	valid-logloss:0.678837
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]

[50]	valid_0's binary_logloss: 0.35681
[100]	valid_0's binary_logloss: 0.277327
[150]	valid_0's binary_logloss: 0.249204
[200]	valid_0's binary_logloss: 0.236105
[250]	valid_0's binary_logloss: 0.226463
[300]	valid_0's binary_logloss: 0.220839
[350]	valid_0's binary_logloss: 0.217648
[400]	valid_0's binary_logloss: 0.216337
[450]	valid_0's binary_logloss: 0.217054
[500]	valid_0's binary_logloss: 0.219856
Early stopping, best iteration is:
[425]	valid_0's binary_logloss: 0.214765
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.430943
[100]	valid_0's binary_logloss: 0.355177
[150]	valid_0's binary_logloss: 0.304182
[200]	valid_0's binary_logloss: 0.288926
[250]	valid_0's binary_logloss: 0.270985
[300]	valid_0's binary_logloss: 0.259059
[350]	valid_0's binary_logloss: 0.246184
[400]	valid_0's binary_logloss: 0.247096
[450]	valid_0's binary_logloss: 0.243818
[500]	valid_0's binary_logloss: 0.234652
[550]	valid_0's binary_logloss: 0.232002
[60

[250]	valid_0's binary_logloss: 0.250474
[300]	valid_0's binary_logloss: 0.244166
[350]	valid_0's binary_logloss: 0.233878
[400]	valid_0's binary_logloss: 0.231143
[450]	valid_0's binary_logloss: 0.230012
[500]	valid_0's binary_logloss: 0.230015
Early stopping, best iteration is:
[419]	valid_0's binary_logloss: 0.227894

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f3d9766d990>
splitted: (1284, 247), (320, 247)
[0]	train-logloss:0.675819	valid-logloss:0.6797
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.309186	valid-logloss:0.393446
[100]	train-logloss:0.203888	valid-logloss:0.28221
[150]	train-logloss:0.147414	valid-logloss:0.245454
[200]	train-logloss:0.112065	valid-logloss:0.225532
[250]	train-logloss:0.085944	valid-logloss:0.21623
[300]	train-logloss:0.068288	valid-logloss:0.209707
[350]	train-logloss:0.055368	valid-logloss:0.206904
[400]	train

[150]	train-logloss:0.140468	valid-logloss:0.288752
[200]	train-logloss:0.1051	valid-logloss:0.2747
[250]	train-logloss:0.081085	valid-logloss:0.265128
[300]	train-logloss:0.064004	valid-logloss:0.257363
[350]	train-logloss:0.049832	valid-logloss:0.25486
[400]	train-logloss:0.040945	valid-logloss:0.251779
[450]	train-logloss:0.033492	valid-logloss:0.250767
[500]	train-logloss:0.028064	valid-logloss:0.248907
[550]	train-logloss:0.02365	valid-logloss:0.249532
[600]	train-logloss:0.020251	valid-logloss:0.251189
Stopping. Best iteration:
[511]	train-logloss:0.02697	valid-logloss:0.248319

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.369308
[100]	valid_0's binary_logloss: 0.301803
[150]	valid_0's binary_logloss: 0.273123
[200]	valid_0's binary_logloss: 0.262738
[250]	valid_0's binary_logloss: 0.256901
[300]	valid_0's binary_logloss: 0.254618
[350]	valid_0's binary_logloss: 0.250622
[400]	valid_0's binary_logloss: 0.251575
[450]	valid_0's bi

[400]	valid_0's binary_logloss: 0.218065
[450]	valid_0's binary_logloss: 0.212389
[500]	valid_0's binary_logloss: 0.215326
[550]	valid_0's binary_logloss: 0.214737
Early stopping, best iteration is:
[450]	valid_0's binary_logloss: 0.212389
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.428933
[100]	valid_0's binary_logloss: 0.333077
[150]	valid_0's binary_logloss: 0.323676
[200]	valid_0's binary_logloss: 0.284914
[250]	valid_0's binary_logloss: 0.27327
[300]	valid_0's binary_logloss: 0.255086
[350]	valid_0's binary_logloss: 0.251324
[400]	valid_0's binary_logloss: 0.242645
[450]	valid_0's binary_logloss: 0.237685
[500]	valid_0's binary_logloss: 0.234519
[550]	valid_0's binary_logloss: 0.225707
[600]	valid_0's binary_logloss: 0.223399
[650]	valid_0's binary_logloss: 0.22239
[700]	valid_0's binary_logloss: 0.219259
[750]	valid_0's binary_logloss: 0.218586
[800]	valid_0's binary_logloss: 0.218023
[850]	valid_0's binary_logloss: 0.216331
[90

[1000]	valid_0's binary_logloss: 0.21326
[1050]	valid_0's binary_logloss: 0.21404
Early stopping, best iteration is:
[979]	valid_0's binary_logloss: 0.211975

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f3d9766d990>
splitted: (1284, 247), (320, 247)
[0]	train-logloss:0.67558	valid-logloss:0.679896
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.307249	valid-logloss:0.395785
[100]	train-logloss:0.202345	valid-logloss:0.291203
[150]	train-logloss:0.142742	valid-logloss:0.250259
[200]	train-logloss:0.108871	valid-logloss:0.230611
[250]	train-logloss:0.085292	valid-logloss:0.222499
[300]	train-logloss:0.066914	valid-logloss:0.213423
[350]	train-logloss:0.053818	valid-logloss:0.210765
[400]	train-logloss:0.043358	valid-logloss:0.210721
[450]	train-logloss:0.035978	valid-logloss:0.210775
[500]	train-logloss:0.02981	valid-logloss:0.208908
[550]	train-loglo


round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f3d9766dc60>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.676722	valid-logloss:0.679257
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.308336	valid-logloss:0.391939
[100]	train-logloss:0.198426	valid-logloss:0.320954
[150]	train-logloss:0.138501	valid-logloss:0.288735
[200]	train-logloss:0.102434	valid-logloss:0.273269
[250]	train-logloss:0.07921	valid-logloss:0.26349
[300]	train-logloss:0.062476	valid-logloss:0.257879
[350]	train-logloss:0.050193	valid-logloss:0.25643
[400]	train-logloss:0.040585	valid-logloss:0.254783
[450]	train-logloss:0.033817	valid-logloss:0.25284
[500]	train-logloss:0.028424	valid-logloss:0.253518
[550]	train-logloss:0.023983	valid-logloss:0.25608
Stopping. Best iteration:
[462]	train-logloss:0.032212	valid-logloss:0.252646

Training until validation scores don't improve 

[700]	train-logloss:0.01581	valid-logloss:0.196809
[750]	train-logloss:0.014029	valid-logloss:0.197123
Stopping. Best iteration:
[693]	train-logloss:0.016082	valid-logloss:0.196581

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.358985
[100]	valid_0's binary_logloss: 0.281907
[150]	valid_0's binary_logloss: 0.251341
[200]	valid_0's binary_logloss: 0.235062
[250]	valid_0's binary_logloss: 0.223762
[300]	valid_0's binary_logloss: 0.217006
[350]	valid_0's binary_logloss: 0.212972
[400]	valid_0's binary_logloss: 0.210295
[450]	valid_0's binary_logloss: 0.208345
[500]	valid_0's binary_logloss: 0.20728
[550]	valid_0's binary_logloss: 0.207872
[600]	valid_0's binary_logloss: 0.208412
Early stopping, best iteration is:
[509]	valid_0's binary_logloss: 0.20586
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.412022
[100]	valid_0's binary_logloss: 0.322133
[150]	valid_0's binary_logloss: 0.301118
[200]

[500]	valid_0's binary_logloss: 0.219881
[550]	valid_0's binary_logloss: 0.222724
Early stopping, best iteration is:
[450]	valid_0's binary_logloss: 0.219187
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.435486
[100]	valid_0's binary_logloss: 0.359474
[150]	valid_0's binary_logloss: 0.312882
[200]	valid_0's binary_logloss: 0.285547
[250]	valid_0's binary_logloss: 0.264162
[300]	valid_0's binary_logloss: 0.252274
[350]	valid_0's binary_logloss: 0.240541
[400]	valid_0's binary_logloss: 0.23182
[450]	valid_0's binary_logloss: 0.234098
[500]	valid_0's binary_logloss: 0.229266
[550]	valid_0's binary_logloss: 0.226289
[600]	valid_0's binary_logloss: 0.220434
[650]	valid_0's binary_logloss: 0.221227
[700]	valid_0's binary_logloss: 0.218364
[750]	valid_0's binary_logloss: 0.217043
[800]	valid_0's binary_logloss: 0.217463
[850]	valid_0's binary_logloss: 0.215783
[900]	valid_0's binary_logloss: 0.216302
[950]	valid_0's binary_logloss: 0.216159
[1

[200]	valid_0's binary_logloss: 0.262481
[250]	valid_0's binary_logloss: 0.242867
[300]	valid_0's binary_logloss: 0.225564
[350]	valid_0's binary_logloss: 0.222889
[400]	valid_0's binary_logloss: 0.212944
[450]	valid_0's binary_logloss: 0.207633
[500]	valid_0's binary_logloss: 0.201147
[550]	valid_0's binary_logloss: 0.199948
[600]	valid_0's binary_logloss: 0.195091
[650]	valid_0's binary_logloss: 0.192565
[700]	valid_0's binary_logloss: 0.18174
[750]	valid_0's binary_logloss: 0.181956
[800]	valid_0's binary_logloss: 0.179899
[850]	valid_0's binary_logloss: 0.178669
[900]	valid_0's binary_logloss: 0.178489
[950]	valid_0's binary_logloss: 0.178434
Early stopping, best iteration is:
[872]	valid_0's binary_logloss: 0.177019

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f3d9766dc60>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.675835	valid-logloss:0.67758
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-l

[200]	train-logloss:0.109946	valid-logloss:0.22997
[250]	train-logloss:0.085591	valid-logloss:0.216576
[300]	train-logloss:0.067412	valid-logloss:0.209569
[350]	train-logloss:0.053443	valid-logloss:0.204393
[400]	train-logloss:0.042795	valid-logloss:0.199849
[450]	train-logloss:0.035289	valid-logloss:0.196983
[500]	train-logloss:0.029417	valid-logloss:0.195319
[550]	train-logloss:0.024848	valid-logloss:0.194299
[600]	train-logloss:0.021199	valid-logloss:0.195073
Stopping. Best iteration:
[539]	train-logloss:0.025857	valid-logloss:0.193449

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.355878
[100]	valid_0's binary_logloss: 0.277758
[150]	valid_0's binary_logloss: 0.247371
[200]	valid_0's binary_logloss: 0.236035
[250]	valid_0's binary_logloss: 0.225348
[300]	valid_0's binary_logloss: 0.219439
[350]	valid_0's binary_logloss: 0.218716
[400]	valid_0's binary_logloss: 0.216593
[450]	valid_0's binary_logloss: 0.213597
[500]	valid_0's binary_

[500]	valid_0's binary_logloss: 0.21732
[550]	valid_0's binary_logloss: 0.21909
[600]	valid_0's binary_logloss: 0.223989
Early stopping, best iteration is:
[507]	valid_0's binary_logloss: 0.216527
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.369318
[100]	valid_0's binary_logloss: 0.323828
[150]	valid_0's binary_logloss: 0.308264
[200]	valid_0's binary_logloss: 0.278646
[250]	valid_0's binary_logloss: 0.261244
[300]	valid_0's binary_logloss: 0.26088
[350]	valid_0's binary_logloss: 0.243852
[400]	valid_0's binary_logloss: 0.238661
[450]	valid_0's binary_logloss: 0.238551
[500]	valid_0's binary_logloss: 0.237744
[550]	valid_0's binary_logloss: 0.23219
[600]	valid_0's binary_logloss: 0.227851
[650]	valid_0's binary_logloss: 0.226584
[700]	valid_0's binary_logloss: 0.222072
[750]	valid_0's binary_logloss: 0.222701
[800]	valid_0's binary_logloss: 0.221484
[850]	valid_0's binary_logloss: 0.217205
[900]	valid_0's binary_logloss: 0.216969
[950]

[800]	valid_0's binary_logloss: 0.181568
Early stopping, best iteration is:
[743]	valid_0's binary_logloss: 0.18014

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f3d9766dc60>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.67682	valid-logloss:0.679387
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.31041	valid-logloss:0.392811
[100]	train-logloss:0.197741	valid-logloss:0.323502
[150]	train-logloss:0.139096	valid-logloss:0.290349
[200]	train-logloss:0.104027	valid-logloss:0.272018
[250]	train-logloss:0.08076	valid-logloss:0.264076
[300]	train-logloss:0.06284	valid-logloss:0.257397
[350]	train-logloss:0.051035	valid-logloss:0.253035
[400]	train-logloss:0.04148	valid-logloss:0.252661
[450]	train-logloss:0.034628	valid-logloss:0.249536
[500]	train-logloss:0.028956	valid-logloss:0.246927
[550]	train-logloss:0.024503	valid-logloss:0.246066
[600]	trai

In [7]:
for i,v in final_dict.items():
    print(v)

[0.21012963979079585, 0.20862186577636441, 0.20983491831459072, 0.205572354848373, 0.20768137792375349, 0.21065259329173974, 0.21025686751234796, 0.2092285317725569, 0.20937734740484334, 0.20710388216406561]
[0.21192038774617994, 0.21510278865547719, 0.21349441290720278, 0.21314132306764719, 0.21106262003875184, 0.21150269654329173, 0.20970752255349692, 0.21026733285106491, 0.21018590252162195, 0.21561990540992712]
[0.20853060000000001, 0.20904379999999997, 0.20738019999999996, 0.20734639999999999, 0.207984, 0.20712419999999998, 0.20816319999999999, 0.20811739999999998, 0.20851959999999997, 0.20861739999999998]


In [4]:
# This will be the version changed based on my own understanding
def save_blend(preds={}, loc='./'):
    target = 'is_iceberg'
    
    w_total = 0.0
    blend = None
    df_corr = None
    print('\nBlending...')
    for k, v in preds.items():
        if blend is None:
            blend = pd.read_csv('{0}/{1}'.format(loc, k))
            print('load: {0}, w={1}'.format(k, v))
            
            df_corr = pd.DataFrame({'id': blend['id'].tolist()})
            df_corr[k[16:-4]] = blend[target]
            
            w_total += v
            blend[target] = blend[target] * v
                
        else:
            preds_tmp = pd.read_csv('{0}/{1}'.format(loc, k))
            preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
            print('load: {0}, w={1}'.format(k, v))
            df_corr[k[16:-4]] = preds_tmp[target]
            
            w_total += v
            blend[target] += preds_tmp[target] * v
            del preds_tmp
            
    print('\n{}'.format(df_corr.corr()), flush=True)
    #write submission
    blend[target] = blend[target] / w_total
    print('\nPreview: \n{}'.format(blend.head()), flush=True)
    blend.to_csv('{}subm_blend{:03d}_{}.csv'.format(loc, len(preds), tmp), index=False, float_format='%.6f')

def run_lgb(params={}, lgb_train=None, lgb_valid=None, lgb_test=None, test_ids=None, nr_round=2000, min_round=100, file=''):

    print('\nLightGBM: {}'.format(params['boosting'])) 
    model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)
    
    pred = model2.predict(lgb_test, num_iteration=model2.best_iteration)
    #
    subm = pd.DataFrame({'id': test_ids, 'is_iceberg': pred})
    subm.to_csv(file, index=False, float_format='%.6f')
    #   
    df = pd.DataFrame({'feature':model2.feature_name(), 'importances': model2.feature_importance()})
    
    return pred, df

In [45]:
#results
freq = pd.DataFrame()

avg_result = []
#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)

ran_num = 463465 #463465#56491
for ran_num in [56491,463465]:
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 5
    kf = KFold(n_splits=nr_runs, random_state=split_seed)

    for param in [0.5,0.8,1.2,1.5]:
        result = []
        for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
            print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

            tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

            x1, x2 = train_X[train_index], train_X[test_index]
            y1, y2 = train_y[train_index], train_y[test_index]
            #x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=test_ratio, random_state=split_seed + r)
            print('splitted: {0}, {1}'.format(x1.shape, x2.shape), flush=True)
            test_X_dup = test_X.copy()

            #XGB
            xgb_train = xgb.DMatrix(x1, y1)
            xgb_valid = xgb.DMatrix(x2, y2)
            #
            watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
            params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'seed': 99, 'silent': True}
            params['eta'] = 0.05
            params['max_depth'] = 4
            params['subsample'] = 0.9
            params['eval_metric'] = 'logloss'
            params['colsample_bytree'] = 0.8
            params['colsample_bylevel'] = param
            params['max_delta_step'] = 3
            #params['gamma'] = 5.0
            params['labmda'] = param
            params['scale_pos_weight'] = 1.0
            params['seed'] = ran_num + r
            nr_round = 2000
            min_round = 100

            model1 = xgb.train(params, 
                               xgb_train, 
                               nr_round,  
                               watchlist, 
                               verbose_eval=50, 
                               early_stopping_rounds=min_round)

            pred_xgb = model1.predict(xgb.DMatrix(test_X_dup), ntree_limit=model1.best_ntree_limit)

            #
            msg= 'xgbfold%d'%r
            freq[msg] = pred_xgb
            result.append(model1.best_score)
        print('All your scores are: ')
        print(result)
        print('The average of your score')
        print(np.mean(result))
        avg_result.append(np.mean(result))
print(avg_result)

The seed we are using is: 56491

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f4f99a3e4c8>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.666214	valid-logloss:0.668001
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.23468	valid-logloss:0.302959
[100]	train-logloss:0.1385	valid-logloss:0.240614
[150]	train-logloss:0.090079	valid-logloss:0.213791
[200]	train-logloss:0.061226	valid-logloss:0.201662
[250]	train-logloss:0.043525	valid-logloss:0.195157
[300]	train-logloss:0.032578	valid-logloss:0.194752
[350]	train-logloss:0.024474	valid-logloss:0.193095
Stopping. Best iteration:
[280]	train-logloss:0.036201	valid-logloss:0.192763


round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f4f99a3e4c8>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.666596	valid-logloss:0.668727
Multiple eval metrics have been passed: 'valid-logloss' will be u

XGBoostError: b'value 1.2 for Parameter colsample_bylevel exceed bound [0,1]'

#model1.best_score

nta = [0.01,0.015,0.02,0.025,0.03]
[0.20835880000000001, 0.20946060000000002, 0.2080698, 0.20710020000000001, 0.20994420000000003]
#change to different split.
[0.2067958, 0.20759760000000002, 0.20812819999999999, 0.20848559999999999, 0.20875680000000002]

then we want to see tree depth sensitivity.
[2,3,4,5]
[0.2127414, 0.20789479999999999, 0.2080698, 0.20814539999999998
 
subsample 0.5,0.6,0.7,0.8,0.85,0.9,0.95 #maybe because data points are limited?
[0.21047080000000001, 0.20971980000000001, 0.2092918, 0.2080698,0.20451239, 0.2070148,0.20853039999999998]

change to other data split! consistent here!
[0.21159359999999999, 0.21181939999999999, 0.20943299999999998, 0.2081281,0.2075963 0.207232,0.20908] 

 
#### colsample_bytree 0.6,0.7,0.8,0.9
[0.2066904, 0.20730179999999998, 0.2070148, 0.21193840000000003]
[0.2090204, 0.20862359999999999, 0.207232, 0.20750860000000002]

#### max_delta: Maximum delta step we allow each tree’s weight estimation to be.
[0.20828599999999997, 0.2070148, 0.2070148, 0.2070148]
[0.20910699999999999, 0.207232, 0.207232, 0.207232]

  [0.6,0.7,0.8,0.9]
0.20991120000000002, 0.2083854, 0.20834060000000001, 0.20772740000000001, 
0.20727820000000002, 0.20918019999999998, 0.21255940000000001, 0.2111894

np.mean(result)

In [54]:
avg_result = []
#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)

ran_num = 463465 #463465#56491
for ran_num in [2312, 56491,463465]:
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 10
    kf = KFold(n_splits=nr_runs, random_state=split_seed)

    for param in [0.8]:
        result = []
        for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
            print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

            tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

            x1, x2 = train_X[train_index], train_X[test_index]
            y1, y2 = train_y[train_index], train_y[test_index] 

            ##LightGBM
            lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
            lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
            #gbdt
            params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
            params['boosting'] = 'gbdt'
            params['metric'] = 'binary_logloss'
            params['learning_rate'] = 0.03
            params['max_depth'] = 5
            params['num_leaves'] = 9 # higher number of leaves
            params['feature_fraction'] = param # Controls overfit
            params['bagging_fraction'] = 0.9    
            params['bagging_freq'] = 3
            params['seed'] = ran_num + r
            #
            params['verbose'] = -1

            file = 'gbm/subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
            subms.append(file)

            model2 = lgb.train(params, 
                           lgb_train, 
                           nr_round, 
                           lgb_valid, 
                           verbose_eval=50, early_stopping_rounds=min_round)
            result.append(model2.best_score['valid_0']['binary_logloss'])
            
        print('All your scores are: ')
        print(result)
        print('The average of your score')
        print(np.mean(result))
        avg_result.append(np.mean(result))
print(avg_result)

The seed we are using is: 2312

round 0001 of 0010, seed=<mtrand.RandomState object at 0x7f51feb63678>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.343065
[100]	valid_0's binary_logloss: 0.254834
[150]	valid_0's binary_logloss: 0.220965
[200]	valid_0's binary_logloss: 0.201352
[250]	valid_0's binary_logloss: 0.19127
[300]	valid_0's binary_logloss: 0.184706
[350]	valid_0's binary_logloss: 0.181717
[400]	valid_0's binary_logloss: 0.180455
[450]	valid_0's binary_logloss: 0.180666
[500]	valid_0's binary_logloss: 0.173666
[550]	valid_0's binary_logloss: 0.170401
[600]	valid_0's binary_logloss: 0.16884
[650]	valid_0's binary_logloss: 0.173522
Early stopping, best iteration is:
[588]	valid_0's binary_logloss: 0.167662

round 0002 of 0010, seed=<mtrand.RandomState object at 0x7f51feb63678>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.378399
[100]	valid_0's binary_logloss: 0.301905
[150]	valid_

[350]	valid_0's binary_logloss: 0.194827
[400]	valid_0's binary_logloss: 0.193151
[450]	valid_0's binary_logloss: 0.191603
[500]	valid_0's binary_logloss: 0.189083
[550]	valid_0's binary_logloss: 0.192485
Early stopping, best iteration is:
[486]	valid_0's binary_logloss: 0.187184

round 0004 of 0010, seed=<mtrand.RandomState object at 0x7f4fb93e6510>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.323819
[100]	valid_0's binary_logloss: 0.233011
[150]	valid_0's binary_logloss: 0.198224
[200]	valid_0's binary_logloss: 0.181314
[250]	valid_0's binary_logloss: 0.173802
[300]	valid_0's binary_logloss: 0.16524
[350]	valid_0's binary_logloss: 0.162879
[400]	valid_0's binary_logloss: 0.156504
[450]	valid_0's binary_logloss: 0.155874
[500]	valid_0's binary_logloss: 0.157055
Early stopping, best iteration is:
[427]	valid_0's binary_logloss: 0.153709

round 0005 of 0010, seed=<mtrand.RandomState object at 0x7f4fb93e6510>
Training until validation sc

[150]	valid_0's binary_logloss: 0.249987
[200]	valid_0's binary_logloss: 0.243479
[250]	valid_0's binary_logloss: 0.227212
[300]	valid_0's binary_logloss: 0.223046
[350]	valid_0's binary_logloss: 0.220706
[400]	valid_0's binary_logloss: 0.221034
Early stopping, best iteration is:
[342]	valid_0's binary_logloss: 0.218621

round 0007 of 0010, seed=<mtrand.RandomState object at 0x7f4f99ab5798>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.355334
[100]	valid_0's binary_logloss: 0.261883
[150]	valid_0's binary_logloss: 0.23218
[200]	valid_0's binary_logloss: 0.216746
[250]	valid_0's binary_logloss: 0.211628
[300]	valid_0's binary_logloss: 0.207755
[350]	valid_0's binary_logloss: 0.209932
[400]	valid_0's binary_logloss: 0.206766
[450]	valid_0's binary_logloss: 0.206726
[500]	valid_0's binary_logloss: 0.208441
Early stopping, best iteration is:
[415]	valid_0's binary_logloss: 0.205294

round 0008 of 0010, seed=<mtrand.RandomState object at 0x7

0.20997504631763109

num_leaves  [6,8,9,10,12,14,16,20]

2312:0.20693540784004155, 0.20807777115480661,0.208204, 0.2101704305882699, 0.20958977185302161, 0.2097247594090052, 0.21252432775232091, 0.21097843374429764
     0.21085407105688136, 0.20972141239725756,0.20768 , 0.20802226387790204, 0.21319986508838254, 0.2114961113773989, 0.20997504631763109, 0.21187703738304534, 
     0.21143941426691132, 0.20521332038873236,0.206531, 0.20727516177890259, 0.20963941105802347, 0.20857695536404003, 0.21374484604203184, 0.21189068566992506

try another one [6,7,8,9]
[0.21142256014136723, 0.20535415309766486, 0.20584818981873357, 0.20905026043297328]

## change num_leaf to 9

0.20826665232053476, 0.20893920897305204, 0.20820437367235373, 0.20789867033173567,
0.21021699406462796, 0.20997471390055775, 0.20768137792375355, 0.20762240986338024,
0.20571142430466466, 0.20512413321541617, 0.2065318518352405, 0.21059664548852566]



In [40]:
model2.best_score['valid_0']['binary_logloss']

0.21655854539955169

# Fine tune the last one

In [13]:
#results
avg_result = []
#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)

ran_num = 463465 #463465#56491
for ran_num in  [1123, 463465]:#  [1123,4677,6745 ,2312, 56491,463465]:
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 5
    kf = KFold(n_splits=nr_runs, random_state=split_seed)

    for param in [6,8,10,12,14,18,20,22]:
        result = []
        for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
            print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

            tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

            x1, x2 = train_X[train_index], train_X[test_index]
            y1, y2 = train_y[train_index], train_y[test_index] 

            ##LightGBM
            lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
            lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
            #gbdt
            params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
            params['boosting'] = 'dart'
            params['metric'] = 'binary_logloss'
            params['learning_rate'] = 0.04
            params['max_depth'] = 5
            params['num_leaves'] = param # higher number of leaves
            params['feature_fraction'] = 0.8 # Controls overfit
            params['bagging_fraction'] = 0.9    
            params['bagging_freq'] = 3
            params['seed'] = ran_num + r
            #dart
            params['drop_rate'] = 0.1
            params['skip_drop'] = 0.5
            params['max_drop'] = 10
            params['verbose'] = -1 

            model3 = lgb.train(params, 
                           lgb_train, 
                           nr_round, 
                           lgb_valid, 
                           verbose_eval=50, early_stopping_rounds=min_round)
            result.append(model3.best_score['valid_0']['binary_logloss'])
            
        print('All your scores are: ')
        print(result)
        print('The average of your score')
        print(np.mean(result))
        avg_result.append(np.mean(result))
print(avg_result)

The seed we are using is: 1123

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0ca8>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.441487
[100]	valid_0's binary_logloss: 0.374806
[150]	valid_0's binary_logloss: 0.343087
[200]	valid_0's binary_logloss: 0.319263
[250]	valid_0's binary_logloss: 0.295517
[300]	valid_0's binary_logloss: 0.286817
[350]	valid_0's binary_logloss: 0.283165
[400]	valid_0's binary_logloss: 0.26954
[450]	valid_0's binary_logloss: 0.259985
[500]	valid_0's binary_logloss: 0.252553
[550]	valid_0's binary_logloss: 0.246402
[600]	valid_0's binary_logloss: 0.245004
[650]	valid_0's binary_logloss: 0.23783
[700]	valid_0's binary_logloss: 0.234628
[750]	valid_0's binary_logloss: 0.229842
[800]	valid_0's binary_logloss: 0.230116
[850]	valid_0's binary_logloss: 0.231584
[900]	valid_0's binary_logloss: 0.227668
[950]	valid_0's binary_logloss: 0.223863
[1000]	valid_0's binary_logloss: 0.222578
[1050]	valid_0's


round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0ca8>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.405814
[100]	valid_0's binary_logloss: 0.341539
[150]	valid_0's binary_logloss: 0.300143
[200]	valid_0's binary_logloss: 0.273013
[250]	valid_0's binary_logloss: 0.253096
[300]	valid_0's binary_logloss: 0.243682
[350]	valid_0's binary_logloss: 0.226306
[400]	valid_0's binary_logloss: 0.226077
[450]	valid_0's binary_logloss: 0.21135
[500]	valid_0's binary_logloss: 0.209386
[550]	valid_0's binary_logloss: 0.204842
[600]	valid_0's binary_logloss: 0.200552
[650]	valid_0's binary_logloss: 0.196731
[700]	valid_0's binary_logloss: 0.194661
[750]	valid_0's binary_logloss: 0.189943
[800]	valid_0's binary_logloss: 0.189661
[850]	valid_0's binary_logloss: 0.185628
[900]	valid_0's binary_logloss: 0.187222
[950]	valid_0's binary_logloss: 0.185584
[1000]	valid_0's binary_logloss: 0.185861
[1050]	valid_0's binary_logloss: 0.183517
[110

[550]	valid_0's binary_logloss: 0.225871
[600]	valid_0's binary_logloss: 0.223918
[650]	valid_0's binary_logloss: 0.219983
[700]	valid_0's binary_logloss: 0.219146
[750]	valid_0's binary_logloss: 0.219527
[800]	valid_0's binary_logloss: 0.216357
[850]	valid_0's binary_logloss: 0.214872
[900]	valid_0's binary_logloss: 0.214897
[950]	valid_0's binary_logloss: 0.214093
[1000]	valid_0's binary_logloss: 0.213342
[1050]	valid_0's binary_logloss: 0.213104
[1100]	valid_0's binary_logloss: 0.2125
Early stopping, best iteration is:
[1036]	valid_0's binary_logloss: 0.211922

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0ca8>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.43424
[100]	valid_0's binary_logloss: 0.393704
[150]	valid_0's binary_logloss: 0.341988
[200]	valid_0's binary_logloss: 0.320549
[250]	valid_0's binary_logloss: 0.302033
[300]	valid_0's binary_logloss: 0.282979
[350]	valid_0's binary_logloss: 0.256518
[400]	vali


round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0ca8>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.396444
[100]	valid_0's binary_logloss: 0.331035
[150]	valid_0's binary_logloss: 0.288231
[200]	valid_0's binary_logloss: 0.258972
[250]	valid_0's binary_logloss: 0.238894
[300]	valid_0's binary_logloss: 0.229056
[350]	valid_0's binary_logloss: 0.215164
[400]	valid_0's binary_logloss: 0.21422
[450]	valid_0's binary_logloss: 0.201387
[500]	valid_0's binary_logloss: 0.197004
[550]	valid_0's binary_logloss: 0.19315
[600]	valid_0's binary_logloss: 0.188502
[650]	valid_0's binary_logloss: 0.182941
[700]	valid_0's binary_logloss: 0.18202
[750]	valid_0's binary_logloss: 0.180166
[800]	valid_0's binary_logloss: 0.178821
[850]	valid_0's binary_logloss: 0.176018
[900]	valid_0's binary_logloss: 0.177709
[950]	valid_0's binary_logloss: 0.178825
Early stopping, best iteration is:
[850]	valid_0's binary_logloss: 0.176018

round 0003 of

[500]	valid_0's binary_logloss: 0.230918
[550]	valid_0's binary_logloss: 0.225965
[600]	valid_0's binary_logloss: 0.225467
[650]	valid_0's binary_logloss: 0.219441
[700]	valid_0's binary_logloss: 0.218535
[750]	valid_0's binary_logloss: 0.21987
[800]	valid_0's binary_logloss: 0.21624
[850]	valid_0's binary_logloss: 0.213056
[900]	valid_0's binary_logloss: 0.213679
[950]	valid_0's binary_logloss: 0.21241
Early stopping, best iteration is:
[865]	valid_0's binary_logloss: 0.211803

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0ca8>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.428919
[100]	valid_0's binary_logloss: 0.388512
[150]	valid_0's binary_logloss: 0.335773
[200]	valid_0's binary_logloss: 0.313826
[250]	valid_0's binary_logloss: 0.302852
[300]	valid_0's binary_logloss: 0.282864
[350]	valid_0's binary_logloss: 0.25462
[400]	valid_0's binary_logloss: 0.245657
[450]	valid_0's binary_logloss: 0.238479
[500]	valid_0's

[100]	valid_0's binary_logloss: 0.330301
[150]	valid_0's binary_logloss: 0.289258
[200]	valid_0's binary_logloss: 0.26211
[250]	valid_0's binary_logloss: 0.240822
[300]	valid_0's binary_logloss: 0.231459
[350]	valid_0's binary_logloss: 0.21614
[400]	valid_0's binary_logloss: 0.212908
[450]	valid_0's binary_logloss: 0.201423
[500]	valid_0's binary_logloss: 0.198666
[550]	valid_0's binary_logloss: 0.192186
[600]	valid_0's binary_logloss: 0.18916
[650]	valid_0's binary_logloss: 0.185912
[700]	valid_0's binary_logloss: 0.185694
[750]	valid_0's binary_logloss: 0.183733
[800]	valid_0's binary_logloss: 0.182485
[850]	valid_0's binary_logloss: 0.177929
[900]	valid_0's binary_logloss: 0.179302
[950]	valid_0's binary_logloss: 0.181669
Early stopping, best iteration is:
[882]	valid_0's binary_logloss: 0.177813

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0ca8>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.460945
[100]	valid_0'

[700]	valid_0's binary_logloss: 0.2426
[750]	valid_0's binary_logloss: 0.239415
[800]	valid_0's binary_logloss: 0.236835
[850]	valid_0's binary_logloss: 0.235439
[900]	valid_0's binary_logloss: 0.231838
[950]	valid_0's binary_logloss: 0.232372
[1000]	valid_0's binary_logloss: 0.229348
[1050]	valid_0's binary_logloss: 0.228295
[1100]	valid_0's binary_logloss: 0.229309
[1150]	valid_0's binary_logloss: 0.228657
Early stopping, best iteration is:
[1052]	valid_0's binary_logloss: 0.227707

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0e58>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.486118
[100]	valid_0's binary_logloss: 0.413862
[150]	valid_0's binary_logloss: 0.387838
[200]	valid_0's binary_logloss: 0.34969
[250]	valid_0's binary_logloss: 0.322992
[300]	valid_0's binary_logloss: 0.305567
[350]	valid_0's binary_logloss: 0.287513
[400]	valid_0's binary_logloss: 0.258782
[450]	valid_0's binary_logloss: 0.251469
[500]	val

[100]	valid_0's binary_logloss: 0.348877
[150]	valid_0's binary_logloss: 0.331085
[200]	valid_0's binary_logloss: 0.299948
[250]	valid_0's binary_logloss: 0.283868
[300]	valid_0's binary_logloss: 0.276551
[350]	valid_0's binary_logloss: 0.270954
[400]	valid_0's binary_logloss: 0.251837
[450]	valid_0's binary_logloss: 0.253091
[500]	valid_0's binary_logloss: 0.249757
[550]	valid_0's binary_logloss: 0.242095
[600]	valid_0's binary_logloss: 0.236637
[650]	valid_0's binary_logloss: 0.231876
[700]	valid_0's binary_logloss: 0.22895
[750]	valid_0's binary_logloss: 0.225878
[800]	valid_0's binary_logloss: 0.223809
[850]	valid_0's binary_logloss: 0.222648
[900]	valid_0's binary_logloss: 0.22133
[950]	valid_0's binary_logloss: 0.218605
[1000]	valid_0's binary_logloss: 0.218667
[1050]	valid_0's binary_logloss: 0.215107
[1100]	valid_0's binary_logloss: 0.214972
[1150]	valid_0's binary_logloss: 0.214162
[1200]	valid_0's binary_logloss: 0.213469
[1250]	valid_0's binary_logloss: 0.212257
[1300]	valid

[650]	valid_0's binary_logloss: 0.224068
[700]	valid_0's binary_logloss: 0.219922
[750]	valid_0's binary_logloss: 0.2174
[800]	valid_0's binary_logloss: 0.216931
[850]	valid_0's binary_logloss: 0.217661
[900]	valid_0's binary_logloss: 0.217297
[950]	valid_0's binary_logloss: 0.216292
[1000]	valid_0's binary_logloss: 0.21237
[1050]	valid_0's binary_logloss: 0.212562
[1100]	valid_0's binary_logloss: 0.212217
Early stopping, best iteration is:
[1007]	valid_0's binary_logloss: 0.211571

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0e58>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.469858
[100]	valid_0's binary_logloss: 0.392005
[150]	valid_0's binary_logloss: 0.353068
[200]	valid_0's binary_logloss: 0.321732
[250]	valid_0's binary_logloss: 0.294593
[300]	valid_0's binary_logloss: 0.273859
[350]	valid_0's binary_logloss: 0.252808
[400]	valid_0's binary_logloss: 0.233652
[450]	valid_0's binary_logloss: 0.230732
[500]	vali

[800]	valid_0's binary_logloss: 0.181884
[850]	valid_0's binary_logloss: 0.181744
[900]	valid_0's binary_logloss: 0.179739
[950]	valid_0's binary_logloss: 0.178864
[1000]	valid_0's binary_logloss: 0.178855
[1050]	valid_0's binary_logloss: 0.176574
[1100]	valid_0's binary_logloss: 0.176152
[1150]	valid_0's binary_logloss: 0.177589
[1200]	valid_0's binary_logloss: 0.175458
[1250]	valid_0's binary_logloss: 0.175999
[1300]	valid_0's binary_logloss: 0.177261
Early stopping, best iteration is:
[1229]	valid_0's binary_logloss: 0.17534

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0e58>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.460566
[100]	valid_0's binary_logloss: 0.360393
[150]	valid_0's binary_logloss: 0.330018
[200]	valid_0's binary_logloss: 0.312121
[250]	valid_0's binary_logloss: 0.296363
[300]	valid_0's binary_logloss: 0.279756
[350]	valid_0's binary_logloss: 0.277078
[400]	valid_0's binary_logloss: 0.270457
[450

[400]	valid_0's binary_logloss: 0.247866
[450]	valid_0's binary_logloss: 0.250517
[500]	valid_0's binary_logloss: 0.244974
[550]	valid_0's binary_logloss: 0.236937
[600]	valid_0's binary_logloss: 0.233212
[650]	valid_0's binary_logloss: 0.230533
[700]	valid_0's binary_logloss: 0.227494
[750]	valid_0's binary_logloss: 0.224384
[800]	valid_0's binary_logloss: 0.220665
[850]	valid_0's binary_logloss: 0.221983
Early stopping, best iteration is:
[796]	valid_0's binary_logloss: 0.220403

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0e58>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.409409
[100]	valid_0's binary_logloss: 0.338374
[150]	valid_0's binary_logloss: 0.301523
[200]	valid_0's binary_logloss: 0.271596
[250]	valid_0's binary_logloss: 0.242521
[300]	valid_0's binary_logloss: 0.228017
[350]	valid_0's binary_logloss: 0.221772
[400]	valid_0's binary_logloss: 0.211973
[450]	valid_0's binary_logloss: 0.205316
[500]	valid


num_leaves here: 16

0.20996463972458032, 0.2105201451940987, 0.21304611830042508, 0.21343599728787993, 0.21106262003875184, 0.21354386772253284

num_leaves here: 9

0.21011925200129097, 0.215387241291573, 0.21029209937609511, 0.21529853022367157, 0.21635515450387191, 0.21704195707060975


In [21]:
# This will be the version changed based on my own understanding
def save_blend(preds={}, loc='./'):
    target = 'is_iceberg'
    
    w_total = 0.0
    blend = None
    df_corr = None
    print('\nBlending...')
    for k, v in preds.items():
        if blend is None:
            blend = pd.read_csv('{0}/{1}'.format(loc, k))
            print('load: {0}, w={1}'.format(k, v))
            
            df_corr = pd.DataFrame({'id': blend['id'].tolist()})
            df_corr[k[16:-4]] = blend[target]
            
            w_total += v
            blend[target] = blend[target] * v
                
        else:
            preds_tmp = pd.read_csv('{0}/{1}'.format(loc, k))
            preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
            print('load: {0}, w={1}'.format(k, v))
            df_corr[k[16:-4]] = preds_tmp[target]
            
            w_total += v
            blend[target] += preds_tmp[target] * v
            del preds_tmp
            
    print('\n{}'.format(df_corr.corr()), flush=True)
    #write submission
    blend[target] = blend[target] / w_total
    print('\nPreview: \n{}'.format(blend.head()), flush=True)
    blend.to_csv('{}subm_blend{:03d}_{}.csv'.format(loc, len(preds), tmp), index=False, float_format='%.6f')

def run_lgb(params={}, lgb_train=None, lgb_valid=None, lgb_test=None, test_ids=None, nr_round=2000, min_round=100, file=''):

    print('\nLightGBM: {}'.format(params['boosting'])) 
    model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=False, early_stopping_rounds=min_round)
    
    pred = model2.predict(lgb_test, num_iteration=model2.best_iteration)
    #
    if model2.best_score['valid_0']['binary_logloss']<=0.195:
        subm = pd.DataFrame({'id': test_ids, 'is_iceberg': pred})
        subm.to_csv(file, index=False, float_format='%.6f')
    #   
    df = pd.DataFrame({'feature':model2.feature_name(), 'importances': model2.feature_importance()})
    
    return pred, df


#results
freq = pd.DataFrame()
subms = []

#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)
seed_list=[]
# final_dict ={}
# final_dict['xgb_re'] = []
# final_dict['lgb_re'] = []
# final_dict['lgb_dart_re'] =[]
for rep in range(100):
    ran_num =  np.random.randint(50000,60000,size=1)[0]
    seed_list.append(ran_num)
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 5
    kf = KFold(n_splits=nr_runs, random_state=split_seed)
    tree_lim =0
#     xgb_re = []
#     lgb_re =[]
#     lgb_dart_re= []
    for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
        print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

        tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

        x1, x2 = train_X[train_index], train_X[test_index]
        y1, y2 = train_y[train_index], train_y[test_index]
        #x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=test_ratio, random_state=split_seed + r)
        print('splitted: {0}, {1}'.format(x1.shape, x2.shape), flush=True)
        test_X_dup = test_X.copy()

        #XGB
        xgb_train = xgb.DMatrix(x1, y1)
        xgb_valid = xgb.DMatrix(x2, y2)
        #
        watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
        params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'seed': 99, 'silent': True}
        params['eta'] = 0.03
        params['max_depth'] = 4
        params['subsample'] = 0.9
        params['eval_metric'] = 'logloss'
        params['colsample_bytree'] = 0.8
        params['colsample_bylevel'] = 0.8
        params['max_delta_step'] = 3
        #params['gamma'] = 5.0
        #params['labmda'] = 1
        params['scale_pos_weight'] = 1.0
        params['silent'] = 1
        params['seed'] = ran_num + r
        nr_round = 2000
        min_round = 100

        model1 = xgb.train(params, 
                           xgb_train, 
                           nr_round,  
                           watchlist, 
                           verbose_eval=False,
                           early_stopping_rounds=min_round)

        pred_xgb = model1.predict(xgb.DMatrix(test_X_dup), ntree_limit=model1.best_ntree_limit+tree_lim)

        #
        file = 'gbm/subm_xgb{}{}.csv'.format(rep, r+1)
        if model1.best_score<=0.195:
            subm = pd.DataFrame({'id': test['id'].values, target: pred_xgb})
            subm.to_csv(file, index=False)
        subms.append(file)    

        
        lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
        lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
        #gbdt
        params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
        params['boosting'] = 'gbdt'
        params['metric'] = 'binary_logloss'
        params['learning_rate'] = 0.03
        params['max_depth'] = 5
        params['num_leaves'] = 9 # higher number of leaves
        params['feature_fraction'] = 0.8 # Controls overfit
        params['bagging_fraction'] = 0.9    
        params['bagging_freq'] = 3
        params['seed'] = ran_num + r
        params['silent'] = True
        params['verbose'] = 1000

        file = 'subm_orilgb{}{}.csv'.format(rep, r+1)
        subms.append(file)

        pred, f_tmp = run_lgb(params=params, 
                              lgb_train=lgb_train, 
                              lgb_valid=lgb_valid, 
                              lgb_test=test_X_dup, 
                              test_ids=test['id'].values, 
                              nr_round=nr_round, 
                              min_round=min_round, 
                              file=file)
        
        
        ##LightGBM
        #dart
        params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
        params['boosting'] = 'dart'
        params['metric'] = 'binary_logloss'
        params['learning_rate'] = 0.04
        params['max_depth'] = 5
        params['num_leaves'] = 16 # higher number of leaves
        params['feature_fraction'] = 0.8 # Controls overfit
        params['bagging_fraction'] = 0.9    
        params['bagging_freq'] = 3
        params['seed'] = ran_num + r
        #dart
        params['drop_rate'] = 0.1
        params['skip_drop'] = 0.5
        params['max_drop'] = 10
        params['verbose'] = 1000
        params['silent'] = True
        
        file = 'gbm/subm_lgb{}{}.csv'.format(rep, r+1)
        subms.append(file)

        pred, f_tmp = run_lgb(params=params, 
                              lgb_train=lgb_train, 
                              lgb_valid=lgb_valid, 
                              lgb_test=test_X_dup, 
                              test_ids=test['id'].values, 
                              nr_round=nr_round, 
                              min_round=min_round, 
                              file=file)
    if rep%5==0:
        print(rep)
        

The seed we are using is: 58258

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f24a379a7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))



LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f24a379a7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f24a379a7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f24a379a7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f24a379a7e0>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
0
The seed we are using is: 51225

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f24a2da3c60>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f24a2da3c60>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f24a2da3c60>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt



LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f24a2da3c60>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f24a2da3c60>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
The seed we are using is: 59965

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a6f948>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a6f948>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a6f948>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a6f948>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a6f948>
splitted: (1284, 247), (320, 247)

L


LightGBM: gbdt

LightGBM: dart
The seed we are using is: 56371

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a4b948>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a4b948>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a4b948>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a4b948>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f24a2a4b948>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
The seed we are using is: 50146

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
spli


LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
The seed we are using is: 56111

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

L


LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
45
The seed we are using is: 53670

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
The seed we are using is: 54356

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
s


LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
The seed we are using is: 57140

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

L


LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
The seed we are using is: 50394

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1284, 247), (320, 247)

L


LightGBM: gbdt

LightGBM: dart
The seed we are using is: 50583

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
80
The seed we are using is: 59164

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
s


LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0005 of 0005, seed=<mtrand.RandomState object at 0x7f24a3adf438>
splitted: (1284, 247), (320, 247)

LightGBM: gbdt

LightGBM: dart
The seed we are using is: 57290

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

LightGBM: gbdt

LightGBM: dart

round 0004 of 0005, seed=<mtrand.RandomState object at 0x7f247d46d7e0>
splitted: (1283, 247), (321, 247)

L

In [29]:
import os
#print(os.listdir('gbm'))
waiting_list= [os.path.join('gbm',i) for i in os.listdir('gbm')]
len(waiting_list)
#w_total

513

In [32]:
k = pd.DataFrame()
blend = None
df_corr = None
print('\nBlending...')
v=1
for num,path in enumerate(waiting_list):
    if blend is None:
        blend = pd.read_csv(path)

        w_total += v
        blend[target] = blend[target] * v

    else:
        preds_tmp = pd.read_csv(path)
        preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
        msg = "is_iceberg%d"% (num+1)
        blend[msg] = preds_tmp[target] * v

        



Blending...


In [36]:
blend.to_csv('gbm_test.csv')

In [31]:
target = 'is_iceberg'

w_total = 0.0
blend = None
df_corr = None
print('\nBlending...')
v=1
for path in waiting_list:
    if blend is None:
        blend = pd.read_csv(path)

        w_total += v
        blend[target] = blend[target] * v

    else:
        preds_tmp = pd.read_csv(path)
        preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
        
        w_total += v
        blend[target] += preds_tmp[target] * v
        del preds_tmp

blend[target] = blend[target] / w_total
print('\nPreview: \n{}'.format(blend.head()), flush=True)
blend.to_csv('submission100.csv',index=False)


Blending...

Preview: 
         id  is_iceberg
0  5941774d    0.107648
1  4023181e    0.959657
2  b20200e4    0.221938
3  e7f018bb    0.985318
4  4371c8c3    0.922201


In [7]:
def save_blend(preds={}, loc='./'):
    
    target = 'is_iceberg'
    
    w_total = 0.0
    blend = None
    df_corr = None
    print('\nBlending...')
    for k, v in preds.items():
        if blend is None:
            blend = pd.read_csv('{0}/{1}'.format(loc, k))
            print('load: {0}, w={1}'.format(k, v))
            
            df_corr = pd.DataFrame({'id': blend['id'].tolist()})
            df_corr[k[16:-4]] = blend[target]
            
            w_total += v
            blend[target] = blend[target] * v
                
        else:
            preds_tmp = pd.read_csv('{0}/{1}'.format(loc, k))
            preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
            print('load: {0}, w={1}'.format(k, v))
            df_corr[k[16:-4]] = preds_tmp[target]
            
            w_total += v
            blend[target] += preds_tmp[target] * v
            del preds_tmp
            
    print('\n{}'.format(df_corr.corr()), flush=True)
    #write submission
    blend[target] = blend[target] / w_total
    print('\nPreview: \n{}'.format(blend.head()), flush=True)
    blend.to_csv('{}subm_blend{:03d}_{}.csv'.format(loc, len(preds), tmp), index=False, float_format='%.6f')

def run_lgb(params={}, lgb_train=None, lgb_valid=None, lgb_test=None, test_ids=None, nr_round=2000, min_round=100, file=''):

    print('\nLightGBM: {}'.format(params['boosting'])) 
    model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)
    
    pred = model2.predict(lgb_test, num_iteration=model2.best_iteration)
    #
    subm = pd.DataFrame({'id': test_ids, 'is_iceberg': pred})
    subm.to_csv(file, index=False, float_format='%.6f')
    #   
    df = pd.DataFrame({'feature':model2.feature_name(), 'importances': model2.feature_importance()})
    
    return pred, df


#results
freq = pd.DataFrame()
subms = []

#training
test_ratio = 0.2
nr_runs = 3
split_seed = 25
kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)

for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
    print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

    tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

    x1, x2 = train_X[train_index], train_X[test_index]
    y1, y2 = train_y[train_index], train_y[test_index]
    #x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=test_ratio, random_state=split_seed + r)
    print('splitted: {0}, {1}'.format(x1.shape, x2.shape), flush=True)
    test_X_dup = test_X.copy()

    #XGB
    xgb_train = xgb.DMatrix(x1, y1)
    xgb_valid = xgb.DMatrix(x2, y2)
    #
    watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
    params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'seed': 99, 'silent': True}
    params['eta'] = 0.03
    params['max_depth'] = 4
    params['subsample'] = 0.9
    params['eval_metric'] = 'logloss'
    params['colsample_bytree'] = 0.8
    params['colsample_bylevel'] = 0.8
    params['max_delta_step'] = 3
    #params['gamma'] = 5.0
    #params['labmda'] = 1
    params['scale_pos_weight'] = 1.0
    params['seed'] = split_seed + r
    nr_round = 2000
    min_round = 100

    model1 = xgb.train(params, 
                       xgb_train, 
                       nr_round,  
                       watchlist, 
                       verbose_eval=50, 
                       early_stopping_rounds=min_round)

    pred_xgb = model1.predict(xgb.DMatrix(test_X_dup), ntree_limit=model1.best_ntree_limit+45)

    #
    file = 'subm_{}_xgb_{:02d}.csv'.format(tmp, r+1)
    subm = pd.DataFrame({'id': test['id'].values, target: pred_xgb})
    subm.to_csv(file, index=False, float_format='%.6f')
    subms.append(file)    

    ##LightGBM
    lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
    lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
    #gbdt
    params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
    params['boosting'] = 'gbdt'
    params['metric'] = 'binary_logloss'
    params['learning_rate'] = 0.03
    params['max_depth'] = 5
    params['num_leaves'] = 16 # higher number of leaves
    params['feature_fraction'] = 0.8 # Controls overfit
    params['bagging_fraction'] = 0.9    
    params['bagging_freq'] = 3
    params['seed'] = split_seed + r
    #
    params['verbose'] = -1

    file = 'subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
    subms.append(file)

    pred, f_tmp = run_lgb(params=params, 
                          lgb_train=lgb_train, 
                          lgb_valid=lgb_valid, 
                          lgb_test=test_X_dup, 
                          test_ids=test['id'].values, 
                          nr_round=nr_round, 
                          min_round=min_round, 
                          file=file)

    ##LightGBM
    #dart
    params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
    params['boosting'] = 'dart'
    params['metric'] = 'binary_logloss'
    params['learning_rate'] = 0.04
    params['max_depth'] = 5
    params['num_leaves'] = 16 # higher number of leaves
    params['feature_fraction'] = 0.8 # Controls overfit
    params['bagging_fraction'] = 0.9    
    params['bagging_freq'] = 3
    params['seed'] = split_seed + r
    #dart
    params['drop_rate'] = 0.1
    params['skip_drop'] = 0.5
    params['max_drop'] = 10
    params['verbose'] = -1 

    file = 'subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
    subms.append(file)

    pred, f_tmp = run_lgb(params=params, 
                          lgb_train=lgb_train, 
                          lgb_valid=lgb_valid, 
                          lgb_test=test_X_dup, 
                          test_ids=test['id'].values, 
                          nr_round=nr_round, 
                          min_round=min_round, 
                          file=file)


#blending
preds = {k: 1.0 for k in subms}
save_blend(preds=preds)

[1m[34m.[m[m
[1m[34m..[m[m
.DS_Store
[1m[34m.git[m[m
[1m[34m.ipynb_checkpoints[m[m
36_plain_cnn.csv
41_plain_cnn.csv
50_plain_fcn.csv
67_plain_cnn.csv
6_retrain_inception.csv
Image preprocess testing.ipynb
README.md
Ship-Iceberg Discrimination with Convolutional Neural Networks in High Resolution SAR Images.pdf
The Effectiveness of Data Augmentation in Image Classification using Deep Learning.pdf
Training_log.ipynb
[1m[34m__pycache__[m[m
all_14_inception.csv
cnn.ipynb
cnn.py
cnn_angle.ipynb
cnn_angle.py
[1m[34mdata[m[m
densenet.py
densenet121.ipynb
densenet121_pseudl.ipynb
densenetBC.py
densenetbc100.ipynb
fcn.ipynb
fcn.py
final ensemble.ipynb
gbm.ipynb
inception.ipynb
inception.py
[1m[34mothers[m[m
pre_resnet.py
pre_vgg.py
r2_11_plain_cnn.csv
r2_fcn_11_models.csv
resnet.py
resnet101.ipynb
resnet101_4feat.ipynb
resnet152.ipynb
resnet18.ipynb
resnet34.ipynb
resnet34_4feat.ipynb
resnet34_onlygoodretrain.csv
resnet3

In [10]:
ran_num

array([52161])

In [1]:
import torch

In [4]:
a=  torch.Tensor([1,2,3])
a.size()
a=a.unsqueeze(1)
a.size()

torch.Size([3, 1])