In [30]:
from multiprocessing import Pool
from tqdm import tqdm
import gc
#
import numpy as np # linear algebra
import pandas as pd # data processing
import datetime as dt
#
from collections import OrderedDict
from random import choice, sample, shuffle, uniform, seed
from math import exp, expm1, log1p, log10, log2, sqrt, ceil, floor, isfinite, isnan
from itertools import combinations
#import for image processing
#import cv2
from scipy.stats import kurtosis, skew
from scipy.ndimage import laplace, sobel
#evaluation
from sklearn.model_selection import StratifiedShuffleSplit, KFold
from sklearn.metrics import log_loss
import xgboost as xgb
import lightgbm as lgb

###############################################################################
def read_jason(file='', loc='../input/'):
    print('{}{}'.format(loc, file))
    df = pd.read_json('{}{}'.format(loc, file))
    df['inc_angle'] = df['inc_angle'].replace('na', -1).astype(float)
    #print(df['inc_angle'].value_counts())
    
    band1 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df["band_1"]])
    band2 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df["band_2"]])
    df = df.drop(['band_1', 'band_2'], axis=1)
    
    bands = np.stack((band1, band2,  0.5 * (band1 + band2)), axis=-1)  # -1 means add to the last dimension
    del band1, band2
    
    return df, bands

###############################################################################
#forked from
#https://www.kaggle.com/the1owl/planet-understanding-the-amazon-from-space/natural-growth-patterns-fractals-of-nature/notebook
def img_to_stats(paths):
    
    img_id, img = paths[0], paths[1]
    
    #ignored error    
    np.seterr(divide='ignore', invalid='ignore')
    
    bins = 20
    scl_min, scl_max = -50, 50
    opt_poly = True
    #opt_poly = False
    
    try:
        st = []
        st_interv = []
        hist_interv = []
        for i in range(img.shape[2]):
            img_sub = np.squeeze(img[:, :, i])
            
            #median, max and min
            sub_st = []
            sub_st += [np.mean(img_sub), np.std(img_sub), np.max(img_sub), np.median(img_sub), np.min(img_sub)]
            sub_st += [(sub_st[2] - sub_st[3]), (sub_st[2] - sub_st[4]), (sub_st[3] - sub_st[4])] 
            sub_st += [(sub_st[-3] / sub_st[1]), (sub_st[-2] / sub_st[1]), (sub_st[-1] / sub_st[1])] #normalized by stdev
            st += sub_st
            #Laplacian, Sobel, kurtosis and skewness
            st_trans = []
            st_trans += [laplace(img_sub, mode='reflect', cval=0.0).ravel().var()] #blurr
            sobel0 = sobel(img_sub, axis=0, mode='reflect', cval=0.0).ravel().var()
            sobel1 = sobel(img_sub, axis=1, mode='reflect', cval=0.0).ravel().var()
            st_trans += [sobel0, sobel1]
            st_trans += [kurtosis(img_sub.ravel()), skew(img_sub.ravel())]
            
            if opt_poly:
                st_interv.append(sub_st)
                #
                st += [x * y for x, y in combinations(st_trans, 2)]
                st += [x + y for x, y in combinations(st_trans, 2)]
                st += [x - y for x, y in combinations(st_trans, 2)]                
 
            #hist
            #hist = list(cv2.calcHist([img], [i], None, [bins], [0., 1.]).flatten())
            hist = list(np.histogram(img_sub, bins=bins, range=(scl_min, scl_max))[0])
            hist_interv.append(hist)
            st += hist
            st += [hist.index(max(hist))] #only the smallest index w/ max value would be incl
            st += [np.std(hist), np.max(hist), np.median(hist), (np.max(hist) - np.median(hist))]

        if opt_poly:
            for x, y in combinations(st_interv, 2):
                st += [float(x[j]) * float(y[j]) for j in range(len(st_interv[0]))]

            for x, y in combinations(hist_interv, 2):
                hist_diff = [x[j] * y[j] for j in range(len(hist_interv[0]))]
                st += [hist_diff.index(max(hist_diff))] #only the smallest index w/ max value would be incl
                st += [np.std(hist_diff), np.max(hist_diff), np.median(hist_diff), (np.max(hist_diff) - np.median(hist_diff))]
                
        #correction
        nan = -999
        for i in range(len(st)):
            if isnan(st[i]) == True:
                st[i] = nan
                
    except:
        print('except: ')
    
    return [img_id, st]


def extract_img_stats(paths):
    imf_d = {}
    p = Pool(8) #(cpu_count())
    ret = p.map(img_to_stats, paths)   # list of pair of (id, bands) bands is np.array shape (75, 75, 3)
    for i in tqdm(range(len(ret)), miniters=100):
        imf_d[ret[i][0]] = ret[i][1]

    ret = []
    fdata = [imf_d[i] for i, j in paths]
    return np.array(fdata, dtype=np.float32)


def process(df, bands):

    data = extract_img_stats([(k, v) for k, v in zip(df['id'].tolist(), bands)]); gc.collect() #(N, 246)
    data = np.concatenate([data, df['inc_angle'].values[:, np.newaxis]], axis=-1); gc.collect() #(N, 247)

    print(data.shape)
    return data

###############################################################################
if __name__ == '__main__':
    
    np.random.seed(1017)
    target = 'is_iceberg'
    def iso(arr):
        p = np.reshape(np.array(arr), [75,75]) >(np.mean(np.array(arr))+2*np.std(np.array(arr)))
        return p * np.reshape(np.array(arr), [75,75])
    def size(arr):     
        return float(np.sum(arr<-5))/(75*75)
    #Load data
    train, train_bands = read_jason(file='train.json', loc='data/processed/')
    test, test_bands = read_jason(file='test.json', loc='data/processed/')

    train_X_full = process(df=train, bands=train_bands)
    train_y_full = train[target].values    
    test_X = process(df=test, bands=test_bands)
    
    data = pd.read_json('data/processed/train.json')
    test = pd.read_json('data/processed/test.json')

    data['iso1'] = data.iloc[:, 0].apply(iso)
    data['iso2'] = data.iloc[:, 1].apply(iso)
    test['iso1'] = test.iloc[:, 0].apply(iso)
    test['iso2'] = test.iloc[:, 1].apply(iso)
    # Feature engineering s1 s2 and size.
    data['s1'] = data.iloc[:,5].apply(size)
    data['s2'] = data.iloc[:,6].apply(size)
    test['s1'] = test['iso1'].apply(size)
    test['s2'] = test['iso2'].apply(size)
    train_X_full= np.concatenate([train_X_full, data.iloc[:,7:]],axis=1)
    test_X = np.concatenate([test_X, test.iloc[:,6:]],axis=1)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f247307fc18>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/xgboost-0.6-py3.5.egg/xgboost/core.py", line 368, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


data/processed/train.json
data/processed/test.json


100%|██████████| 1604/1604 [00:00<00:00, 937261.58it/s]


(1604, 247)


100%|██████████| 8424/8424 [00:00<00:00, 1154290.00it/s]


(8424, 247)


Process ForkPoolWorker-30:
Process ForkPoolWorker-29:
Process ForkPoolWorker-31:
Process ForkPoolWorker-18:
Process ForkPoolWorker-27:
Process ForkPoolWorker-26:
Process ForkPoolWorker-32:
Process ForkPoolWorker-23:
Process ForkPoolWorker-22:
Process ForkPoolWorker-17:
Process ForkPoolWorker-24:
Process ForkPoolWorker-21:
Process ForkPoolWorker-25:
Process ForkPoolWorker-20:
Process ForkPoolWorker-28:
Process ForkPoolWorker-19:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process

  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 342, in get
    with self._rlock:
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 342, in get
    with self._rlock:
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.

In [10]:
# train_X= train_X_full[train_X_full[:,-1]!=-1] #train_X[:,-1]==-1
# train_y= train_y_full[train_X_full[:,-1]!=-1]

train_X= train_X_full
train_y= train_y_full

# train_X = np.delete(train_X_full,249,1)
# test_X = np.delete(test_X,249,1)
# print(train_X.shape, train_y.shape)
# train_X[:,246]

# def iso(arr):
#     p = np.reshape(np.array(arr), [75,75]) >(np.mean(np.array(arr))+2*np.std(np.array(arr)))
#     return p * np.reshape(np.array(arr), [75,75])
# def size(arr):     
#     return float(np.sum(arr<-5))/(75*75)
# # data = pd.read_json('data/processed/train.json')

# data['iso1'] = data.iloc[:, 0].apply(iso)
# data['iso2'] = data.iloc[:, 1].apply(iso)
# test['iso1'] = test.iloc[:, 0].apply(iso)
# test['iso2'] = test.iloc[:, 1].apply(iso)
# # Feature engineering s1 s2 and size.
# data['s1'] = data.iloc[:,5].apply(size)
# data['s2'] = data.iloc[:,6].apply(size)
# test['s1'] = test['iso1'].apply(size)
# test['s2'] = test['iso2'].apply(size)
# train_X_full= np.concatenate([train_X_full, data.iloc[:,7:]],axis=1)
# test_X = np.concatenate([test_X, test.iloc[:,7:]],axis=1)
# # data['iso1'] = data.iloc[:, 0].apply(iso)
# # data['iso2'] = data.iloc[:, 1].apply(iso)
# # # Feature engineering s1 s2 and size.
# # data['s1'] = data.iloc[:,5].apply(size)
# # data['s2'] = data.iloc[:,6].apply(size)


In [9]:
test

Unnamed: 0,band_1,band_2,id,inc_angle,is_iceberg,iso1,iso2,s1,s2
0,"[-27.878360999999998, -27.15416, -28.668615, -...","[-27.154118, -29.537888, -31.0306, -32.190483,...",dfd5f913,43.9239,0,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0.011378,0.010133
1,"[-12.242375, -14.920304999999999, -14.920363, ...","[-31.506321, -27.984554, -26.645678, -23.76760...",e25388fd,38.1562,0,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0.003022,0.016356
2,"[-24.603676, -24.603714, -24.871029, -23.15277...","[-24.870956, -24.092632, -20.653963, -19.41104...",58b2aaa0,45.2859,1,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -19.411043, -0.0, -0.0, -2...",0.016178,0.010667
3,"[-22.454607, -23.082819, -23.998013, -23.99805...","[-27.889421, -27.519794, -27.165262, -29.10350...",4cfc3a18,43.8306,0,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0.009244,0.008533
4,"[-26.006956, -23.164886, -23.164886, -26.89116...","[-27.206915, -30.259186, -30.259186, -23.16495...",271f93f4,35.6256,0,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0.012089,0.007822
5,"[-20.769371, -20.769434, -25.906025, -25.90602...","[-29.288746, -29.712593, -28.884804, -28.88480...",b51d18b5,36.9034,1,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0.021333,0.027200
6,"[-26.673811, -23.666162, -27.622442, -28.31768...","[-24.557735, -26.97868, -27.622442, -29.073456...",31da1a04,34.4751,1,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -21.9427...",0.023644,0.013156
7,"[-24.989119, -27.755224, -25.817074, -24.98927...","[-27.755173, -26.732174, -28.124943, -31.83772...",56929c16,41.1769,0,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0.019733,0.009422
8,"[-17.146641, -17.146572, -17.994583, -19.44553...","[-25.733608, -24.472507, -24.710424, -22.77215...",525ab75c,35.7829,0,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0.001422,0.020622
9,"[-24.020853, -23.551275, -27.18819, -29.126434...","[-28.702518, -33.563324, -29.571918, -29.12643...",192f56eb,43.3007,0,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...","[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0.010667,0.011556


In [2]:
def gen_fft(dt):
    result =pd.DataFrame()
    for j in range(0,dt.shape[0]):            
        for band in ['band_1', 'band_2']:
            x = np.array(dt.iloc[j,:][band])
            multiplier = 1.1
            threshold_h = 45.0
            threshold_v = 45.0
            mean_value = {}


            mean_value['h'] = []
            xx = []
            sph = None
            for i in range(75):
                th = np.reshape(x,(75,75))[i,:]
                sph = np.fft.fft(th)
                mnh = np.mean(abs(sph))     
                sph[abs(sph)<mnh*multiplier] = 0.0
                xx.append(abs(np.fft.ifft(sph)))
                mean_value['h'].append(mnh)
            mxh = np.max(mean_value['h'])
            mih = np.min(mean_value['h'])
            mnh = np.mean(mean_value['h'])


            mean_value['v']=[]
            yy = []
            spv = None
            for i in range(75):
                tv = np.reshape(x,(75,75))[:,i]
                spv = np.fft.fft(tv)
                mnv = np.mean(abs(spv))
                spv[abs(spv)<mnv*multiplier] = 0.0
                yy.append(abs(np.fft.ifft(spv)))
                mean_value['v'].append(mnv)

            mxv = np.max(mean_value['v'])
            miv = np.min(mean_value['v'])                
            mnv = np.mean(mean_value['v'])

            estimate_size = sum(mean_value['v'] > mnv*multiplier)*sum(mean_value['h'] > mnh*multiplier)

            yy = np.transpose(yy)
            if band == 'band_1':
#             fft_data = {'band':band, 'id':dt.iloc[j,:].id, 'inc_angle':dt.iloc[j,:].inc_angle, 'is_iceberg':dt.iloc[j,:].is_iceberg, 'mxv':mxv,'miv':miv,'mnv':mnv,'mean_value_v':mean_value['v'], 'size_v': sum(mean_value['v'] > mnv*multiplier)
#                         ,'mxh':mxh,'mih':mih,'mnh':mnh,'mean_value_h':mean_value['h'], 'size_h': sum(mean_value['h'] > mnh*multiplier) }
                fft_data = (('id',[dt.iloc[j,:].id]), ('mxv',mxv),('miv',miv),('mnv',mnv), ('size_v',sum(mean_value['v'] > mnv*multiplier)),
                ('mxh',mxh),('mih',mih),('mnh',mnh), ('size_h', sum(mean_value['h'] > mnh*multiplier)))
            if band == 'band_2':
                fft_data += (('mxv2',mxv),('miv2',miv),('mnv2',mnv), ('size_v2',sum(mean_value['v'] > mnv*multiplier)),
                ('mxh2',mxh),('mih2',mih),('mnh2',mnh), ('size_h2', sum(mean_value['h'] > mnh*multiplier)))
#             print(len(mean_value['v']))
        temp = pd.DataFrame.from_dict(OrderedDict(fft_data))
        result = pd.concat([result,temp])
        
    return result
                    
                    

a = gen_fft(data)
b = gen_fft(test)
# train_X_full = np.concatenate((train_X_full, a.drop(['id']).values),axis=1)
# test_X = np.concatenate((test_X, b.drop(['id']).values),axis=1)

In [31]:
# np.delete(train_X_full,249,1)[1,:]
# train_X_full[1,:]
# a.drop(['id'],axis=1)
train_X_full = np.concatenate((train_X_full, a.drop(['id'],axis=1).values),axis=1)
test_X = np.concatenate((test_X, b.drop(['id'],axis=1).values),axis=1)
# test_X = np.concatenate([test_X, test.iloc[:,6:]],axis=1)
# test_X = np.concatenate((test_X, b.drop(['id'],axis=1).values),axis=1)

In [32]:
# test_X = test_X[:,:246]
# test_X.shape
print(train_X_full.shape)
print(test_X.shape)
#246+17+2
# test.iloc[:,6:]


(1604, 265)
(8424, 265)


In [31]:
test

0       dfd5f913
1       e25388fd
2       58b2aaa0
3       4cfc3a18
4       271f93f4
5       b51d18b5
6       31da1a04
7       56929c16
8       525ab75c
9       192f56eb
10      3aac67cd
11      161a6860
12      3c794f0c
13      86730f0d
14      e356f7a3
15      87592c38
16      1c18a39e
17      a210f335
18      958d155f
19      6d81d201
20      75126706
21      112a6cfa
22      a29662a4
23      bd1a1bdf
24      31e37d93
25      76b8d446
26      958d42a8
27      70830858
28      faf2c49e
29      02314c59
          ...   
1574    84fe7f94
1575    04e6f331
1576    92c90853
1577    660a98a7
1578    89670962
1579    9d586019
1580    5f49ea3b
1581    968e1414
1582    389d7eaf
1583    65ca9e76
1584    a09cae27
1585    00c5b3e0
1586    7f9df2b0
1587    a2303efc
1588    cb62e5cb
1589    9ff1e0f0
1590    39fd995a
1591    544d0681
1592    cb0319fc
1593    d86deb2b
1594    cdee905a
1595    2539742b
1596    2ea3c9f1
1597    9cadda28
1598    8376a077
1599    04e11240
1600    c7d6f6f8
1601    bba1a0

In [34]:
# This will be the version changed based on my own understanding
def save_blend(preds={}, loc='./'):
    target = 'is_iceberg'
    
    w_total = 0.0
    blend = None
    df_corr = None
    print('\nBlending...')
    for k, v in preds.items():
        if blend is None:
            blend = pd.read_csv('{0}/{1}'.format(loc, k))
            print('load: {0}, w={1}'.format(k, v))
            
            df_corr = pd.DataFrame({'id': blend['id'].tolist()})
            df_corr[k[16:-4]] = blend[target]
            
            w_total += v
            blend[target] = blend[target] * v
                
        else:
            preds_tmp = pd.read_csv('{0}/{1}'.format(loc, k))
            preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
            print('load: {0}, w={1}'.format(k, v))
            df_corr[k[16:-4]] = preds_tmp[target]
            
            w_total += v
            blend[target] += preds_tmp[target] * v
            del preds_tmp
            
    print('\n{}'.format(df_corr.corr()), flush=True)
    #write submission
    blend[target] = blend[target] / w_total
    print('\nPreview: \n{}'.format(blend.head()), flush=True)
    blend.to_csv('{}subm_blend{:03d}_{}.csv'.format(loc, len(preds), tmp), index=False, float_format='%.6f')

def run_lgb(params={}, lgb_train=None, lgb_valid=None, lgb_test=None, test_ids=None, nr_round=2000, min_round=100, file=''):

    print('\nLightGBM: {}'.format(params['boosting'])) 
    model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)
    
    pred = model2.predict(lgb_test, num_iteration=model2.best_iteration)
    #
    subm = pd.DataFrame({'id': test_ids, 'is_iceberg': pred})
    subm.to_csv(file, index=False, float_format='%.6f')
    #   
    df = pd.DataFrame({'feature':model2.feature_name(), 'importances': model2.feature_importance()})
    
    return pred, df


#results
freq = pd.DataFrame()
subms = []

#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)
seed_list=[]
# final_dict ={}
# final_dict['xgb_re'] = []
# final_dict['lgb_re'] = []
# final_dict['lgb_dart_re'] =[]
for rep in range(1):
    ran_num =  np.random.randint(50000,60000,size=1)[0]
    seed_list.append(ran_num)
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 5
    kf = KFold(n_splits=nr_runs, random_state=split_seed)
    tree_lim =0
    xgb_re = []
    lgb_re =[]
    lgb_dart_re= []
    for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
        print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

        tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

        x1, x2 = train_X[train_index], train_X[test_index]
        y1, y2 = train_y[train_index], train_y[test_index]
        #x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=test_ratio, random_state=split_seed + r)
        print('splitted: {0}, {1}'.format(x1.shape, x2.shape), flush=True)
        test_X_dup = test_X.copy()

        #XGB
        xgb_train = xgb.DMatrix(x1, y1)
        xgb_valid = xgb.DMatrix(x2, y2)
        #
        watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
        params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'seed': 99, 'silent': True}
        params['eta'] = 0.03
        params['max_depth'] = 4
        params['subsample'] = 0.9
        params['eval_metric'] = 'logloss'
        params['colsample_bytree'] = 0.8
        params['colsample_bylevel'] = 0.8
        params['max_delta_step'] = 3
        #params['gamma'] = 5.0
        #params['labmda'] = 1
        params['scale_pos_weight'] = 1.0
        params['seed'] = ran_num + r
        nr_round = 2000
        min_round = 100

        model1 = xgb.train(params, 
                           xgb_train, 
                           nr_round,  
                           watchlist, 
                           verbose_eval=50, 
                           early_stopping_rounds=min_round)

        pred_xgb = model1.predict(xgb.DMatrix(test_X_dup), ntree_limit=model1.best_ntree_limit+tree_lim)

        #
        file = 'gbm/subm_{}_xgb_{:02d}.csv'.format(tmp, r+1)
        subm = pd.DataFrame({'id': test['id'].values, target: pred_xgb})
        subm.to_csv(file, index=False, float_format='%.6f')
        subms.append(file)    

        ##LightGBM
        lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
        lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
        #gbdt
        params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
        params['boosting'] = 'gbdt'
        params['metric'] = 'binary_logloss'
        params['learning_rate'] = 0.03
        params['max_depth'] = 5
        params['num_leaves'] = 9 # higher number of leaves
        params['feature_fraction'] = 0.8 # Controls overfit
        params['bagging_fraction'] = 0.9    
        params['bagging_freq'] = 3
        params['seed'] = ran_num + r
        #
        params['verbose'] = -1

        file = 'gbm/subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
        subms.append(file)
        
        model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)

        ##LightGBM
        #dart
        params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
        params['boosting'] = 'dart'
        params['metric'] = 'binary_logloss'
        params['learning_rate'] = 0.04
        params['max_depth'] = 5
        params['num_leaves'] = 16 # higher number of leaves
        params['feature_fraction'] = 0.8 # Controls overfit
        params['bagging_fraction'] = 0.9    
        params['bagging_freq'] = 3
        params['seed'] = ran_num + r
        #dart
        params['drop_rate'] = 0.1
        params['skip_drop'] = 0.5
        params['max_drop'] = 10
        params['verbose'] = -1 

        file = 'gbm/subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
        subms.append(file)

        model3 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)
        
        
        xgb_re.append(model1.best_score)
        lgb_re.append(model2.best_score['valid_0']['binary_logloss'])
        lgb_dart_re.append(model3.best_score['valid_0']['binary_logloss'])
    
#     final_dict['xgb_re'].append(np.mean(xgb_re))
#     final_dict['lgb_re'].append(np.mean(lgb_re))
#     final_dict['lgb_dart_re'].append(np.mean(lgb_dart_re))
    
    #blending
    preds = {k: 1.0 for k in subms}
    save_blend(preds=preds)

The seed we are using is: 58828

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f2684696168>
splitted: (1283, 265), (321, 265)
[0]	train-logloss:0.676031	valid-logloss:0.676764
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.305669	valid-logloss:0.352983
[100]	train-logloss:0.197015	valid-logloss:0.27242
[150]	train-logloss:0.140457	valid-logloss:0.242431
[200]	train-logloss:0.105486	valid-logloss:0.223807
[250]	train-logloss:0.080958	valid-logloss:0.213532
[300]	train-logloss:0.063245	valid-logloss:0.206793
[350]	train-logloss:0.05009	valid-logloss:0.20169
[400]	train-logloss:0.040743	valid-logloss:0.197232
[450]	train-logloss:0.033358	valid-logloss:0.195457
[500]	train-logloss:0.027726	valid-logloss:0.194399
[550]	train-logloss:0.023321	valid-logloss:0.194148
[600]	train-logloss:0.019943	valid-logloss:0.19502
Stopping. Best iteration:
[545]	train-log

Stopping. Best iteration:
[422]	train-logloss:0.038409	valid-logloss:0.204492

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.349948
[100]	valid_0's binary_logloss: 0.271037
[150]	valid_0's binary_logloss: 0.239794
[200]	valid_0's binary_logloss: 0.227625
[250]	valid_0's binary_logloss: 0.219024
[300]	valid_0's binary_logloss: 0.214025
[350]	valid_0's binary_logloss: 0.214819
[400]	valid_0's binary_logloss: 0.212579
[450]	valid_0's binary_logloss: 0.212799
[500]	valid_0's binary_logloss: 0.211184
[550]	valid_0's binary_logloss: 0.213168
[600]	valid_0's binary_logloss: 0.213657
Early stopping, best iteration is:
[500]	valid_0's binary_logloss: 0.211184
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.444292
[100]	valid_0's binary_logloss: 0.355852
[150]	valid_0's binary_logloss: 0.32824
[200]	valid_0's binary_logloss: 0.286147
[250]	valid_0's binary_logloss: 0.27299
[300]	valid_0's binary_log

FileNotFoundError: File b'.//gbm/subm_2018-01-16-00-41_lgb_gbdt_02.csv' does not exist

In [7]:
for i,v in final_dict.items():
    print(v)

[0.21012963979079585, 0.20862186577636441, 0.20983491831459072, 0.205572354848373, 0.20768137792375349, 0.21065259329173974, 0.21025686751234796, 0.2092285317725569, 0.20937734740484334, 0.20710388216406561]
[0.21192038774617994, 0.21510278865547719, 0.21349441290720278, 0.21314132306764719, 0.21106262003875184, 0.21150269654329173, 0.20970752255349692, 0.21026733285106491, 0.21018590252162195, 0.21561990540992712]
[0.20853060000000001, 0.20904379999999997, 0.20738019999999996, 0.20734639999999999, 0.207984, 0.20712419999999998, 0.20816319999999999, 0.20811739999999998, 0.20851959999999997, 0.20861739999999998]


In [4]:
# This will be the version changed based on my own understanding
def save_blend(preds={}, loc='./'):
    target = 'is_iceberg'
    
    w_total = 0.0
    blend = None
    df_corr = None
    print('\nBlending...')
    for k, v in preds.items():
        if blend is None:
            blend = pd.read_csv('{0}/{1}'.format(loc, k))
            print('load: {0}, w={1}'.format(k, v))
            
            df_corr = pd.DataFrame({'id': blend['id'].tolist()})
            df_corr[k[16:-4]] = blend[target]
            
            w_total += v
            blend[target] = blend[target] * v
                
        else:
            preds_tmp = pd.read_csv('{0}/{1}'.format(loc, k))
            preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
            print('load: {0}, w={1}'.format(k, v))
            df_corr[k[16:-4]] = preds_tmp[target]
            
            w_total += v
            blend[target] += preds_tmp[target] * v
            del preds_tmp
            
    print('\n{}'.format(df_corr.corr()), flush=True)
    #write submission
    blend[target] = blend[target] / w_total
    print('\nPreview: \n{}'.format(blend.head()), flush=True)
    blend.to_csv('{}subm_blend{:03d}_{}.csv'.format(loc, len(preds), tmp), index=False, float_format='%.6f')

def run_lgb(params={}, lgb_train=None, lgb_valid=None, lgb_test=None, test_ids=None, nr_round=2000, min_round=100, file=''):

    print('\nLightGBM: {}'.format(params['boosting'])) 
    model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)
    
    pred = model2.predict(lgb_test, num_iteration=model2.best_iteration)
    #
    subm = pd.DataFrame({'id': test_ids, 'is_iceberg': pred})
    subm.to_csv(file, index=False, float_format='%.6f')
    #   
    df = pd.DataFrame({'feature':model2.feature_name(), 'importances': model2.feature_importance()})
    
    return pred, df

In [45]:
#results
freq = pd.DataFrame()

avg_result = []
#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)

ran_num = 463465 #463465#56491
for ran_num in [56491,463465]:
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 5
    kf = KFold(n_splits=nr_runs, random_state=split_seed)

    for param in [0.5,0.8,1.2,1.5]:
        result = []
        for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
            print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

            tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

            x1, x2 = train_X[train_index], train_X[test_index]
            y1, y2 = train_y[train_index], train_y[test_index]
            #x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=test_ratio, random_state=split_seed + r)
            print('splitted: {0}, {1}'.format(x1.shape, x2.shape), flush=True)
            test_X_dup = test_X.copy()

            #XGB
            xgb_train = xgb.DMatrix(x1, y1)
            xgb_valid = xgb.DMatrix(x2, y2)
            #
            watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
            params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'seed': 99, 'silent': True}
            params['eta'] = 0.05
            params['max_depth'] = 4
            params['subsample'] = 0.9
            params['eval_metric'] = 'logloss'
            params['colsample_bytree'] = 0.8
            params['colsample_bylevel'] = param
            params['max_delta_step'] = 3
            #params['gamma'] = 5.0
            params['labmda'] = param
            params['scale_pos_weight'] = 1.0
            params['seed'] = ran_num + r
            nr_round = 2000
            min_round = 100

            model1 = xgb.train(params, 
                               xgb_train, 
                               nr_round,  
                               watchlist, 
                               verbose_eval=50, 
                               early_stopping_rounds=min_round)

            pred_xgb = model1.predict(xgb.DMatrix(test_X_dup), ntree_limit=model1.best_ntree_limit)

            #
            msg= 'xgbfold%d'%r
            freq[msg] = pred_xgb
            result.append(model1.best_score)
        print('All your scores are: ')
        print(result)
        print('The average of your score')
        print(np.mean(result))
        avg_result.append(np.mean(result))
print(avg_result)

The seed we are using is: 56491

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f4f99a3e4c8>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.666214	valid-logloss:0.668001
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.23468	valid-logloss:0.302959
[100]	train-logloss:0.1385	valid-logloss:0.240614
[150]	train-logloss:0.090079	valid-logloss:0.213791
[200]	train-logloss:0.061226	valid-logloss:0.201662
[250]	train-logloss:0.043525	valid-logloss:0.195157
[300]	train-logloss:0.032578	valid-logloss:0.194752
[350]	train-logloss:0.024474	valid-logloss:0.193095
Stopping. Best iteration:
[280]	train-logloss:0.036201	valid-logloss:0.192763


round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f4f99a3e4c8>
splitted: (1283, 247), (321, 247)
[0]	train-logloss:0.666596	valid-logloss:0.668727
Multiple eval metrics have been passed: 'valid-logloss' will be u

XGBoostError: b'value 1.2 for Parameter colsample_bylevel exceed bound [0,1]'

#model1.best_score

nta = [0.01,0.015,0.02,0.025,0.03]
[0.20835880000000001, 0.20946060000000002, 0.2080698, 0.20710020000000001, 0.20994420000000003]
#change to different split.
[0.2067958, 0.20759760000000002, 0.20812819999999999, 0.20848559999999999, 0.20875680000000002]

then we want to see tree depth sensitivity.
[2,3,4,5]
[0.2127414, 0.20789479999999999, 0.2080698, 0.20814539999999998
 
subsample 0.5,0.6,0.7,0.8,0.85,0.9,0.95 #maybe because data points are limited?
[0.21047080000000001, 0.20971980000000001, 0.2092918, 0.2080698,0.20451239, 0.2070148,0.20853039999999998]

change to other data split! consistent here!
[0.21159359999999999, 0.21181939999999999, 0.20943299999999998, 0.2081281,0.2075963 0.207232,0.20908] 

 
#### colsample_bytree 0.6,0.7,0.8,0.9
[0.2066904, 0.20730179999999998, 0.2070148, 0.21193840000000003]
[0.2090204, 0.20862359999999999, 0.207232, 0.20750860000000002]

#### max_delta: Maximum delta step we allow each tree’s weight estimation to be.
[0.20828599999999997, 0.2070148, 0.2070148, 0.2070148]
[0.20910699999999999, 0.207232, 0.207232, 0.207232]

  [0.6,0.7,0.8,0.9]
0.20991120000000002, 0.2083854, 0.20834060000000001, 0.20772740000000001, 
0.20727820000000002, 0.20918019999999998, 0.21255940000000001, 0.2111894

np.mean(result)

In [54]:
avg_result = []
#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)

ran_num = 463465 #463465#56491
for ran_num in [2312, 56491,463465]:
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 10
    kf = KFold(n_splits=nr_runs, random_state=split_seed)

    for param in [0.8]:
        result = []
        for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
            print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

            tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

            x1, x2 = train_X[train_index], train_X[test_index]
            y1, y2 = train_y[train_index], train_y[test_index] 

            ##LightGBM
            lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
            lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
            #gbdt
            params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
            params['boosting'] = 'gbdt'
            params['metric'] = 'binary_logloss'
            params['learning_rate'] = 0.03
            params['max_depth'] = 5
            params['num_leaves'] = 9 # higher number of leaves
            params['feature_fraction'] = param # Controls overfit
            params['bagging_fraction'] = 0.9    
            params['bagging_freq'] = 3
            params['seed'] = ran_num + r
            #
            params['verbose'] = -1

            file = 'gbm/subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
            subms.append(file)

            model2 = lgb.train(params, 
                           lgb_train, 
                           nr_round, 
                           lgb_valid, 
                           verbose_eval=50, early_stopping_rounds=min_round)
            result.append(model2.best_score['valid_0']['binary_logloss'])
            
        print('All your scores are: ')
        print(result)
        print('The average of your score')
        print(np.mean(result))
        avg_result.append(np.mean(result))
print(avg_result)

The seed we are using is: 2312

round 0001 of 0010, seed=<mtrand.RandomState object at 0x7f51feb63678>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.343065
[100]	valid_0's binary_logloss: 0.254834
[150]	valid_0's binary_logloss: 0.220965
[200]	valid_0's binary_logloss: 0.201352
[250]	valid_0's binary_logloss: 0.19127
[300]	valid_0's binary_logloss: 0.184706
[350]	valid_0's binary_logloss: 0.181717
[400]	valid_0's binary_logloss: 0.180455
[450]	valid_0's binary_logloss: 0.180666
[500]	valid_0's binary_logloss: 0.173666
[550]	valid_0's binary_logloss: 0.170401
[600]	valid_0's binary_logloss: 0.16884
[650]	valid_0's binary_logloss: 0.173522
Early stopping, best iteration is:
[588]	valid_0's binary_logloss: 0.167662

round 0002 of 0010, seed=<mtrand.RandomState object at 0x7f51feb63678>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.378399
[100]	valid_0's binary_logloss: 0.301905
[150]	valid_

0.20997504631763109

num_leaves  [6,8,9,10,12,14,16,20]

2312:0.20693540784004155, 0.20807777115480661,0.208204, 0.2101704305882699, 0.20958977185302161, 0.2097247594090052, 0.21252432775232091, 0.21097843374429764
     0.21085407105688136, 0.20972141239725756,0.20768 , 0.20802226387790204, 0.21319986508838254, 0.2114961113773989, 0.20997504631763109, 0.21187703738304534, 
     0.21143941426691132, 0.20521332038873236,0.206531, 0.20727516177890259, 0.20963941105802347, 0.20857695536404003, 0.21374484604203184, 0.21189068566992506

try another one [6,7,8,9]
[0.21142256014136723, 0.20535415309766486, 0.20584818981873357, 0.20905026043297328]

## change num_leaf to 9

0.20826665232053476, 0.20893920897305204, 0.20820437367235373, 0.20789867033173567,
0.21021699406462796, 0.20997471390055775, 0.20768137792375355, 0.20762240986338024,
0.20571142430466466, 0.20512413321541617, 0.2065318518352405, 0.21059664548852566]



In [40]:
model2.best_score['valid_0']['binary_logloss']

0.21655854539955169

# Fine tune the last one

In [13]:
#results
avg_result = []
#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)

ran_num = 463465 #463465#56491
for ran_num in  [1123, 463465]:#  [1123,4677,6745 ,2312, 56491,463465]:
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 5
    kf = KFold(n_splits=nr_runs, random_state=split_seed)

    for param in [6,8,10,12,14,18,20,22]:
        result = []
        for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
            print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

            tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

            x1, x2 = train_X[train_index], train_X[test_index]
            y1, y2 = train_y[train_index], train_y[test_index] 

            ##LightGBM
            lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
            lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
            #gbdt
            params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
            params['boosting'] = 'dart'
            params['metric'] = 'binary_logloss'
            params['learning_rate'] = 0.04
            params['max_depth'] = 5
            params['num_leaves'] = param # higher number of leaves
            params['feature_fraction'] = 0.8 # Controls overfit
            params['bagging_fraction'] = 0.9    
            params['bagging_freq'] = 3
            params['seed'] = ran_num + r
            #dart
            params['drop_rate'] = 0.1
            params['skip_drop'] = 0.5
            params['max_drop'] = 10
            params['verbose'] = -1 

            model3 = lgb.train(params, 
                           lgb_train, 
                           nr_round, 
                           lgb_valid, 
                           verbose_eval=50, early_stopping_rounds=min_round)
            result.append(model3.best_score['valid_0']['binary_logloss'])
            
        print('All your scores are: ')
        print(result)
        print('The average of your score')
        print(np.mean(result))
        avg_result.append(np.mean(result))
print(avg_result)

The seed we are using is: 1123

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f3d962d0ca8>
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.441487
[100]	valid_0's binary_logloss: 0.374806
[150]	valid_0's binary_logloss: 0.343087
[200]	valid_0's binary_logloss: 0.319263
[250]	valid_0's binary_logloss: 0.295517
[300]	valid_0's binary_logloss: 0.286817
[350]	valid_0's binary_logloss: 0.283165
[400]	valid_0's binary_logloss: 0.26954
[450]	valid_0's binary_logloss: 0.259985
[500]	valid_0's binary_logloss: 0.252553
[550]	valid_0's binary_logloss: 0.246402
[600]	valid_0's binary_logloss: 0.245004
[650]	valid_0's binary_logloss: 0.23783
[700]	valid_0's binary_logloss: 0.234628
[750]	valid_0's binary_logloss: 0.229842
[800]	valid_0's binary_logloss: 0.230116
[850]	valid_0's binary_logloss: 0.231584
[900]	valid_0's binary_logloss: 0.227668
[950]	valid_0's binary_logloss: 0.223863
[1000]	valid_0's binary_logloss: 0.222578
[1050]	valid_0's


num_leaves here: 16

0.20996463972458032, 0.2105201451940987, 0.21304611830042508, 0.21343599728787993, 0.21106262003875184, 0.21354386772253284

num_leaves here: 9

0.21011925200129097, 0.215387241291573, 0.21029209937609511, 0.21529853022367157, 0.21635515450387191, 0.21704195707060975


In [42]:
# This will be the version changed based on my own understanding
def save_blend(preds={}, loc='./'):
    target = 'is_iceberg'
    
    w_total = 0.0
    blend = None
    df_corr = None
    print('\nBlending...')
    for k, v in preds.items():
        if blend is None:
            blend = pd.read_csv('{0}/{1}'.format(loc, k))
            print('load: {0}, w={1}'.format(k, v))
            
            df_corr = pd.DataFrame({'id': blend['id'].tolist()})
            df_corr[k[16:-4]] = blend[target]
            
            w_total += v
            blend[target] = blend[target] * v
                
        else:
            preds_tmp = pd.read_csv('{0}/{1}'.format(loc, k))
            preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
            print('load: {0}, w={1}'.format(k, v))
            df_corr[k[16:-4]] = preds_tmp[target]
            
            w_total += v
            blend[target] += preds_tmp[target] * v
            del preds_tmp
            
    print('\n{}'.format(df_corr.corr()), flush=True)
    #write submission
    blend[target] = blend[target] / w_total
    print('\nPreview: \n{}'.format(blend.head()), flush=True)
    blend.to_csv('{}subm_blend{:03d}_{}.csv'.format(loc, len(preds), tmp), index=False, float_format='%.6f')

def run_lgb(params={}, lgb_train=None, lgb_valid=None, lgb_test=None, test_ids=None, nr_round=2000, min_round=100, file=''):

    print('\nLightGBM: {}'.format(params['boosting'])) 
    model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=False, early_stopping_rounds=min_round)
    
    pred = model2.predict(lgb_test, num_iteration=model2.best_iteration)
    #
#     if model2.best_score['valid_0']['binary_logloss']<=0.195:
    subm = pd.DataFrame({'id': test_ids, 'is_iceberg': pred})
    subm.to_csv(file, index=False, float_format='%.6f')
    #   
    df = pd.DataFrame({'feature':model2.feature_name(), 'importances': model2.feature_importance()})
    
    return pred, df


#results
freq = pd.DataFrame()
subms = []

#training
# test_ratio = 0.2
# nr_runs = 3
# split_seed = 25
# kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)
seed_list=[]
# final_dict ={}
# final_dict['xgb_re'] = []
# final_dict['lgb_re'] = []
# final_dict['lgb_dart_re'] =[]
for rep in range(1):
    ran_num =  np.random.randint(50000,60000,size=1)[0]
    seed_list.append(ran_num)
    split_seed= np.random.RandomState(ran_num)
    print('The seed we are using is: %d' % ran_num)
    nr_runs = 5
    kf = KFold(n_splits=nr_runs, random_state=split_seed)
    tree_lim =0
#     xgb_re = []
#     lgb_re =[]
#     lgb_dart_re= []
    for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
        print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

        tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

        x1, x2 = train_X[train_index], train_X[test_index]
        y1, y2 = train_y[train_index], train_y[test_index]
        #x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=test_ratio, random_state=split_seed + r)
        print('splitted: {0}, {1}'.format(x1.shape, x2.shape), flush=True)
        test_X_dup = test_X.copy()

        #XGB
        xgb_train = xgb.DMatrix(x1, y1)
        xgb_valid = xgb.DMatrix(x2, y2)
        #
        watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
        params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'seed': 99, 'silent': True}
        params['eta'] = 0.03
        params['max_depth'] = 4
        params['subsample'] = 0.9
        params['eval_metric'] = 'logloss'
        params['colsample_bytree'] = 0.8
        params['colsample_bylevel'] = 0.8
        params['max_delta_step'] = 3
        #params['gamma'] = 5.0
        #params['labmda'] = 1
        params['scale_pos_weight'] = 1.0
        params['silent'] = False
        params['seed'] = ran_num + r
        nr_round = 2000
        min_round = 100

        model1 = xgb.train(params, 
                           xgb_train, 
                           nr_round,  
                           watchlist, 
                           verbose_eval=100
                           ,
                           early_stopping_rounds=min_round)

        pred_xgb = model1.predict(xgb.DMatrix(test_X_dup), ntree_limit=model1.best_ntree_limit+tree_lim)

        #
        file = 'gbm/subm_xgb{}{}.csv'.format(rep, r+1)
#         if model1.best_score<=0.195:
        subm = pd.DataFrame({'id': test['id'].values, target: pred_xgb})
        subm.to_csv(file, index=False)
        subms.append(file)    

        
        lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
        lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
        #gbdt
        params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
        params['boosting'] = 'gbdt'
        params['metric'] = 'binary_logloss'
        params['learning_rate'] = 0.03
        params['max_depth'] = 5
        params['num_leaves'] = 9 # higher number of leaves
        params['feature_fraction'] = 0.8 # Controls overfit
        params['bagging_fraction'] = 0.9    
        params['bagging_freq'] = 3
        params['seed'] = ran_num + r
        params['silent'] = False
        params['verbose'] = 1000

        file = 'gbm/subm_orilgb{}{}.csv'.format(rep, r+1)
        subms.append(file)

        pred, f_tmp = run_lgb(params=params, 
                              lgb_train=lgb_train, 
                              lgb_valid=lgb_valid, 
                              lgb_test=test_X_dup, 
                              test_ids=test['id'].values, 
                              nr_round=nr_round, 
                              min_round=min_round, 
                              file=file)
        
        
        ##LightGBM
        #dart
        params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
        params['boosting'] = 'dart'
        params['metric'] = 'binary_logloss'
        params['learning_rate'] = 0.04
        params['max_depth'] = 5
        params['num_leaves'] = 16 # higher number of leaves
        params['feature_fraction'] = 0.8 # Controls overfit
        params['bagging_fraction'] = 0.9    
        params['bagging_freq'] = 3
        params['seed'] = ran_num + r
        #dart
        params['drop_rate'] = 0.1
        params['skip_drop'] = 0.5
        params['max_drop'] = 10
        params['verbose'] = 1000
        params['silent'] = False
        
        file = 'gbm/subm_lgb{}{}.csv'.format(rep, r+1)
        subms.append(file)

        pred, f_tmp = run_lgb(params=params, 
                              lgb_train=lgb_train, 
                              lgb_valid=lgb_valid, 
                              lgb_test=test_X_dup, 
                              test_ids=test['id'].values, 
                              nr_round=nr_round, 
                              min_round=min_round, 
                              file=file)
    if rep%5==0:
        print(rep)
        

The seed we are using is: 52204

round 0001 of 0005, seed=<mtrand.RandomState object at 0x7f26846964c8>
splitted: (1283, 265), (321, 265)
[0]	train-logloss:0.677142	valid-logloss:0.67987
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.200635	valid-logloss:0.276155
[200]	train-logloss:0.106665	valid-logloss:0.222632
[300]	train-logloss:0.064316	valid-logloss:0.205287
[400]	train-logloss:0.041729	valid-logloss:0.195431
[500]	train-logloss:0.028727	valid-logloss:0.192955
[600]	train-logloss:0.020455	valid-logloss:0.191053
[700]	train-logloss:0.015365	valid-logloss:0.190849
Stopping. Best iteration:
[640]	train-logloss:0.018225	valid-logloss:0.189769


LightGBM: gbdt


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))



LightGBM: dart

round 0002 of 0005, seed=<mtrand.RandomState object at 0x7f26846964c8>
splitted: (1283, 265), (321, 265)
[0]	train-logloss:0.67773	valid-logloss:0.677893
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[100]	train-logloss:0.202642	valid-logloss:0.258654
[200]	train-logloss:0.107838	valid-logloss:0.199357
[300]	train-logloss:0.064817	valid-logloss:0.18259
[400]	train-logloss:0.042206	valid-logloss:0.173318
[500]	train-logloss:0.028425	valid-logloss:0.173133
Stopping. Best iteration:
[472]	train-logloss:0.031731	valid-logloss:0.171771


LightGBM: gbdt

LightGBM: dart

round 0003 of 0005, seed=<mtrand.RandomState object at 0x7f26846964c8>
splitted: (1283, 265), (321, 265)
[0]	train-logloss:0.675441	valid-logloss:0.677959
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.

In [47]:
import os
#print(os.listdir('gbm'))
waiting_list= [os.path.join('gbm',i) for i in os.listdir('gbm') if 'csv' in i]
len(waiting_list)
for i in waiting_list:
    print(i)
#w_total

gbm/subm_lgb02.csv
gbm/subm_orilgb05.csv
gbm/subm_orilgb02.csv
gbm/subm_orilgb01.csv
gbm/subm_orilgb03.csv
gbm/subm_orilgb04.csv
gbm/subm_lgb03.csv
gbm/subm_lgb01.csv
gbm/subm_xgb01.csv
gbm/subm_xgb03.csv
gbm/subm_xgb02.csv
gbm/subm_xgb04.csv
gbm/subm_xgb05.csv
gbm/subm_lgb04.csv
gbm/subm_lgb05.csv


In [32]:
k = pd.DataFrame()
blend = None
df_corr = None
print('\nBlending...')
v=1
for num,path in enumerate(waiting_list):
    if blend is None:
        blend = pd.read_csv(path)

        w_total += v
        blend[target] = blend[target] * v

    else:
        preds_tmp = pd.read_csv(path)
        preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
        msg = "is_iceberg%d"% (num+1)
        blend[msg] = preds_tmp[target] * v

        



Blending...


In [36]:
blend.to_csv('gbm_test.csv')

In [50]:
target = 'is_iceberg'

w_total = 0.0
blend = None
df_corr = None
print('\nBlending...')
v=1
for num,path in enumerate(waiting_list):
    if blend is None:
        blend = pd.read_csv(path)

        w_total += v
        blend[target] = blend[target] * v

    else:
        preds_tmp = pd.read_csv(path)
        preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
        
        
        w_total += v
        msg = 'is_iceberg%d' %num
        blend[msg] = preds_tmp[target] * v
        del preds_tmp

# blend[target] = blend[target] / w_total
# print('\nPreview: \n{}'.format(blend.head()), flush=True)
# blend.to_csv('submission1010.csv',index=False)


Blending...


In [54]:
temp11 = blend.drop(['id'],axis=1)
temp11['is_iceberg_max'] = temp11.iloc[:, :15].max(axis=1)
temp11['is_iceberg_min'] = temp11.iloc[:, :15].min(axis=1)
temp11['is_iceberg_median'] = temp11.iloc[:, :15].median(axis=1)
temp11['is_iceberg_mean'] = temp11.iloc[:, :15].mean(axis=1)
# set up cutoff threshold for lower and upper bounds, easy to twist 
cutoff_lo = 0.8
cutoff_hi = 0.2


temp11['is_iceberg'] = np.where(np.all(temp11.iloc[:,0:15] > cutoff_lo, axis=1), 
                                    temp11['is_iceberg_max'], 
                                    np.where(np.all(temp11.iloc[:,0:15] < cutoff_hi, axis=1),
                                             temp11['is_iceberg_min'], 
                                             temp11['is_iceberg_median']))

In [56]:
k = pd.DataFrame()
k['id'] = blend['id']
k['is_iceberg'] = temp11['is_iceberg']
k.to_csv('22.csv',index=False)

In [7]:
def save_blend(preds={}, loc='./'):
    
    target = 'is_iceberg'
    
    w_total = 0.0
    blend = None
    df_corr = None
    print('\nBlending...')
    for k, v in preds.items():
        if blend is None:
            blend = pd.read_csv('{0}/{1}'.format(loc, k))
            print('load: {0}, w={1}'.format(k, v))
            
            df_corr = pd.DataFrame({'id': blend['id'].tolist()})
            df_corr[k[16:-4]] = blend[target]
            
            w_total += v
            blend[target] = blend[target] * v
                
        else:
            preds_tmp = pd.read_csv('{0}/{1}'.format(loc, k))
            preds_tmp = blend[['id']].merge(preds_tmp, how='left', on='id')
            print('load: {0}, w={1}'.format(k, v))
            df_corr[k[16:-4]] = preds_tmp[target]
            
            w_total += v
            blend[target] += preds_tmp[target] * v
            del preds_tmp
            
    print('\n{}'.format(df_corr.corr()), flush=True)
    #write submission
    blend[target] = blend[target] / w_total
    print('\nPreview: \n{}'.format(blend.head()), flush=True)
    blend.to_csv('{}subm_blend{:03d}_{}.csv'.format(loc, len(preds), tmp), index=False, float_format='%.6f')

def run_lgb(params={}, lgb_train=None, lgb_valid=None, lgb_test=None, test_ids=None, nr_round=2000, min_round=100, file=''):

    print('\nLightGBM: {}'.format(params['boosting'])) 
    model2 = lgb.train(params, 
                       lgb_train, 
                       nr_round, 
                       lgb_valid, 
                       verbose_eval=50, early_stopping_rounds=min_round)
    
    pred = model2.predict(lgb_test, num_iteration=model2.best_iteration)
    #
    subm = pd.DataFrame({'id': test_ids, 'is_iceberg': pred})
    subm.to_csv(file, index=False, float_format='%.6f')
    #   
    df = pd.DataFrame({'feature':model2.feature_name(), 'importances': model2.feature_importance()})
    
    return pred, df


#results
freq = pd.DataFrame()
subms = []

#training
test_ratio = 0.2
nr_runs = 3
split_seed = 25
kf = StratifiedShuffleSplit(n_splits=nr_runs, test_size=test_ratio, train_size=None, random_state=split_seed)

for r, (train_index, test_index) in enumerate(kf.split(train_X, train_y)):
    print('\nround {:04d} of {:04d}, seed={}'.format(r+1, nr_runs, split_seed))

    tmp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M")

    x1, x2 = train_X[train_index], train_X[test_index]
    y1, y2 = train_y[train_index], train_y[test_index]
    #x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=test_ratio, random_state=split_seed + r)
    print('splitted: {0}, {1}'.format(x1.shape, x2.shape), flush=True)
    test_X_dup = test_X.copy()

    #XGB
    xgb_train = xgb.DMatrix(x1, y1)
    xgb_valid = xgb.DMatrix(x2, y2)
    #
    watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
    params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'seed': 99, 'silent': True}
    params['eta'] = 0.03
    params['max_depth'] = 4
    params['subsample'] = 0.9
    params['eval_metric'] = 'logloss'
    params['colsample_bytree'] = 0.8
    params['colsample_bylevel'] = 0.8
    params['max_delta_step'] = 3
    #params['gamma'] = 5.0
    #params['labmda'] = 1
    params['scale_pos_weight'] = 1.0
    params['seed'] = split_seed + r
    nr_round = 2000
    min_round = 100

    model1 = xgb.train(params, 
                       xgb_train, 
                       nr_round,  
                       watchlist, 
                       verbose_eval=50, 
                       early_stopping_rounds=min_round)

    pred_xgb = model1.predict(xgb.DMatrix(test_X_dup), ntree_limit=model1.best_ntree_limit+45)

    #
    file = 'subm_{}_xgb_{:02d}.csv'.format(tmp, r+1)
    subm = pd.DataFrame({'id': test['id'].values, target: pred_xgb})
    subm.to_csv(file, index=False, float_format='%.6f')
    subms.append(file)    

    ##LightGBM
    lgb_train = lgb.Dataset(x1, label=y1, free_raw_data=False)
    lgb_valid = lgb.Dataset(x2, label=y2, reference=lgb_train, free_raw_data=False)
    #gbdt
    params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
    params['boosting'] = 'gbdt'
    params['metric'] = 'binary_logloss'
    params['learning_rate'] = 0.03
    params['max_depth'] = 5
    params['num_leaves'] = 16 # higher number of leaves
    params['feature_fraction'] = 0.8 # Controls overfit
    params['bagging_fraction'] = 0.9    
    params['bagging_freq'] = 3
    params['seed'] = split_seed + r
    #
    params['verbose'] = -1

    file = 'subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
    subms.append(file)

    pred, f_tmp = run_lgb(params=params, 
                          lgb_train=lgb_train, 
                          lgb_valid=lgb_valid, 
                          lgb_test=test_X_dup, 
                          test_ids=test['id'].values, 
                          nr_round=nr_round, 
                          min_round=min_round, 
                          file=file)

    ##LightGBM
    #dart
    params = {'learning_rate': 0.02, 'max_depth': 4, 'boosting': 'gbdt', 'objective': 'binary', 'is_training_metric': False, 'seed': 99}
    params['boosting'] = 'dart'
    params['metric'] = 'binary_logloss'
    params['learning_rate'] = 0.04
    params['max_depth'] = 5
    params['num_leaves'] = 16 # higher number of leaves
    params['feature_fraction'] = 0.8 # Controls overfit
    params['bagging_fraction'] = 0.9    
    params['bagging_freq'] = 3
    params['seed'] = split_seed + r
    #dart
    params['drop_rate'] = 0.1
    params['skip_drop'] = 0.5
    params['max_drop'] = 10
    params['verbose'] = -1 

    file = 'subm_{}_lgb_{}_{:02d}.csv'.format(tmp, params['boosting'], r+1)
    subms.append(file)

    pred, f_tmp = run_lgb(params=params, 
                          lgb_train=lgb_train, 
                          lgb_valid=lgb_valid, 
                          lgb_test=test_X_dup, 
                          test_ids=test['id'].values, 
                          nr_round=nr_round, 
                          min_round=min_round, 
                          file=file)


#blending
preds = {k: 1.0 for k in subms}
save_blend(preds=preds)

[1m[34m.[m[m
[1m[34m..[m[m
.DS_Store
[1m[34m.git[m[m
[1m[34m.ipynb_checkpoints[m[m
36_plain_cnn.csv
41_plain_cnn.csv
50_plain_fcn.csv
67_plain_cnn.csv
6_retrain_inception.csv
Image preprocess testing.ipynb
README.md
Ship-Iceberg Discrimination with Convolutional Neural Networks in High Resolution SAR Images.pdf
The Effectiveness of Data Augmentation in Image Classification using Deep Learning.pdf
Training_log.ipynb
[1m[34m__pycache__[m[m
all_14_inception.csv
cnn.ipynb
cnn.py
cnn_angle.ipynb
cnn_angle.py
[1m[34mdata[m[m
densenet.py
densenet121.ipynb
densenet121_pseudl.ipynb
densenetBC.py
densenetbc100.ipynb
fcn.ipynb
fcn.py
final ensemble.ipynb
gbm.ipynb
inception.ipynb
inception.py
[1m[34mothers[m[m
pre_resnet.py
pre_vgg.py
r2_11_plain_cnn.csv
r2_fcn_11_models.csv
resnet.py
resnet101.ipynb
resnet101_4feat.ipynb
resnet152.ipynb
resnet18.ipynb
resnet34.ipynb
resnet34_4feat.ipynb
resnet34_onlygoodretrain.csv
resnet3

In [10]:
ran_num

array([52161])

In [1]:
import torch

In [4]:
a=  torch.Tensor([1,2,3])
a.size()
a=a.unsqueeze(1)
a.size()

torch.Size([3, 1])