In [1]:
import pandas as pd
# Load the two JSON inputs: one frame for training, one for testing.
train_df, test_df = (
    pd.read_json('../input/train.json'),
    pd.read_json('../input/test.json'),
)
# Preview the first rows of the training frame.
print(train_df.head())

import numpy as np
import cv2
from itertools import combinations
from scipy.stats import kurtosis, skew
from scipy.ndimage import laplace, sobel

def get_feat_from_img(img_sub, bins=20, scl_min=-50, scl_max=50):
    """Compute a flat list of hand-crafted statistics for a single image.

    The returned list holds ``46 + bins`` values in this fixed order:

    * 5 basic statistics: mean, std, max, median, min
    * 3 spreads: max - median, max - min, median - min
    * the same 3 spreads divided by the standard deviation
    * 10 pairwise products, then 10 pairwise sums, then 10 pairwise
      differences of five values: variance of the Laplacian, variance of the
      Sobel response along axis 0 and along axis 1, kurtosis and skewness
    * ``bins`` histogram counts over ``[scl_min, scl_max]``
    * the index of the fullest histogram bin (lowest index on ties)
    * std, max, median and (max - median) of the histogram counts

    Parameters
    ----------
    img_sub : array-like
        Image with at least two dimensions (the Sobel filter is applied along
        axis 0 and axis 1). Converted with ``np.asarray``; the dtype is kept.
    bins : int, optional
        Number of histogram bins. Defaults to 20.
    scl_min, scl_max : float, optional
        Lower / upper bound of the histogram range. Defaults to -50 and 50.
        Pixels outside this range are not counted by ``np.histogram``.

    Returns
    -------
    list
        Feature values (NumPy scalars and Python ints) in the order above.

    Notes
    -----
    There is no guard against a zero standard deviation: for a constant image
    the stdev-normalised spreads are divided by zero.
    """
    img_sub = np.asarray(img_sub)
    pixels = img_sub.ravel()

    # --- basic intensity statistics ---------------------------------------
    mean_v, std_v = np.mean(img_sub), np.std(img_sub)
    max_v, med_v, min_v = np.max(img_sub), np.median(img_sub), np.min(img_sub)
    spreads = [max_v - med_v, max_v - min_v, med_v - min_v]
    st = [mean_v, std_v, max_v, med_v, min_v]
    st += spreads
    st += [spread / std_v for spread in spreads]  # normalized by stdev

    # --- Laplacian, Sobel, kurtosis and skewness --------------------------
    st_trans = [
        # variance of the Laplacian response (the original author's blur indicator)
        laplace(img_sub, mode='reflect', cval=0.0).ravel().var(),
        sobel(img_sub, axis=0, mode='reflect', cval=0.0).ravel().var(),
        sobel(img_sub, axis=1, mode='reflect', cval=0.0).ravel().var(),
        kurtosis(pixels),
        skew(pixels),
    ]

    # Only the pairwise interactions of these five values enter the feature
    # vector; the raw values themselves are not appended.
    pairs = list(combinations(st_trans, 2))
    st += [x * y for x, y in pairs]
    st += [x + y for x, y in pairs]
    st += [x - y for x, y in pairs]

    # --- intensity histogram ----------------------------------------------
    hist = list(np.histogram(img_sub, bins=bins, range=(scl_min, scl_max))[0])
    st += hist
    # list.index returns the first match, so ties resolve to the lowest bin index.
    st += [hist.index(max(hist))]
    st += [np.std(hist), np.max(hist), np.median(hist), (np.max(hist) - np.median(hist))]
    return st
    
def get_other_feat(df):
    """Turn every row of ``df`` into one row of hand-crafted features.

    For each row, ``band_1`` and ``band_2`` are reshaped to 75x75 float32
    images and a third image is formed as their element-wise mean. The
    features of the three images (via ``get_feat_from_img``) are concatenated
    and the incidence angle is appended as the last value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``band_1``, ``band_2`` and ``inc_angle``.

    Returns
    -------
    numpy.ndarray
        float32 array with one row of features per row of ``df``.
    """
    rows = []
    for _, rec in df.iterrows():
        band1 = np.array(rec['band_1']).reshape(75, 75).astype('float32')
        band2 = np.array(rec['band_2']).reshape(75, 75).astype('float32')
        band_mean = (band1 + band2) / 2

        # The string 'na' marks a missing angle; it is encoded as -1.
        ang = -1 if rec['inc_angle'] == 'na' else float(rec['inc_angle'])

        rows.append(
            get_feat_from_img(band1)
            + get_feat_from_img(band2)
            + get_feat_from_img(band_mean)
            + [ang]
        )

    return np.array(rows).astype('float32')

# Build the hand-crafted feature matrices for the train and test frames.
train_feat, test_feat = (get_other_feat(frame) for frame in (train_df, test_df))

                                              band_1  \
0  [-27.878360999999998, -27.15416, -28.668615, -...   
1  [-12.242375, -14.920304999999999, -14.920363, ...   
2  [-24.603676, -24.603714, -24.871029, -23.15277...   
3  [-22.454607, -23.082819, -23.998013, -23.99805...   
4  [-26.006956, -23.164886, -23.164886, -26.89116...   

                                              band_2        id inc_angle  \
0  [-27.154118, -29.537888, -31.0306, -32.190483,...  dfd5f913   43.9239   
1  [-31.506321, -27.984554, -26.645678, -23.76760...  e25388fd   38.1562   
2  [-24.870956, -24.092632, -20.653963, -19.41104...  58b2aaa0   45.2859   
3  [-27.889421, -27.519794, -27.165262, -29.10350...  4cfc3a18   43.8306   
4  [-27.206915, -30.259186, -30.259186, -23.16495...  271f93f4   35.6256   

   is_iceberg  
0           0  
1           0  
2           1  
3           0  
4           0  


In [2]:
# Show the (rows, columns) shape of the train and test feature matrices.
print(train_feat.shape,test_feat.shape)

(1604, 199) (8424, 199)


In [3]:
# Target vector: the values of the `is_iceberg` column, followed by a short preview.
y = train_df['is_iceberg'].values
print(y[:5])

[0 0 1 0 0]


In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
def cv_feat(model_f,fold_cnt=3,rnd=1,params=None):
    """Cross-validate one classifier and collect its out-of-fold / test predictions.

    Reads the module-level arrays ``train_feat``, ``y`` and ``test_feat``.
    For every fold a new model is fitted on the training part; its
    probabilities for the held-out part are stored at the held-out positions,
    and its probabilities for ``test_feat`` are averaged over the folds.
    Per-fold and average log-losses are printed.

    Parameters
    ----------
    model_f : callable
        Estimator class or factory. ``model_f(**params)`` must return an
        object offering ``fit(X, y)`` and ``predict_proba(X)``.
    fold_cnt : int, optional
        Number of folds. Defaults to 3.
    rnd : int, optional
        Seed knob; the shuffled ``KFold`` uses ``random_state=2*rnd``.
    params : dict or None, optional
        Keyword arguments for ``model_f``. ``None`` (the default) means no
        keyword arguments.

    Returns
    -------
    train_pred : numpy.ndarray, shape (len(train_feat), 1)
        Out-of-fold values of column 1 of ``predict_proba``.
    test_pred : numpy.ndarray, shape (len(test_feat), 1)
        Fold-averaged values of column 1 of ``predict_proba`` on ``test_feat``.
    """
    # Avoid a shared mutable default argument; build an empty dict per call.
    model_kwargs = {} if params is None else params

    # Size the prediction buffers from the data instead of hard-coding the
    # row counts, so the function keeps working when the inputs change.
    n_train, n_test = len(train_feat), len(test_feat)
    train_pred, test_pred = np.zeros((n_train,1)),np.zeros((n_test,1))

    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)
    avg_train_l,avg_val_l = 0,0
    # Print the configured estimator once so the log records its settings.
    print(model_f(**model_kwargs))
    for train_index, test_index in kf.split(train_feat):
        curr_x,curr_y = train_feat[train_index],y[train_index]
        val_x,val_y = train_feat[test_index],y[test_index]

        model = model_f(**model_kwargs)
        model.fit(curr_x,curr_y)

        curr_train_pred = model.predict_proba(curr_x)
        curr_val_pred = model.predict_proba(val_x)
        # Out-of-fold prediction: every training row is filled exactly once,
        # by the model that did not see it.
        train_pred[test_index] = curr_val_pred[:,1].reshape(-1,1)
        # Dividing by fold_cnt before summing yields the mean over all folds.
        curr_test_pred = model.predict_proba(test_feat)/fold_cnt
        test_pred = test_pred + curr_test_pred[:,1].reshape(-1,1)

        loss1 = log_loss(curr_y,curr_train_pred)
        loss2 = log_loss(val_y,curr_val_pred)
        avg_train_l += loss1/fold_cnt
        avg_val_l += loss2/fold_cnt
        print('this fold train loss',loss1,'val loss',loss2)
        print('============================')
    print('all avg',avg_train_l,avg_val_l)
    return train_pred,test_pred
# Marker output: printed after the definitions in this cell have been executed.
print('def done')

def done


In [21]:
# lr
from sklearn.linear_model import LogisticRegression
lr_train,lr_pred = cv_feat(LogisticRegression,
                           fold_cnt=5,
                           rnd=1,
                           params={'C':2.0,'max_iter':100})
import pickle
with open('../features/other_model_lr2.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
this fold train loss 0.150262123761 val loss 0.28825628492
this fold train loss 0.170058937695 val loss 0.235593068118
this fold train loss 0.166186963664 val loss 0.188479565374
this fold train loss 0.163442099747 val loss 0.362230491392
this fold train loss 0.158448607177 val loss 0.254202857795
all avg 0.161679746409 0.26575245352


In [34]:
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)

# Random forest: 5-fold out-of-fold and test predictions, pickled to disk.
# NOTE(review): the results are stored under the names lr_train / lr_pred even
# though the model is a random forest; the names are kept unchanged here.
tmp_params = dict(n_estimators=50, max_depth=10, random_state=42)
lr_train, lr_pred = cv_feat(
    RandomForestClassifier, fold_cnt=5, rnd=1, params=tmp_params
)
# `pickle` is not imported in this cell; it has to be in the kernel namespace already.
with open('../features/other_model_rf2.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
this fold train loss 0.117361609549 val loss 0.284550498281
this fold train loss 0.114624747072 val loss 0.290099689673
this fold train loss 0.124820530995 val loss 0.27762039581
this fold train loss 0.112714923692 val loss 0.274965110068
this fold train loss 0.119268526755 val loss 0.298419272298
all avg 0.117758067613 0.285130993226


In [7]:
# AdaBoost settings. The run itself and the pickle dump below are disabled
# (commented out), so this cell only assigns `tmp_params`.
tmp_params = dict(n_estimators=200, learning_rate=0.01, random_state=42)
#lr_train,lr_pred = cv_feat(AdaBoostClassifier,fold_cnt=5,params=tmp_params)
# with open('../features/other_model_ada.pkl','wb') as fout:
#     pickle.dump([lr_train,lr_pred],fout)

In [35]:
# Gradient boosting: 5-fold out-of-fold and test predictions, pickled to disk.
# NOTE(review): the results are stored under the names lr_train / lr_pred even
# though the model is a gradient-boosting classifier.
tmp_params = dict(n_estimators=500, learning_rate=0.1, random_state=42)
lr_train, lr_pred = cv_feat(
    GradientBoostingClassifier, fold_cnt=5, params=tmp_params
)
with open('../features/other_model_gbrt2.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)
this fold train loss 0.00538217353621 val loss 0.262754245818
this fold train loss 0.00629780179347 val loss 0.257350402834
this fold train loss 0.00590146581619 val loss 0.250531816604
this fold train loss 0.00501798708689 val loss 0.223542546626
this fold train loss 0.00531041430817 val loss 0.293378876565
all avg 0.00558196850819 0.25751157769


In [41]:
from xgboost import XGBClassifier

# XGBoost: 5-fold out-of-fold and test predictions, pickled to disk.
# NOTE(review): the results are stored under the names lr_train / lr_pred even
# though the model is an XGBoost classifier.
tmp_params = dict(n_estimators=200, colsample_bytree=1, min_child_weight=1)
lr_train, lr_pred = cv_feat(XGBClassifier, fold_cnt=5, params=tmp_params)
with open('../features/other_model_xgb2.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
this fold train loss 0.0470813498795 val loss 0.232911134502
this fold train loss 0.0491627649343 val loss 0.236815817551
this fold train loss 0.0530640187421 val loss 0.19998797233
this fold train loss 0.0501395934692 val loss 0.207976929297
this fold train loss 0.0463526003241 val loss 0.240199753553
all avg 0.0491600654698 0.223578321447


In [52]:
from lightgbm import LGBMClassifier

# LightGBM: 5-fold out-of-fold and test predictions, pickled to disk.
# NOTE(review): 'boosting' is passed on top of the estimator's own
# `boosting_type` (printed as 'gbdt' in the recorded output) - confirm which
# of the two LightGBM honours.
# NOTE(review): the results are stored under the names lr_train / lr_pred even
# though the model is a LightGBM classifier.
tmp_params = dict(max_depth=3, n_estimators=200, boosting='gbrt')
lr_train, lr_pred = cv_feat(LGBMClassifier, fold_cnt=5, params=tmp_params)
with open('../features/other_model_lgb2.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

LGBMClassifier(boosting='gbrt', boosting_type='gbdt', colsample_bytree=1.0,
        learning_rate=0.1, max_bin=255, max_depth=3, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=200,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0415159842129 val loss 0.229851511384


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0454491325444 val loss 0.240340455722


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0482162248667 val loss 0.197131015262


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0458085955759 val loss 0.21564618552


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.042941952219 val loss 0.25154883846
all avg 0.0447863778838 0.22690360127
