In [1]:
import pandas as pd
# Load the training and test sets from the JSON files under ../input.
train_df = pd.read_json('../input/train.json')
test_df = pd.read_json('../input/test.json')
# Print the first rows of the training frame as a quick look at its columns.
print(train_df.head())

import numpy as np
import cv2
from itertools import combinations
from scipy.ndimage import laplace, sobel
from scipy.stats import kurtosis, skew

def get_img_feat(img_org):
    """Summarise one image band as a list of eight scalar statistics.

    The input array is flattened before any statistic is computed.

    Returns:
        A list ordered as: mean, standard deviation, median, maximum,
        minimum, kurtosis, skewness, and value range (maximum - minimum).
    """
    pixels = img_org.ravel()
    highest = np.max(pixels)
    lowest = np.min(pixels)
    stats = [
        np.mean(pixels),
        np.std(pixels),
        np.median(pixels),
        highest,
        lowest,
        kurtosis(pixels),
        skew(pixels),
        highest - lowest,
    ]
    return stats
    
def get_other_feat(df, img_shape=(75, 75)):
    """Build the hand-crafted feature matrix for every row of ``df``.

    Each row must provide ``band_1`` and ``band_2`` (flat pixel sequences)
    and ``inc_angle``. Per row the feature vector is laid out as:

    1. the statistics of band 1 from ``get_img_feat``,
    2. the statistics of band 2 from ``get_img_feat``,
    3. the incidence angle (``-1`` when it is the string ``'na'``),
    4. for each statistic, the difference then the sum of the two bands
       (interleaved: diff, sum, diff, sum, ...),
    5. the variance of the Laplacian of band 1 and of band 2,
    6. the variance of the Sobel response of band 1 along axis 0 and
       axis 1, followed by the same pair for band 2.

    With the eight statistics returned by ``get_img_feat`` this yields
    39 columns per row.

    Args:
        df: DataFrame with ``band_1``, ``band_2`` and ``inc_angle`` columns.
        img_shape: Shape each flat band is reshaped to. Defaults to
            ``(75, 75)``, the size previously hard-coded here.

    Returns:
        A float32 array with one row of features per row of ``df``.
    """
    feats = []
    # The row index is not needed; using ``_`` also avoids the previous
    # shadowing of the outer loop variable by the inner statistics loop.
    for _, row in df.iterrows():
        img1 = np.array(row['band_1']).reshape(img_shape).astype('float32')
        img2 = np.array(row['band_2']).reshape(img_shape).astype('float32')

        # A missing incidence angle is stored as the string 'na'; encode it
        # with the sentinel -1 so the column stays numeric.
        if row['inc_angle'] == 'na':
            ang = -1
        else:
            ang = float(row['inc_angle'])

        feat1 = get_img_feat(img1)
        feat2 = get_img_feat(img2)

        tmp_feat = feat1 + feat2 + [ang]
        # Cross-band interactions, kept in the original diff/sum interleaving.
        for stat1, stat2 in zip(feat1, feat2):
            tmp_feat.append(stat1 - stat2)
            tmp_feat.append(stat1 + stat2)

        # Variance of the Laplacian response of each band.
        lap_1 = laplace(img1, mode='reflect', cval=0.0).var()
        lap_2 = laplace(img2, mode='reflect', cval=0.0).var()

        # Variance of the Sobel response along each axis of each band.
        sob1 = sobel(img1, axis=0, mode='reflect', cval=0.0).var()
        sob2 = sobel(img1, axis=1, mode='reflect', cval=0.0).var()
        sob3 = sobel(img2, axis=0, mode='reflect', cval=0.0).var()
        sob4 = sobel(img2, axis=1, mode='reflect', cval=0.0).var()

        tmp_feat += [lap_1, lap_2, sob1, sob2, sob3, sob4]

        feats.append(tmp_feat)

    return np.array(feats).astype('float32')

# Compute the hand-crafted feature matrices for the training and test frames.
train_feat = get_other_feat(train_df)
test_feat = get_other_feat(test_df)

                                              band_1  \
0  [-27.878360999999998, -27.15416, -28.668615, -...   
1  [-12.242375, -14.920304999999999, -14.920363, ...   
2  [-24.603676, -24.603714, -24.871029, -23.15277...   
3  [-22.454607, -23.082819, -23.998013, -23.99805...   
4  [-26.006956, -23.164886, -23.164886, -26.89116...   

                                              band_2        id inc_angle  \
0  [-27.154118, -29.537888, -31.0306, -32.190483,...  dfd5f913   43.9239   
1  [-31.506321, -27.984554, -26.645678, -23.76760...  e25388fd   38.1562   
2  [-24.870956, -24.092632, -20.653963, -19.41104...  58b2aaa0   45.2859   
3  [-27.889421, -27.519794, -27.165262, -29.10350...  4cfc3a18   43.8306   
4  [-27.206915, -30.259186, -30.259186, -23.16495...  271f93f4   35.6256   

   is_iceberg  
0           0  
1           0  
2           1  
3           0  
4           0  


In [2]:
# Sanity check: both matrices should have the same number of feature columns.
print(train_feat.shape,test_feat.shape)

(1604, 39) (8424, 39)


In [3]:
# Target vector: the is_iceberg column of the training frame as a NumPy array.
y = train_df.is_iceberg.values
# Show the first five labels.
print(y[:5])

[0 0 1 0 0]


In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
def cv_feat(model_f,fold_cnt=3,rnd=1,params=None):
    """Cross-validate a classifier and collect stacking predictions.

    Uses the module-level ``train_feat``, ``y`` and ``test_feat``. For every
    fold a fresh ``model_f(**params)`` is fitted on the training part; its
    class-1 probabilities for the held-out part are written into the
    out-of-fold train predictions, and its class-1 probabilities for the
    test set are averaged over all folds. Per-fold and average log losses
    are printed.

    Args:
        model_f: Estimator class (or factory) whose instances provide
            ``fit`` and ``predict_proba``.
        fold_cnt: Number of KFold splits.
        rnd: Seed factor; the KFold ``random_state`` is ``2 * rnd``.
        params: Keyword arguments passed to ``model_f``. ``None`` (the
            default) means no arguments.

    Returns:
        ``(train_pred, test_pred)``: column vectors with one row per row of
        ``train_feat`` and ``test_feat`` respectively.
    """
    # Avoid a shared mutable default argument.
    if params is None:
        params = {}
    # Size the buffers from the data instead of hard-coding the row counts.
    n_train, n_test = train_feat.shape[0], test_feat.shape[0]
    train_pred, test_pred = np.zeros((n_train,1)),np.zeros((n_test,1))
    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)
    avg_train_l,avg_val_l = 0,0
    # Echo the fully-resolved model configuration once.
    print(model_f(**params))
    for train_index, test_index in kf.split(train_feat):
        curr_x,curr_y = train_feat[train_index],y[train_index]
        val_x,val_y = train_feat[test_index],y[test_index]

        model = model_f(**params)
        model.fit(curr_x,curr_y)

        curr_train_pred = model.predict_proba(curr_x)
        curr_val_pred = model.predict_proba(val_x)
        # Out-of-fold prediction: each training row is filled by the model
        # that did not see it during fitting.
        train_pred[test_index] = curr_val_pred[:,1].reshape(-1,1)
        # Dividing by fold_cnt before summing yields the fold average.
        curr_test_pred = model.predict_proba(test_feat)/fold_cnt
        test_pred = test_pred + curr_test_pred[:,1].reshape(-1,1)

        loss1 = log_loss(curr_y,curr_train_pred)
        loss2 = log_loss(val_y,curr_val_pred)
        avg_train_l += loss1/fold_cnt
        avg_val_l += loss2/fold_cnt
        print('this fold train loss',loss1,'val loss',loss2)
        print('============================')
    print('all avg',avg_train_l,avg_val_l)
    return train_pred,test_pred
# Marker output confirming that this cell's definitions have been executed.
print('def done')

def done


In [22]:
# Logistic regression: 6-fold CV through cv_feat (KFold seed factor rnd=2),
# then pickle the out-of-fold train predictions and fold-averaged test
# predictions as a two-element list.
from sklearn.linear_model import LogisticRegression
lr_train,lr_pred = cv_feat(LogisticRegression,
                           fold_cnt=6,
                           params={'C':2.0,'max_iter':100},rnd=2)
import pickle
# NOTE(review): open() does not create directories, so ../features is
# presumably created beforehand -- confirm.
with open('../features/other_model_lr4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
this fold train loss 0.287731041382 val loss 0.262732928163
this fold train loss 0.27989720333 val loss 0.310595262597
this fold train loss 0.277975666022 val loss 0.311678439165
this fold train loss 0.277803243272 val loss 0.314822358111
this fold train loss 0.281082104942 val loss 0.294895949458
this fold train loss 0.285190185047 val loss 0.276584347618
all avg 0.281613240666 0.295218214185


In [41]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
# Random forest settings: 30 trees of depth at most 5, fixed seed.
tmp_params = {
    'n_estimators':30,
    'max_depth':5,
    'random_state':42
}
# NOTE(review): the random-forest results are stored under the names
# lr_train/lr_pred, overwriting whatever those names held before.
lr_train,lr_pred = cv_feat(RandomForestClassifier,fold_cnt=7,params=tmp_params)
# Persist [train predictions, test predictions] for this model.
with open('../features/other_model_rf4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
this fold train loss 0.332707916851 val loss 0.416499510302
this fold train loss 0.332503579345 val loss 0.393370767949
this fold train loss 0.331014507639 val loss 0.386497911175
this fold train loss 0.330437118176 val loss 0.371705114631
this fold train loss 0.333927795106 val loss 0.367035762438
this fold train loss 0.324476612527 val loss 0.410691940824
this fold train loss 0.331683319328 val loss 0.362861542234
all avg 0.330964406996 0.386951792793


In [7]:
# AdaBoost settings. The run and the pickle dump below are commented out,
# so this cell only assigns tmp_params.
tmp_params = {
    'n_estimators':200,
    'learning_rate':0.01,
    'random_state':42
}
#lr_train,lr_pred = cv_feat(AdaBoostClassifier,fold_cnt=5,params=tmp_params)
# with open('../features/other_model_ada.pkl','wb') as fout:
#     pickle.dump([lr_train,lr_pred],fout)

In [59]:
# Gradient boosting settings: 230 depth-3 trees, learning rate 0.1, no
# row subsampling, fixed seed.
tmp_params = {
    'n_estimators':230,
    'learning_rate':0.1,
    'random_state':42,
    'subsample':1.0,
    'min_samples_leaf':1,
    'max_depth':3
}
# NOTE(review): results are stored under lr_train/lr_pred although they come
# from GradientBoostingClassifier.
lr_train,lr_pred = cv_feat(GradientBoostingClassifier,fold_cnt=5,
                           params=tmp_params,
                           rnd=1
                          )
# Persist [train predictions, test predictions] for this model.
with open('../features/other_model_gbrt4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=230,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)
this fold train loss 0.0495707480273 val loss 0.252043622088
this fold train loss 0.0546954703019 val loss 0.233812466356
this fold train loss 0.0596918157523 val loss 0.190514160306
this fold train loss 0.0545535026418 val loss 0.208425240605
this fold train loss 0.0539377536118 val loss 0.244591307856
all avg 0.054489858067 0.225877359442


In [81]:
from xgboost import XGBClassifier
# XGBoost settings: 200 trees, learning rate 0.1; other options are left at
# the library defaults.
tmp_params = {
    'n_estimators':200,
    'colsample_bytree':1,
    'min_child_weight':1,
    'learning_rate':0.1,

    
}
# NOTE(review): results are stored under lr_train/lr_pred although they come
# from XGBClassifier.
lr_train,lr_pred = cv_feat(XGBClassifier,fold_cnt=5,
                           params=tmp_params,rnd=1)
# Persist [train predictions, test predictions] for this model.
with open('../features/other_model_xgb4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
this fold train loss 0.0760642053151 val loss 0.243601235928
this fold train loss 0.0804241277613 val loss 0.23459369554
this fold train loss 0.0826821901988 val loss 0.19195923385
this fold train loss 0.082669580743 val loss 0.212452242594
this fold train loss 0.0765358782398 val loss 0.244702244014
all avg 0.0796751964516 0.225461730385


In [99]:
from lightgbm import LGBMClassifier
# LightGBM settings: 200 trees of depth at most 3, at least 20 samples per
# leaf; the reg_lambda override is currently disabled.
tmp_params = {
    'max_depth':3,  
    'n_estimators':200,
    'min_child_samples':20,
    #'reg_lambda':0.1,
}
# NOTE(review): results are stored under lr_train/lr_pred although they come
# from LGBMClassifier.
lr_train,lr_pred = cv_feat(LGBMClassifier,fold_cnt=5,params=tmp_params)
# Persist [train predictions, test predictions] for this model.
with open('../features/other_model_lgb4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=3, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=200,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0675601854902 val loss 0.25630813982


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0767522518115 val loss 0.235380362336


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0790560033704 val loss 0.199976805237


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0810209224713 val loss 0.225123786418


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0732322664606 val loss 0.248995267605
all avg 0.0755243259208 0.233156872283
