In [11]:
import pandas as pd
# Read the train and test JSON dumps in one pass, then preview the first training rows.
train_df, test_df = (
    pd.read_json(json_path)
    for json_path in ('../input/train.json', '../input/test.json')
)
print(train_df.head())

import numpy as np
import cv2
from itertools import combinations
from scipy.ndimage import laplace, sobel

def get_img_feat(img):
    """Summarise ``img`` as the list [mean, std, median, max, min] over all its values."""
    summary_fns = (np.mean, np.std, np.median, np.max, np.min)
    return [summarise(img) for summarise in summary_fns]
    
def _hist_feat(img, bins=20):
    """Return the ``bins`` histogram counts of ``img`` plus four statistics of those counts.

    The appended statistics are, in order: std, max, median and (max - median)
    of the bin counts.
    """
    counts = list(np.histogram(img, bins=bins)[0])
    peak, middle = np.max(counts), np.median(counts)
    return counts + [np.std(counts), peak, middle, peak - middle]


def get_other_feat(df, img_shape=(75, 75)):
    """Build a hand-crafted feature matrix from the two image bands of each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``band_1``, ``band_2`` (flat pixel sequences
        that can be reshaped to ``img_shape``) and ``inc_angle`` (a number, or
        the string ``'na'`` when missing).
    img_shape : tuple of int, optional
        Shape each flat band is reshaped to. Defaults to ``(75, 75)``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one row per row of ``df``. Per row, in order:
        the 16 base values (5 summary statistics for band 1, band 2 and their
        mean image, plus the angle); all pairwise products, sums and
        differences of those base values; summary statistics of the Laplacian
        of each image; summary statistics of the Sobel response along axis 0
        and axis 1 of each image; and a 20-bin histogram (plus 4 count
        statistics) of each image.
    """
    feats = []
    for _, row in df.iterrows():
        img1 = np.array(row['band_1']).reshape(img_shape).astype('float32')
        img2 = np.array(row['band_2']).reshape(img_shape).astype('float32')
        img3 = (img1 + img2)/2
        images = (img1, img2, img3)

        # A missing incidence angle is encoded as the string 'na'; it is mapped to 0.
        ang = 0 if row['inc_angle'] == 'na' else float(row['inc_angle'])

        # base: per-image summary statistics, the angle, and pairwise interactions
        base = [value for img in images for value in get_img_feat(img)] + [ang]
        pairs = list(combinations(base, 2))
        tmp_feat = list(base)
        tmp_feat += [x * y for x, y in pairs]
        tmp_feat += [x + y for x, y in pairs]
        tmp_feat += [x - y for x, y in pairs]

        # lap: statistics of the Laplacian of each image
        for img in images:
            tmp_feat += get_img_feat(laplace(img, mode='reflect', cval=0.0))

        # sob: statistics of the Sobel response, axis 0 then axis 1, image by image
        for img in images:
            for axis in (0, 1):
                tmp_feat += get_img_feat(sobel(img, axis=axis, mode='reflect', cval=0.0))

        # hist: histogram counts and count statistics of each image
        for img in images:
            tmp_feat += _hist_feat(img)

        feats.append(tmp_feat)

    return np.array(feats).astype('float32')

# Run the same feature extraction over the training frame and then the test frame.
train_feat, test_feat = map(get_other_feat, (train_df, test_df))

                                              band_1  \
0  [-27.878360999999998, -27.15416, -28.668615, -...   
1  [-12.242375, -14.920304999999999, -14.920363, ...   
2  [-24.603676, -24.603714, -24.871029, -23.15277...   
3  [-22.454607, -23.082819, -23.998013, -23.99805...   
4  [-26.006956, -23.164886, -23.164886, -26.89116...   

                                              band_2        id inc_angle  \
0  [-27.154118, -29.537888, -31.0306, -32.190483,...  dfd5f913   43.9239   
1  [-31.506321, -27.984554, -26.645678, -23.76760...  e25388fd   38.1562   
2  [-24.870956, -24.092632, -20.653963, -19.41104...  58b2aaa0   45.2859   
3  [-27.889421, -27.519794, -27.165262, -29.10350...  4cfc3a18   43.8306   
4  [-27.206915, -30.259186, -30.259186, -23.16495...  271f93f4   35.6256   

   is_iceberg  
0           0  
1           0  
2           1  
3           0  
4           0  


In [12]:
# Models are fit on train_feat and then applied to test_feat, so both matrices
# must have the same feature columns; fail here rather than inside a model.
assert train_feat.shape[1:] == test_feat.shape[1:], 'train/test feature width mismatch'
print(train_feat.shape,test_feat.shape)

(1604, 493) (8424, 493)


In [13]:
# Target vector taken from the is_iceberg column; show the first five labels.
y = train_df['is_iceberg'].values
print(y[:5])

[0 0 1 0 0]


In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
def cv_feat(model_f,fold_cnt=3,rnd=1,params=None):
    """Cross-validate one classifier and collect out-of-fold and test predictions.

    Reads the notebook-level globals ``train_feat``, ``y`` and ``test_feat``.

    Parameters
    ----------
    model_f : callable
        Classifier factory called as ``model_f(**params)``; the resulting
        object must provide ``fit`` and ``predict_proba``.
    fold_cnt : int, optional
        Number of K-fold splits.
    rnd : int, optional
        The fold shuffle is seeded with ``2 * rnd``.
    params : dict or None, optional
        Keyword arguments for ``model_f``; ``None`` means no arguments.

    Returns
    -------
    (train_pred, test_pred) : tuple of numpy.ndarray
        ``train_pred`` has shape ``(len(train_feat), 1)`` and holds, for each
        training row, the class-1 probability from the fold in which that row
        was held out. ``test_pred`` has shape ``(len(test_feat), 1)`` and holds
        the class-1 probability on ``test_feat`` averaged over the folds.
    """
    if params is None:
        params = {}
    # Size the buffers from the data rather than from hard-coded row counts.
    train_pred = np.zeros((len(train_feat), 1))
    test_pred = np.zeros((len(test_feat), 1))
    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)
    avg_train_l,avg_val_l = 0,0
    # Print one instance so the notebook output records the configuration used.
    print(model_f(**params))
    for train_index, val_index in kf.split(train_feat):
        curr_x,curr_y = train_feat[train_index],y[train_index]
        val_x,val_y = train_feat[val_index],y[val_index]

        model = model_f(**params)
        model.fit(curr_x,curr_y)

        curr_train_pred = model.predict_proba(curr_x)
        curr_val_pred = model.predict_proba(val_x)
        # Out-of-fold predictions: each row is filled by the model that did not see it.
        train_pred[val_index] = curr_val_pred[:,1].reshape(-1,1)
        # Dividing by fold_cnt before summing makes test_pred the mean over folds.
        curr_test_pred = model.predict_proba(test_feat)/fold_cnt
        test_pred = test_pred + curr_test_pred[:,1].reshape(-1,1)

        loss1 = log_loss(curr_y,curr_train_pred)
        loss2 = log_loss(val_y,curr_val_pred)
        avg_train_l += loss1/fold_cnt
        avg_val_l += loss2/fold_cnt
        print('this fold train loss',loss1,'val loss',loss2)
        print('============================')
    print('all avg',avg_train_l,avg_val_l)
    return train_pred,test_pred
# Progress marker printed once the definitions in this cell have been executed.
print('def done')

def done


In [15]:
# Logistic regression: 5-fold cross-validation, then pickle the pair
# [out-of-fold train predictions, fold-averaged test predictions].
from sklearn.linear_model import LogisticRegression
import pickle
lr_train, lr_pred = cv_feat(
    LogisticRegression,
    fold_cnt=5,
    params=dict(C=2.0, max_iter=100),
)
with open('../features/other_model_lr3.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
this fold train loss 0.144876200595 val loss 0.361868364692
this fold train loss 0.148391356691 val loss 0.365346893886
this fold train loss 0.167208677689 val loss 0.193670177927
this fold train loss 0.140235635263 val loss 0.429489179487
this fold train loss 0.142843200207 val loss 0.418051705378
all avg 0.148711014089 0.353685264274


In [16]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
# Random forest: 5-fold cross-validation, predictions pickled to disk.
# NOTE: lr_train / lr_pred are reused names; here they hold random-forest output.
tmp_params = dict(n_estimators=50, max_depth=10, random_state=42)
lr_train, lr_pred = cv_feat(
    RandomForestClassifier,
    fold_cnt=5,
    params=tmp_params,
)
with open('../features/other_model_rf3.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
this fold train loss 0.120126313925 val loss 0.2936809979
this fold train loss 0.122748232456 val loss 0.296912808996
this fold train loss 0.127886004863 val loss 0.254783445403
this fold train loss 0.1174082328 val loss 0.287328474347
this fold train loss 0.12031557239 val loss 0.287901754522
all avg 0.121696871287 0.284121496234


In [17]:
# AdaBoost settings. The run itself is commented out below, so the only live
# effect of this cell is assigning tmp_params.
tmp_params = dict(n_estimators=200, learning_rate=0.01, random_state=42)
#lr_train,lr_pred = cv_feat(AdaBoostClassifier,fold_cnt=5,params=tmp_params)
# with open('../features/other_model_ada.pkl','wb') as fout:
#     pickle.dump([lr_train,lr_pred],fout)

In [18]:
# Gradient boosting: 5-fold cross-validation, predictions pickled to disk.
# NOTE: lr_train / lr_pred are reused names; here they hold gradient-boosting output.
tmp_params = dict(n_estimators=200, learning_rate=0.1, random_state=42)
lr_train, lr_pred = cv_feat(
    GradientBoostingClassifier,
    fold_cnt=5,
    params=tmp_params,
)
with open('../features/other_model_gbrt3.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)
this fold train loss 0.0401790279744 val loss 0.265526443499
this fold train loss 0.0421715707481 val loss 0.224438066165
this fold train loss 0.0474347214469 val loss 0.177007093279
this fold train loss 0.0395962294178 val loss 0.216851814365
this fold train loss 0.0379853451897 val loss 0.259199549207
all avg 0.0414733789554 0.228604593303


In [19]:
from xgboost import XGBClassifier
# XGBoost: 5-fold cross-validation, predictions pickled to disk.
# NOTE: lr_train / lr_pred are reused names; here they hold XGBoost output.
tmp_params = dict(n_estimators=200, colsample_bytree=1, min_child_weight=1)
lr_train, lr_pred = cv_feat(
    XGBClassifier,
    fold_cnt=5,
    params=tmp_params,
)
with open('../features/other_model_xgb3.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
this fold train loss 0.0329529985874 val loss 0.255709478286
this fold train loss 0.0370144478323 val loss 0.230055434636
this fold train loss 0.0384466781268 val loss 0.168870448389
this fold train loss 0.0351772871207 val loss 0.216290700139
this fold train loss 0.0311763129352 val loss 0.246747610231
all avg 0.0349535449205 0.223534734336


In [20]:
from lightgbm import LGBMClassifier
# LightGBM: 5-fold cross-validation, predictions pickled to disk.
# NOTE: lr_train / lr_pred are reused names; here they hold LightGBM output.
tmp_params = dict(max_depth=3, n_estimators=200)
lr_train, lr_pred = cv_feat(
    LGBMClassifier,
    fold_cnt=5,
    params=tmp_params,
)
with open('../features/other_model_lgb3.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=3, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=200,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0281136680849 val loss 0.269864855251


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0324313124827 val loss 0.234726841288


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0332933274586 val loss 0.178377831445


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0289990992101 val loss 0.225836234432


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0287024773518 val loss 0.250443071345
all avg 0.0303079769176 0.231849766752
