In [None]:
import pandas as pd
# Load the train and test sets from JSON files under ../input into DataFrames.
train_df = pd.read_json('../input/train.json')
test_df = pd.read_json('../input/test.json')
# Preview the first rows of the training frame.
print(train_df.head())

import numpy as np
import cv2
from itertools import combinations
from scipy.ndimage import laplace, sobel
from scipy.stats import kurtosis, skew

In [121]:
def get_img_feat(img_org):
    """Summarise an array's values with eight scalar statistics.

    The input is flattened first, so its shape does not matter.

    Returns a list in this order: mean, standard deviation, median, maximum,
    minimum, kurtosis, skewness and range (maximum - minimum).
    """
    flat = img_org.ravel()
    highest = np.max(flat)
    lowest = np.min(flat)
    return [
        np.mean(flat),
        np.std(flat),
        np.median(flat),
        highest,
        lowest,
        kurtosis(flat),
        skew(flat),
        highest - lowest,
    ]

def sob_feat(img):
    """Return the variance of all values in ``img`` as a one-element list."""
    flattened = img.ravel()
    variance = flattened.var()
    return [variance]
    
def _parse_inc_angle(value):
    """Convert a raw ``inc_angle`` entry to a float, mapping missing values to -1.

    The string ``'na'`` marks a missing angle. ``None`` and NaN are treated the
    same way so that a missing angle never shows up as NaN in the features.
    Any other value is passed to ``float`` (which raises on unparsable input).
    """
    if value is None or (isinstance(value, str) and value == 'na'):
        return -1.0
    ang = float(value)
    return -1.0 if np.isnan(ang) else ang


def get_other_feat(df, img_shape=(75, 75)):
    """Build a hand-crafted feature matrix from the two image bands of ``df``.

    Parameters
    ----------
    df : DataFrame with columns ``band_1``, ``band_2`` (flat pixel sequences
        that reshape to ``img_shape``) and ``inc_angle``.
    img_shape : shape each band is reshaped to; defaults to (75, 75).

    Returns
    -------
    float32 array with one row per row of ``df``. Column layout per row:
      1. ``get_img_feat`` statistics of band 1, then of band 2
      2. the incidence angle (-1 when missing)
      3. for each statistic: (band1 - band2), (band1 + band2), interleaved
      4. Laplacian var/mean/median/min/max, each as (band 1, band 2)
      5. variance of the Sobel response: band 1 axis 0, band 1 axis 1,
         band 2 axis 0, band 2 axis 1
    """
    feats = []
    # Iterate the three columns directly; cheaper than iterrows and the row
    # index is not needed.
    for band_1, band_2, raw_angle in zip(df['band_1'], df['band_2'], df['inc_angle']):
        img1 = np.array(band_1).reshape(img_shape).astype('float32')
        img2 = np.array(band_2).reshape(img_shape).astype('float32')
        ang = _parse_inc_angle(raw_angle)

        feat1 = get_img_feat(img1)
        feat2 = get_img_feat(img2)
        tmp_feat = feat1 + feat2 + [ang]
        # Cross-band difference and sum for every per-band statistic.
        for stat1, stat2 in zip(feat1, feat2):
            tmp_feat.append(stat1 - stat2)
            tmp_feat.append(stat1 + stat2)

        # With mode='reflect' the cval argument is ignored, so it is omitted.
        lap_1 = laplace(img1, mode='reflect')
        lap_2 = laplace(img2, mode='reflect')

        # Sobel responses along both axes of each band.
        sob1 = sobel(img1, axis=0, mode='reflect')
        sob2 = sobel(img1, axis=1, mode='reflect')
        sob3 = sobel(img2, axis=0, mode='reflect')
        sob4 = sobel(img2, axis=1, mode='reflect')

        tmp_feat += [lap_1.var(), lap_2.var(),
                     lap_1.mean(), lap_2.mean(),
                     np.median(lap_1), np.median(lap_2),
                     lap_1.min(), lap_2.min(),
                     lap_1.max(), lap_2.max()]
        for sob in (sob1, sob2, sob3, sob4):
            tmp_feat += sob_feat(sob)

        feats.append(tmp_feat)

    return np.array(feats).astype('float32')

# Build the feature matrices for both splits with the same extractor.
# These module-level arrays are read directly by cv_feat.
train_feat = get_other_feat(train_df)
test_feat = get_other_feat(test_df)

In [122]:
# Sanity check: print (rows, feature columns) for the train and test matrices.
print(train_feat.shape,test_feat.shape)

(1604, 47) (8424, 47)


In [123]:
# Target vector taken from the is_iceberg column of the training frame;
# read as a module-level global by cv_feat.
y = train_df.is_iceberg.values
# Show the first five labels.
print(y[:5])

[0 0 1 0 0]


In [124]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
def cv_feat(model_f,fold_cnt=3,rnd=1,params=None):
    """Cross-validate one model class and collect its predictions.

    Reads the module-level globals ``train_feat``, ``test_feat`` and ``y``.

    Parameters
    ----------
    model_f : callable returning an estimator with ``fit`` and
        ``predict_proba`` (e.g. a scikit-learn classifier class).
    fold_cnt : number of KFold splits.
    rnd : seed multiplier; the KFold shuffle uses ``random_state=2*rnd``.
    params : keyword arguments passed to ``model_f``; ``None`` means none.

    Returns
    -------
    (train_pred, test_pred) : two column vectors. ``train_pred`` holds, for
        every training row, column 1 of ``predict_proba`` from the fold in
        which that row was held out. ``test_pred`` is column 1 of
        ``predict_proba`` on ``test_feat`` averaged over all folds.

    Prints the model configuration, the per-fold train/validation log loss and
    their averages.
    """
    # Avoid a shared mutable default argument.
    if params is None:
        params = {}
    # Size the buffers from the data instead of hard-coding the row counts.
    n_train, n_test = train_feat.shape[0], test_feat.shape[0]
    train_pred, test_pred = np.zeros((n_train,1)),np.zeros((n_test,1))
    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)
    avg_train_l,avg_val_l = 0,0
    print(model_f(**params))
    for train_index, test_index in kf.split(train_feat):
        curr_x,curr_y = train_feat[train_index],y[train_index]
        val_x,val_y = train_feat[test_index],y[test_index]

        # A fresh model per fold so no state carries over between folds.
        model = model_f(**params)
        model.fit(curr_x,curr_y)

        curr_train_pred = model.predict_proba(curr_x)
        curr_val_pred = model.predict_proba(val_x)
        # Out-of-fold predictions: each training row is filled exactly once.
        train_pred[test_index] = curr_val_pred[:,1].reshape(-1,1)
        # Dividing by fold_cnt before summing yields the mean over folds.
        curr_test_pred = model.predict_proba(test_feat)/fold_cnt
        test_pred = test_pred + curr_test_pred[:,1].reshape(-1,1)

        loss1 = log_loss(curr_y,curr_train_pred)
        loss2 = log_loss(val_y,curr_val_pred)
        avg_train_l += loss1/fold_cnt
        avg_val_l += loss2/fold_cnt
        print('this fold train loss',loss1,'val loss',loss2)
        print('============================')
    print('all avg',avg_train_l,avg_val_l)
    return train_pred,test_pred
# Marker printed once the statements above (the cv_feat definition) have executed.
print('def done')

def done


In [125]:
# lr
# Logistic regression through cv_feat: 5 folds, KFold random_state = 2*rnd = 4.
from sklearn.linear_model import LogisticRegression
lr_train,lr_pred = cv_feat(LogisticRegression,
                           fold_cnt=5,
                           params={'C':2.0,'max_iter':100},rnd=2)
import pickle
# Persist [out-of-fold train predictions, fold-averaged test predictions].
with open('../features/other_model_lr4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
this fold train loss 0.28346680734 val loss 0.269383982705
this fold train loss 0.266844250564 val loss 0.342750661321
this fold train loss 0.283986089517 val loss 0.263473158322
this fold train loss 0.271168136823 val loss 0.316982235079
this fold train loss 0.279891935227 val loss 0.284484611506
all avg 0.277071443894 0.295414929787


In [126]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
# Keyword arguments forwarded to RandomForestClassifier by cv_feat.
tmp_params = {
    'n_estimators':20,
    'max_depth':5,
    'random_state':42
}
# 7 folds; rnd is left at its default of 1, so KFold uses random_state = 2.
# NOTE(review): the results are stored in lr_train/lr_pred although the model is a
# random forest; this overwrites the values assigned by the logistic-regression cell.
lr_train,lr_pred = cv_feat(RandomForestClassifier,fold_cnt=7,params=tmp_params)
# `pickle` is not imported in this cell; it must already be in scope from an earlier cell.
with open('../features/other_model_rf4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
this fold train loss 0.332741966762 val loss 0.395147960132
this fold train loss 0.324608489944 val loss 0.414022266324
this fold train loss 0.337654477224 val loss 0.370658694911
this fold train loss 0.338918728628 val loss 0.367910521598
this fold train loss 0.329491481291 val loss 0.364845784461
this fold train loss 0.32708683367 val loss 0.405368963986
this fold train loss 0.333871719534 val loss 0.370725549828
all avg 0.332053385293 0.384097105891


In [127]:
# tmp_params = {
#     'n_estimators':20,
#     'learning_rate':0.01,
#     'random_state':42
# }
# lr_train,lr_pred = cv_feat(AdaBoostClassifier,fold_cnt=5,params=tmp_params)
# with open('../features/other_model_ada4.pkl','wb') as fout:
#     pickle.dump([lr_train,lr_pred],fout)

In [128]:
# Keyword arguments forwarded to GradientBoostingClassifier by cv_feat.
# GradientBoostingClassifier and pickle must already be in scope from earlier cells.
tmp_params = {
    'n_estimators':230,
    'learning_rate':0.1,
    'random_state':42,
    'subsample':1.0,
    'min_samples_leaf':1,
    'max_depth':3
}
# 5 folds with rnd=1, so KFold uses random_state = 2.
# NOTE(review): lr_train/lr_pred are reused here for the gradient-boosting results.
lr_train,lr_pred = cv_feat(GradientBoostingClassifier,fold_cnt=5,
                           params=tmp_params,
                           rnd=1
                          )
# Persist [out-of-fold train predictions, fold-averaged test predictions].
with open('../features/other_model_gbrt4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=230,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)
this fold train loss 0.0484457957525 val loss 0.260340096733
this fold train loss 0.0518920103925 val loss 0.23600689074
this fold train loss 0.0536541821668 val loss 0.190907007327
this fold train loss 0.0543177994383 val loss 0.20983728902
this fold train loss 0.0485981545413 val loss 0.24498760639
all avg 0.0513815884583 0.228415778042


In [129]:
from xgboost import XGBClassifier
# Keyword arguments forwarded to XGBClassifier by cv_feat.
tmp_params = {
    'n_estimators':200,
    'colsample_bytree':1,
    'min_child_weight':1,
    'learning_rate':0.1,

    
}
# 5 folds with rnd=1, so KFold uses random_state = 2.
# NOTE(review): lr_train/lr_pred are reused here for the XGBoost results.
lr_train,lr_pred = cv_feat(XGBClassifier,fold_cnt=5,
                           params=tmp_params,rnd=1)
# Persist [out-of-fold train predictions, fold-averaged test predictions];
# `pickle` must already be in scope from an earlier cell.
with open('../features/other_model_xgb4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
this fold train loss 0.0707619651157 val loss 0.244624168607
this fold train loss 0.0771160590047 val loss 0.236731505637
this fold train loss 0.0818905922132 val loss 0.188375400413
this fold train loss 0.0781106261424 val loss 0.209318274038
this fold train loss 0.0739296283551 val loss 0.248443547755
all avg 0.0763617741662 0.22549857929


In [130]:
from lightgbm import LGBMClassifier
# Keyword arguments forwarded to LGBMClassifier by cv_feat.
tmp_params = {
    'max_depth':3,  
    'n_estimators':200,
    'min_child_samples':20,
    #'reg_lambda':0.1,
}
# 5 folds; rnd is left at its default of 1, so KFold uses random_state = 2.
# NOTE(review): lr_train/lr_pred are reused here for the LightGBM results.
lr_train,lr_pred = cv_feat(LGBMClassifier,fold_cnt=5,params=tmp_params)
# Persist [out-of-fold train predictions, fold-averaged test predictions];
# `pickle` must already be in scope from an earlier cell.
with open('../features/other_model_lgb4.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=3, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=200,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0653782255222 val loss 0.263467525753


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0779592295279 val loss 0.22497207784


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0779151253656 val loss 0.18646074989


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0764226885245 val loss 0.222260040529


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0697431038508 val loss 0.243907797654
all avg 0.0734836745582 0.228213638333
