In [1]:
import pandas as pd
# Load the train and test sets from the JSON files under ../input.
train_df = pd.read_json('../input/train.json')
test_df = pd.read_json('../input/test.json')
# Plain-text preview of the first training rows.
print(train_df.head())

import numpy as np
from itertools import combinations
from scipy.ndimage import laplace, sobel
from scipy.stats import kurtosis, skew

                                              band_1  \
0  [-27.878360999999998, -27.15416, -28.668615, -...   
1  [-12.242375, -14.920304999999999, -14.920363, ...   
2  [-24.603676, -24.603714, -24.871029, -23.15277...   
3  [-22.454607, -23.082819, -23.998013, -23.99805...   
4  [-26.006956, -23.164886, -23.164886, -26.89116...   

                                              band_2        id inc_angle  \
0  [-27.154118, -29.537888, -31.0306, -32.190483,...  dfd5f913   43.9239   
1  [-31.506321, -27.984554, -26.645678, -23.76760...  e25388fd   38.1562   
2  [-24.870956, -24.092632, -20.653963, -19.41104...  58b2aaa0   45.2859   
3  [-27.889421, -27.519794, -27.165262, -29.10350...  4cfc3a18   43.8306   
4  [-27.206915, -30.259186, -30.259186, -23.16495...  271f93f4   35.6256   

   is_iceberg  
0           0  
1           0  
2           1  
3           0  
4           0  


In [2]:
def get_img_feat(img_org):
    """Return five summary statistics of an image as a list.

    The input array is flattened first; the result is
    ``[mean, std, median, max, min]`` in that order.
    """
    flat = img_org.ravel()
    reducers = (np.mean, np.std, np.median, np.max, np.min)
    return [reduce_fn(flat) for reduce_fn in reducers]

from skimage.feature import hog
def get_hog_feat(img):
    """Return the HOG descriptor of ``img`` as a flat 1-D array.

    ``block_norm`` is pinned to ``'L1'``, the value skimage applied
    implicitly when this notebook was run. Its deprecation warning states
    that the default becomes ``'L2-Hys'`` in v0.15, so relying on the default
    would silently change the extracted features after an upgrade.
    """
    return hog(img, block_norm='L1').ravel()

def get_other_feat(df):
    """Build one hand-crafted feature vector per row of ``df``.

    Each row must provide ``band_1`` and ``band_2`` (sequences that reshape
    to 75x75) and ``inc_angle`` (a value convertible to float, or the string
    ``'na'``).

    Per-row feature layout::

        [angle]
        + get_img_feat(band 1) + get_img_feat(band 2) + get_img_feat(combined)
        + get_hog_feat(band 1) + get_hog_feat(band 2)

    Returns a list holding one feature list per row, in iteration order.
    """
    feats = []
    for _, row in df.iterrows():
        img1 = np.array(row['band_1']).astype('float32').reshape(75, 75)
        img2 = np.array(row['band_2']).astype('float32').reshape(75, 75)

        # The string 'na' marks a missing angle; it is replaced by -1.
        raw_ang = row['inc_angle']
        ang = -1 if raw_ang == 'na' else float(raw_ang)

        # Mean of the two bands scaled by the angle.
        # NOTE(review): for 'na' rows the -1 sentinel flips the sign of this
        # image, so its statistics are not comparable with the other rows --
        # confirm this is intended.
        img3 = (img1 + img2) * ang / 2.0

        row_feat = [ang] + get_img_feat(img1) + get_img_feat(img2) + get_img_feat(img3)
        row_feat += list(get_hog_feat(img1)) + list(get_hog_feat(img2))
        feats.append(row_feat)
    return feats
        
        

# Extract the per-row feature lists for the train (a_angs) and test (b_angs)
# frames. Despite the names, each entry is a full feature vector, not just
# an angle.
a_angs = get_other_feat(train_df)
b_angs = get_other_feat(test_df)
print('raw feats')

/usr/local/lib/python3.5/dist-packages/skimage/feature/_hog.py:119: skimage_deprecation: Default value of `block_norm`==`L1` is deprecated and will be changed to `L2-Hys` in v0.15
  'be changed to `L2-Hys` in v0.15', skimage_deprecation)


raw feats


In [3]:
# Length of a single feature vector (first training row).
print(len(a_angs[0]))

7954


In [58]:
from sklearn import decomposition
# Number of principal components to keep.
comp = 100
# Fit a whitened PCA on the training features only, then project the test
# features with the same fitted transform.
pca_b1 = decomposition.PCA(n_components=comp, whiten=True, random_state=15)
train_feat = pca_b1.fit_transform(np.array(a_angs))
test_feat = pca_b1.transform(np.array(b_angs))

In [59]:
# Progress marker plus the shape of the reduced training matrix.
print('pca done',train_feat.shape)

pca done (1604, 100)


In [60]:
# Shapes of the reduced train and test matrices; the column counts must match.
print(train_feat.shape,test_feat.shape)

(1604, 100) (8424, 100)


In [61]:
# Target labels as a NumPy array, in the same row order as train_df.
y = train_df.is_iceberg.values
print(y[:5])

[0 0 1 0 0]


In [62]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
def cv_feat(model_f,fold_cnt=3,rnd=1,params=None):
    """Cross-validate a classifier and build out-of-fold and test predictions.

    Reads the module-level ``train_feat``, ``test_feat`` and ``y``.

    Parameters
    ----------
    model_f : callable
        Estimator factory, called as ``model_f(**params)``. The returned
        object must provide ``fit`` and ``predict_proba``.
    fold_cnt : int
        Number of KFold splits.
    rnd : int
        Seed knob; the KFold shuffle uses ``random_state=2*rnd``.
    params : dict or None
        Keyword arguments for ``model_f``. ``None`` means no arguments.

    Returns
    -------
    (train_pred, test_pred)
        Arrays of shape ``(len(train_feat), 1)`` and ``(len(test_feat), 1)``
        holding column 1 of ``predict_proba``. ``train_pred`` is filled from
        each fold's held-out rows; ``test_pred`` is the mean over the
        per-fold models.
    """
    # A fresh dict per call avoids sharing one mutable default across calls.
    if params is None:
        params = {}
    # Size the outputs from the data rather than hard-coding the row counts.
    train_pred = np.zeros((len(train_feat), 1))
    test_pred = np.zeros((len(test_feat), 1))
    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)
    avg_train_l,avg_val_l = 0,0
    print(model_f(**params))
    for train_index, val_index in kf.split(train_feat):
        curr_x,curr_y = train_feat[train_index],y[train_index]
        val_x,val_y = train_feat[val_index],y[val_index]

        # A new, unfitted model for every fold.
        model = model_f(**params)
        model.fit(curr_x,curr_y)

        curr_train_pred = model.predict_proba(curr_x)
        curr_val_pred = model.predict_proba(val_x)
        train_pred[val_index] = curr_val_pred[:,1].reshape(-1,1)
        # Each fold contributes 1/fold_cnt of the final test prediction.
        curr_test_pred = model.predict_proba(test_feat)/fold_cnt
        test_pred = test_pred + curr_test_pred[:,1].reshape(-1,1)

        loss1 = log_loss(curr_y,curr_train_pred)
        loss2 = log_loss(val_y,curr_val_pred)
        avg_train_l += loss1/fold_cnt
        avg_val_l += loss2/fold_cnt
        print('this fold train loss',loss1,'val loss',loss2)
        print('============================')
    print('all avg',avg_train_l,avg_val_l)
    return train_pred,test_pred
# Marker confirming that the cell defining cv_feat has executed.
print('def done')

def done


In [63]:
# lr
from sklearn.linear_model import LogisticRegression
# 5-fold cross-validation of logistic regression on the PCA features.
lr_train,lr_pred = cv_feat(LogisticRegression,
                           fold_cnt=5,
                           params={'C':4.0,'max_iter':100},rnd=2)
import pickle
# Persist [train predictions, test predictions] as a two-element list.
with open('../features/other_model_lr8.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

LogisticRegression(C=4.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
this fold train loss 0.269386466638 val loss 0.404149132428
this fold train loss 0.259653462977 val loss 0.440906650833
this fold train loss 0.284374275615 val loss 0.320693930304
this fold train loss 0.270894964677 val loss 0.392390795125
this fold train loss 0.283805048447 val loss 0.334623992915
all avg 0.273622843671 0.378552900321


In [64]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
# Random-forest hyperparameters passed through to cv_feat.
tmp_params = {
    'n_estimators':25,
    'max_depth':8,
    'random_state':42
}
# NOTE: lr_train/lr_pred are reused here and overwrite the logistic-regression
# results; from this point they hold the random-forest predictions.
lr_train,lr_pred = cv_feat(RandomForestClassifier,fold_cnt=5,params=tmp_params)
# Persist [train predictions, test predictions] as a two-element list.
with open('../features/other_model_rf8.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
this fold train loss 0.308165266592 val loss 0.483434186804
this fold train loss 0.301766994495 val loss 0.461749482516
this fold train loss 0.307211242299 val loss 0.436702338151
this fold train loss 0.314233922706 val loss 0.465284021533
this fold train loss 0.309547675901 val loss 0.472849105451
all avg 0.308185020399 0.464003826891


In [65]:
# tmp_params = {
#     'n_estimators':20,
#     'learning_rate':0.01,
#     'random_state':42
# }
# lr_train,lr_pred = cv_feat(AdaBoostClassifier,fold_cnt=5,params=tmp_params)
# with open('../features/other_model_ada4.pkl','wb') as fout:
#     pickle.dump([lr_train,lr_pred],fout)

In [66]:
# Gradient-boosting hyperparameters passed through to cv_feat.
tmp_params = {
    'n_estimators':230,
    'learning_rate':0.1,
    'random_state':42,
    'subsample':1.0,
    'min_samples_leaf':1,
    'max_depth':3
}
# NOTE: lr_train/lr_pred are reused and now hold the gradient-boosting
# predictions.
lr_train,lr_pred = cv_feat(GradientBoostingClassifier,fold_cnt=5,
                           params=tmp_params,
                           rnd=1
                          )
# Persist [train predictions, test predictions] as a two-element list.
with open('../features/other_model_gbrt8.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=230,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)
this fold train loss 0.0639251411234 val loss 0.339664064947
this fold train loss 0.0575079253514 val loss 0.326549729855
this fold train loss 0.0614482500407 val loss 0.295933624057
this fold train loss 0.0600693129791 val loss 0.311650222212
this fold train loss 0.054895951967 val loss 0.349832157115
all avg 0.0595693162923 0.324725959637


In [67]:
from xgboost import XGBClassifier
# XGBoost hyperparameters passed through to cv_feat. No random_state is set
# here, so the estimator uses its own default.
tmp_params = {
    'n_estimators':50,
    'colsample_bytree':1,
    'min_child_weight':1,
    'learning_rate':0.1,

    
}
# NOTE: lr_train/lr_pred are reused and now hold the XGBoost predictions.
lr_train,lr_pred = cv_feat(XGBClassifier,fold_cnt=5,
                           params=tmp_params,rnd=1)
# Persist [train predictions, test predictions] as a two-element list.
with open('../features/other_model_xgb8.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
this fold train loss 0.263670170265 val loss 0.388673091864
this fold train loss 0.271266560518 val loss 0.376387626559
this fold train loss 0.279249230797 val loss 0.34037718572
this fold train loss 0.270698858087 val loss 0.362372293241
this fold train loss 0.266488230815 val loss 0.371144251668
all avg 0.270274610096 0.367790889811


In [68]:
from lightgbm import LGBMClassifier
# LightGBM hyperparameters passed through to cv_feat. No random_state is set
# here, so the estimator uses its own default.
tmp_params = {
    'max_depth':3,  
    'n_estimators':500,
    #'min_child_samples':20,
    #'reg_lambda':0.1,
}
# NOTE: lr_train/lr_pred are reused and now hold the LightGBM predictions.
lr_train,lr_pred = cv_feat(LGBMClassifier,fold_cnt=5,params=tmp_params)
# Persist [train predictions, test predictions] as a two-element list.
with open('../features/other_model_lgb8.pkl','wb') as fout:
    pickle.dump([lr_train,lr_pred],fout)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=3, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=500,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.00961449217188 val loss 0.371308823045


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.00778771877931 val loss 0.397186026636


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.00973853810493 val loss 0.361505680826


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0100531103306 val loss 0.325669121834


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.00939053722775 val loss 0.374439296618
all avg 0.00931687932289 0.366021789792
