In [17]:
import pandas as pd
# Load both competition splits from their JSON files and preview the
# first rows of each frame as plain text.
train_df = pd.read_json('../input/train.json')
test_df = pd.read_json('../input/test.json')
print(train_df.head(), test_df.head(), sep='\n')

import numpy as np

from itertools import combinations
from scipy.ndimage import laplace, sobel
from scipy.stats import kurtosis, skew

                                              band_1  \
0  [-27.878360999999998, -27.15416, -28.668615, -...   
1  [-12.242375, -14.920304999999999, -14.920363, ...   
2  [-24.603676, -24.603714, -24.871029, -23.15277...   
3  [-22.454607, -23.082819, -23.998013, -23.99805...   
4  [-26.006956, -23.164886, -23.164886, -26.89116...   

                                              band_2        id inc_angle  \
0  [-27.154118, -29.537888, -31.0306, -32.190483,...  dfd5f913   43.9239   
1  [-31.506321, -27.984554, -26.645678, -23.76760...  e25388fd   38.1562   
2  [-24.870956, -24.092632, -20.653963, -19.41104...  58b2aaa0   45.2859   
3  [-27.889421, -27.519794, -27.165262, -29.10350...  4cfc3a18   43.8306   
4  [-27.206915, -30.259186, -30.259186, -23.16495...  271f93f4   35.6256   

   is_iceberg  
0           0  
1           0  
2           1  
3           0  
4           0  
                                              band_1  \
0  [-15.863251, -15.201077, -17.887735, -19.17248...  

In [18]:


def get_img_feat(img):
    """Return five summary statistics of ``img``.

    The result is a list in the fixed order
    ``[mean, std, median, max, min]``, each computed over all elements
    of ``img`` with the corresponding NumPy reduction.
    """
    reducers = (np.mean, np.std, np.median, np.max, np.min)
    return [reduce_fn(img) for reduce_fn in reducers]
    
def get_other_feat(df):
    """Build flattened band vectors and a handcrafted feature row per sample.

    Every row of ``df`` must provide ``band_1`` and ``band_2`` (each
    reshaped to a 75x75 float32 image) and ``inc_angle``; the string
    ``'na'`` is mapped to an angle of -1.

    Returns
    -------
    tuple
        ``(band1, band2, band3, band4, feats)`` - five lists with one
        entry per row:

        * ``band1`` / ``band2``: the flattened input images,
        * ``band3``: flattened ``(img1 + img2) * ang / 2.0``,
        * ``band4``: flattened ``(img1 - img2) * ang / 2.0``,
        * ``feats``: the handcrafted feature list (summary statistics,
          their pairwise products/sums/differences, Laplace and Sobel
          response statistics, histogram features, and statistics of
          the two angle-scaled images).
    """
    filter_opts = dict(mode='reflect', cval=0.0)
    band1, band2, band3, band4 = [], [], [], []
    feats = []
    for _, sample in df.iterrows():
        first = np.array(sample['band_1']).reshape(75, 75).astype('float32')
        second = np.array(sample['band_2']).reshape(75, 75).astype('float32')
        raw_angle = sample['inc_angle']
        ang = -1 if raw_angle == 'na' else float(raw_angle)
        scaled_sum = (first + second) * ang / 2.0
        scaled_diff = (first - second) * ang / 2.0
        stores = (band1, band2, band3, band4)
        images = (first, second, scaled_sum, scaled_diff)
        for store, image in zip(stores, images):
            store.append(image.ravel())

        # base: per-band statistics plus the angle, then every pairwise
        # product, sum and difference of those eleven values
        base = get_img_feat(first) + get_img_feat(second) + [ang]
        pairs = list(combinations(base, 2))
        row_feat = list(base)
        row_feat.extend(x * y for x, y in pairs)
        row_feat.extend(x + y for x, y in pairs)
        row_feat.extend(x - y for x, y in pairs)

        # lap + sob: Laplace of each band, then Sobel in the order
        # (band 1, axis 0), (band 1, axis 1), (band 2, axis 0), (band 2, axis 1)
        responses = [laplace(first, **filter_opts), laplace(second, **filter_opts)]
        responses += [
            sobel(image, axis=axis, **filter_opts)
            for image in (first, second)
            for axis in (0, 1)
        ]
        for response in responses:
            row_feat.extend(get_img_feat(response))

        # hist: 20-bin counts of each band followed by std, max, median
        # and (max - median) of those counts
        for image in (first, second):
            counts = list(np.histogram(image, bins=20)[0])
            peak, middle = np.max(counts), np.median(counts)
            row_feat += counts
            row_feat += [np.std(counts), peak, middle, (peak - middle)]

        row_feat.extend(get_img_feat(scaled_sum) + get_img_feat(scaled_diff))
        feats.append(row_feat)
    return band1, band2, band3, band4, feats

# Extract the flattened bands and handcrafted features for both splits.
# Despite their names, a_angs / b_angs receive the fifth value returned by
# get_other_feat, i.e. the full handcrafted feature rows, not only angles.
(a_band1, a_band2, a_band3, a_band4, a_angs) = get_other_feat(train_df)
(b_band1, b_band2, b_band3, b_band4, b_angs) = get_other_feat(test_df)
print('raw feats')

raw feats


In [19]:
from sklearn import decomposition
# One whitened PCA per band (each with its own seed), fitted on the
# training bands only and then applied unchanged to the test bands.
comp = 50
pca_b1, pca_b2, pca_b3, pca_b4 = [
    decomposition.PCA(n_components=comp, whiten=True, random_state=seed)
    for seed in (15, 16, 17, 18)
]

a_band1_feat, a_band2_feat, a_band3_feat, a_band4_feat = [
    pca.fit_transform(np.array(band))
    for pca, band in zip(
        (pca_b1, pca_b2, pca_b3, pca_b4),
        (a_band1, a_band2, a_band3, a_band4),
    )
]

b_band1_feat, b_band2_feat, b_band3_feat, b_band4_feat = [
    pca.transform(np.array(band))
    for pca, band in zip(
        (pca_b1, pca_b2, pca_b3, pca_b4),
        (b_band1, b_band2, b_band3, b_band4),
    )
]

print('pca done',a_band1_feat.shape)

pca done (1604, 50)


In [20]:
# Joint PCA: place the four flattened bands side by side for each sample,
# fit a 60-component whitened PCA on train and project test with it.
a_bands = np.concatenate((a_band1, a_band2, a_band3, a_band4), axis=1)
b_bands = np.concatenate((b_band1, b_band2, b_band3, b_band4), axis=1)
pca_bx = decomposition.PCA(n_components=60, whiten=True, random_state=15)
a_band_feat = pca_bx.fit_transform(np.array(a_bands))
b_band_feat = pca_bx.transform(np.array(b_bands))
print('pca done',a_band_feat.shape)

pca done (1604, 60)


In [21]:
# Final design matrices: the four per-band PCA blocks, the joint PCA block
# and the handcrafted feature rows, joined column-wise in that order.
train_feat = np.concatenate(
    (
        a_band1_feat,
        a_band2_feat,
        a_band3_feat,
        a_band4_feat,
        a_band_feat,
        np.array(a_angs),
    ),
    axis=1,
)
test_feat = np.concatenate(
    (
        b_band1_feat,
        b_band2_feat,
        b_band3_feat,
        b_band4_feat,
        b_band_feat,
        np.array(b_angs),
    ),
    axis=1,
)

print(train_feat.shape,test_feat.shape)

(1604, 524) (8424, 524)


In [22]:
import pickle
# Persist both feature matrices as a two-element list [train, test].
with open('other_feat_v2.pkl', 'wb') as fout:
    pickle.dump(
        [train_feat, test_feat],
        fout,
    )
print('done')

done


In [23]:
# Target vector taken from the is_iceberg column; show the first five labels.
y = train_df['is_iceberg'].values
print(y[:5])

[0 0 1 0 0]


In [24]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
def cv_feat(model_f,fold_cnt=3,rnd=1,params=None):
    """Cross-validate one classifier and collect out-of-fold and test predictions.

    Reads the notebook-level globals ``train_feat``, ``test_feat`` and ``y``.

    Parameters
    ----------
    model_f : callable
        Factory called as ``model_f(**params)``; the returned object must
        provide ``fit`` and ``predict_proba``.
    fold_cnt : int
        Number of K-fold splits.
    rnd : int
        The shuffled K-fold split is seeded with ``2 * rnd``.
    params : dict or None
        Keyword arguments for ``model_f``; ``None`` means no arguments.

    Returns
    -------
    train_pred : ndarray of shape (n_train, 1)
        Out-of-fold value of ``predict_proba`` column 1 for every training row.
    test_pred : ndarray of shape (n_test, 1)
        ``predict_proba`` column 1 on ``test_feat``, averaged over the folds.
    """
    # A None sentinel avoids sharing one mutable default dict between calls.
    if params is None:
        params = {}
    # Size the buffers from the data rather than hard-coding the row counts
    # of one particular dataset.
    n_train, n_test = train_feat.shape[0], test_feat.shape[0]
    train_pred, test_pred = np.zeros((n_train, 1)), np.zeros((n_test, 1))
    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)
    avg_train_l,avg_val_l = 0,0
    # Echo the configured estimator so the parameters are part of the log.
    print(model_f(**params))
    for train_index, test_index in kf.split(train_feat):
        curr_x,curr_y = train_feat[train_index],y[train_index]
        val_x,val_y = train_feat[test_index],y[test_index]

        # A fresh model per fold, so nothing carries over between folds.
        model = model_f(**params)
        model.fit(curr_x,curr_y)

        curr_train_pred = model.predict_proba(curr_x)
        curr_val_pred = model.predict_proba(val_x)
        # Each training row is predicted only by the fold that held it out.
        train_pred[test_index] = curr_val_pred[:,1].reshape(-1,1)
        # Dividing by fold_cnt before summing yields the mean over folds.
        curr_test_pred = model.predict_proba(test_feat)/fold_cnt
        test_pred = test_pred + curr_test_pred[:,1].reshape(-1,1)

        loss1 = log_loss(curr_y,curr_train_pred)
        loss2 = log_loss(val_y,curr_val_pred)
        avg_train_l += loss1/fold_cnt
        avg_val_l += loss2/fold_cnt
        print('this fold train loss',loss1,'val loss',loss2)
        print('============================')
    print('all avg',avg_train_l,avg_val_l)
    return train_pred,test_pred
print('def done')

def done


In [28]:
# lr
from sklearn.linear_model import LogisticRegression
# 5-fold logistic regression; the out-of-fold and test predictions are
# saved together as a two-element list.
lr_train, lr_pred = cv_feat(
    LogisticRegression,
    fold_cnt=5,
    rnd=2,
    params={'C': 2.0, 'max_iter': 30},
)
import pickle
with open('../2nd_features/other_model_lr7.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=30, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
this fold train loss 0.207241946492 val loss 0.286242288204
this fold train loss 0.21964769911 val loss 0.320364011012
this fold train loss 0.225364154965 val loss 0.279900130972
this fold train loss 0.214315087908 val loss 0.290438618302
this fold train loss 0.20934601504 val loss 0.29697624655
all avg 0.215182980703 0.294784259008


In [29]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
# 5-fold random forest. NOTE: the lr_train / lr_pred names are reused here
# and now hold the random-forest predictions.
tmp_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'random_state': 42,
}
lr_train, lr_pred = cv_feat(
    RandomForestClassifier,
    fold_cnt=5,
    params=tmp_params,
)
with open('../2nd_features/other_model_rf7.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
this fold train loss 0.129296674685 val loss 0.30854446922
this fold train loss 0.124918438728 val loss 0.306518453839
this fold train loss 0.126731662737 val loss 0.273198313406
this fold train loss 0.122876105306 val loss 0.306134897742
this fold train loss 0.121726111663 val loss 0.307918551165
all avg 0.125109798624 0.300462937074


In [30]:
# 5-fold gradient boosting. NOTE: the lr_train / lr_pred names are reused
# here and now hold the gradient-boosting predictions.
tmp_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'random_state': 42,
    'subsample': 1.0,
    'min_samples_leaf': 1,
    'max_depth': 3,
}
lr_train, lr_pred = cv_feat(
    GradientBoostingClassifier,
    fold_cnt=5,
    rnd=1,
    params=tmp_params,
)
with open('../2nd_features/other_model_gbrt7.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)
this fold train loss 0.0283999852997 val loss 0.258734573524
this fold train loss 0.0307841721068 val loss 0.193073629645
this fold train loss 0.0278715195163 val loss 0.186784206681
this fold train loss 0.0292057013911 val loss 0.264800696067
this fold train loss 0.0244842934591 val loss 0.255111447009
all avg 0.0281491343546 0.231700910585


In [31]:
from xgboost import XGBClassifier
# 5-fold XGBoost. NOTE: the lr_train / lr_pred names are reused here and
# now hold the XGBoost predictions.
tmp_params = {
    'n_estimators': 200,
    'colsample_bytree': 1,
    'min_child_weight': 1,
    'learning_rate': 0.1,
}
lr_train, lr_pred = cv_feat(
    XGBClassifier,
    fold_cnt=5,
    rnd=1,
    params=tmp_params,
)
with open('../2nd_features/other_model_xgb7.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
this fold train loss 0.021899639521 val loss 0.236528103225
this fold train loss 0.0249325548873 val loss 0.183005113788
this fold train loss 0.0252389829969 val loss 0.185084255211
this fold train loss 0.0226569409003 val loss 0.237792021364
this fold train loss 0.0223828670976 val loss 0.254871491637
all avg 0.0234221970806 0.219456197045


In [32]:
from lightgbm import LGBMClassifier
# 5-fold LightGBM. NOTE: the lr_train / lr_pred names are reused here and
# now hold the LightGBM predictions.
tmp_params = {
    'max_depth': 3,
    'n_estimators': 200,
    #'min_child_samples':20,
    #'reg_lambda':0.1,
}
lr_train, lr_pred = cv_feat(
    LGBMClassifier,
    fold_cnt=5,
    params=tmp_params,
)
with open('../2nd_features/other_model_lgb7.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=3, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=200,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0173393599209 val loss 0.238832237991


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0198665303243 val loss 0.192208286688


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0194814666523 val loss 0.178429272092


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0171481851864 val loss 0.237144395837


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0169822979942 val loss 0.251440546473
all avg 0.0181635680156 0.219610947816
