In [85]:
import pandas as pd
# Read the train and test JSON dumps (paths are relative to the notebook's
# working directory) and print a preview of the first training rows.
train_df, test_df = [
    pd.read_json(json_path)
    for json_path in ('../input/train.json', '../input/test.json')
]
print(train_df.head())

import numpy as np
import cv2
from itertools import combinations
from scipy.ndimage import laplace, sobel
from scipy.stats import kurtosis, skew

                                              band_1  \
0  [-27.878360999999998, -27.15416, -28.668615, -...   
1  [-12.242375, -14.920304999999999, -14.920363, ...   
2  [-24.603676, -24.603714, -24.871029, -23.15277...   
3  [-22.454607, -23.082819, -23.998013, -23.99805...   
4  [-26.006956, -23.164886, -23.164886, -26.89116...   

                                              band_2        id inc_angle  \
0  [-27.154118, -29.537888, -31.0306, -32.190483,...  dfd5f913   43.9239   
1  [-31.506321, -27.984554, -26.645678, -23.76760...  e25388fd   38.1562   
2  [-24.870956, -24.092632, -20.653963, -19.41104...  58b2aaa0   45.2859   
3  [-27.889421, -27.519794, -27.165262, -29.10350...  4cfc3a18   43.8306   
4  [-27.206915, -30.259186, -30.259186, -23.16495...  271f93f4   35.6256   

   is_iceberg  
0           0  
1           0  
2           1  
3           0  
4           0  


In [86]:
def get_img_feat(img_org):
    """Summarise an array with five scalar statistics.

    The input is flattened first, so its shape does not matter.

    Returns a list in this fixed order:
    [mean, standard deviation, median, maximum, minimum].
    """
    flat = img_org.ravel()
    # Order matters: downstream code concatenates these lists positionally.
    stat_fns = (np.mean, np.std, np.median, np.max, np.min)
    return [stat(flat) for stat in stat_fns]

    
def get_other_feat(df):
    """Build per-row band arrays and summary-statistic features from ``df``.

    For every row the function reads ``band_1``, ``band_2`` and ``inc_angle``:

    * both bands are converted to float32 arrays;
    * the angle is parsed as a float, with the literal string ``'na'``
      mapped to the sentinel ``-1``;
    * two derived arrays are formed: the half-sum and the half-difference of
      the two bands, each scaled by the angle.

    Returns a 5-tuple of lists, one entry per row:
    ``(band1, band2, band3, band4, angs)`` where ``angs`` holds
    ``[angle] + get_img_feat(...)`` for each of the four arrays, in that order.
    """
    band1, band2, band3, band4, angs = [], [], [], [], []
    columns = zip(df['band_1'], df['band_2'], df['inc_angle'])
    for raw_first, raw_second, raw_angle in columns:
        first = np.array(raw_first).astype('float32')
        second = np.array(raw_second).astype('float32')

        # Missing angles are encoded as the string 'na'; use -1 as a sentinel.
        ang = -1 if raw_angle == 'na' else float(raw_angle)

        scaled_sum = (first+second)*ang/2.0
        scaled_diff = (first-second)*ang/2.0

        per_row = (first, second, scaled_sum, scaled_diff)
        for bucket, arr in zip((band1, band2, band3, band4), per_row):
            bucket.append(arr)

        # Angle first, then the five statistics of each array back to back.
        row_feat = [ang]
        for arr in per_row:
            row_feat = row_feat + get_img_feat(arr)
        angs.append(row_feat)
    return band1,band2,band3,band4,angs
        
        

# Extract band arrays and statistic features: the `a_` names come from the
# training frame, the `b_` names from the test frame (train is processed first).
(
    (a_band1, a_band2, a_band3, a_band4, a_angs),
    (b_band1, b_band2, b_band3, b_band4, b_angs),
) = [get_other_feat(frame) for frame in (train_df, test_df)]
print('raw feats')

raw feats


In [87]:
from sklearn import decomposition
# One whitened 50-component PCA per band array, each with its own seed.
pca_b1, pca_b2, pca_b3, pca_b4 = [
    decomposition.PCA(n_components=50, whiten=True, random_state=seed)
    for seed in (15, 16, 17, 18)
]
_pcas = (pca_b1, pca_b2, pca_b3, pca_b4)

# Fit each PCA on the training arrays only ...
a_band1_feat, a_band2_feat, a_band3_feat, a_band4_feat = [
    pca.fit_transform(np.array(bands))
    for pca, bands in zip(_pcas, (a_band1, a_band2, a_band3, a_band4))
]

# ... then project the test arrays with the already-fitted models.
b_band1_feat, b_band2_feat, b_band3_feat, b_band4_feat = [
    pca.transform(np.array(bands))
    for pca, bands in zip(_pcas, (b_band1, b_band2, b_band3, b_band4))
]

print('pca done',a_band1_feat.shape)

pca done (1604, 50)


In [88]:
# Final design matrices: the four PCA projections side by side, followed by
# the angle/statistic columns. Train and test use the same column order.
train_blocks = [a_band1_feat, a_band2_feat, a_band3_feat, a_band4_feat, np.array(a_angs)]
test_blocks = [b_band1_feat, b_band2_feat, b_band3_feat, b_band4_feat, np.array(b_angs)]
train_feat = np.concatenate(train_blocks, axis=1)
test_feat = np.concatenate(test_blocks, axis=1)

print(train_feat.shape,test_feat.shape)

(1604, 221) (8424, 221)


In [89]:
# Target vector taken from the training frame; print the first few labels.
y = train_df['is_iceberg'].values
print(y[0:5])

[0 0 1 0 0]


In [90]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
def cv_feat(model_f,fold_cnt=3,rnd=1,params=None):
    """Cross-validate one classifier on the module-level feature matrices.

    Reads the globals ``train_feat``, ``test_feat`` and ``y``. For every fold
    a fresh estimator is built with ``model_f(**params)``, fitted on the
    training part, and used to predict the held-out part and the test set.
    Per-fold and averaged log-losses are printed.

    Parameters
    ----------
    model_f : callable
        Estimator factory; the object it returns must provide ``fit`` and
        ``predict_proba``.
    fold_cnt : int
        Number of KFold splits.
    rnd : int
        Seed knob; the splitter uses ``random_state = 2 * rnd``.
    params : dict or None
        Keyword arguments for ``model_f``. ``None`` (the default) means no
        extra arguments.

    Returns
    -------
    train_pred : ndarray, shape (n_train, 1)
        Out-of-fold values of column 1 of ``predict_proba``.
    test_pred : ndarray, shape (n_test, 1)
        Column 1 of ``predict_proba`` on ``test_feat``, averaged over folds.
    """
    # Use a fresh dict per call instead of a shared mutable default argument.
    if params is None:
        params = {}
    # Size the output buffers from the data rather than hard-coding the row
    # counts, so the function keeps working if the feature matrices change.
    n_train, n_test = train_feat.shape[0], test_feat.shape[0]
    train_pred, test_pred = np.zeros((n_train,1)),np.zeros((n_test,1))
    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)
    avg_train_l,avg_val_l = 0,0
    # Echo the fully resolved estimator configuration once, for the log.
    print(model_f(**params))
    for train_index, test_index in kf.split(train_feat):
        curr_x,curr_y = train_feat[train_index],y[train_index]
        val_x,val_y = train_feat[test_index],y[test_index]

        model = model_f(**params)
        model.fit(curr_x,curr_y)

        curr_train_pred = model.predict_proba(curr_x)
        curr_val_pred = model.predict_proba(val_x)
        # Each training row is held out exactly once, so this fills every row.
        train_pred[test_index] = curr_val_pred[:,1].reshape(-1,1)
        # Pre-divide by the fold count so the running sum ends up as the mean.
        curr_test_pred = model.predict_proba(test_feat)/fold_cnt
        test_pred = test_pred + curr_test_pred[:,1].reshape(-1,1)

        loss1 = log_loss(curr_y,curr_train_pred)
        loss2 = log_loss(val_y,curr_val_pred)
        avg_train_l += loss1/fold_cnt
        avg_val_l += loss2/fold_cnt
        print('this fold train loss',loss1,'val loss',loss2)
        print('============================')
    print('all avg',avg_train_l,avg_val_l)
    return train_pred,test_pred
print('def done')

def done


In [100]:
# lr
from sklearn.linear_model import LogisticRegression
import pickle

# 5-fold cross-validated logistic regression; the out-of-fold train
# predictions and the fold-averaged test predictions are pickled together.
lr_train, lr_pred = cv_feat(
    LogisticRegression,
    fold_cnt=5,
    rnd=2,
    params=dict(C=2.0, max_iter=100),
)
with open('../features/other_model_lr5.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
this fold train loss 0.229916396005 val loss 0.448423534124
this fold train loss 0.227237337199 val loss 0.478266737288
this fold train loss 0.223011052401 val loss 0.494714550796
this fold train loss 0.23191737697 val loss 0.421971990877
this fold train loss 0.237887719148 val loss 0.414473665352
all avg 0.229993976345 0.451570095687


In [107]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
# Random forest, 5-fold CV with the default split seed.
# NOTE(review): the results are stored under the `lr_*` names used by the
# logistic-regression cell even though they hold random-forest predictions.
tmp_params = dict(n_estimators=25, max_depth=8, random_state=42)
lr_train, lr_pred = cv_feat(
    RandomForestClassifier,
    fold_cnt=5,
    params=tmp_params,
)
with open('../features/other_model_rf5.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
this fold train loss 0.236693528102 val loss 0.410720955754
this fold train loss 0.24545326116 val loss 0.378380299809
this fold train loss 0.25528310747 val loss 0.382674611316
this fold train loss 0.232883270576 val loss 0.372400844952
this fold train loss 0.241476683899 val loss 0.380748696052
all avg 0.242357970242 0.384985081576


In [93]:
# tmp_params = {
#     'n_estimators':20,
#     'learning_rate':0.01,
#     'random_state':42
# }
# lr_train,lr_pred = cv_feat(AdaBoostClassifier,fold_cnt=5,params=tmp_params)
# with open('../features/other_model_ada4.pkl','wb') as fout:
#     pickle.dump([lr_train,lr_pred],fout)

In [110]:
# Gradient boosting (sklearn), 5-fold CV.
# NOTE(review): the results are stored under the `lr_*` names used by the
# logistic-regression cell even though they hold gradient-boosting predictions.
tmp_params = dict(
    n_estimators=230,
    learning_rate=0.1,
    random_state=42,
    subsample=1.0,
    min_samples_leaf=1,
    max_depth=3,
)
lr_train, lr_pred = cv_feat(
    GradientBoostingClassifier,
    fold_cnt=5,
    rnd=1,
    params=tmp_params,
)
with open('../features/other_model_gbrt5.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=230,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)
this fold train loss 0.0317809830181 val loss 0.268880044719
this fold train loss 0.0313338029998 val loss 0.236646141961
this fold train loss 0.0323396521184 val loss 0.26348425245
this fold train loss 0.03095675678 val loss 0.292284447111
this fold train loss 0.0312389058258 val loss 0.263077176563
all avg 0.0315300201484 0.264874412561


In [95]:
from xgboost import XGBClassifier
# XGBoost, 5-fold CV.
# NOTE(review): the results are stored under the `lr_*` names used by the
# logistic-regression cell even though they hold XGBoost predictions.
tmp_params = dict(
    n_estimators=50,
    colsample_bytree=1,
    min_child_weight=1,
    learning_rate=0.1,
)
lr_train, lr_pred = cv_feat(
    XGBClassifier,
    fold_cnt=5,
    rnd=1,
    params=tmp_params,
)
with open('../features/other_model_xgb5.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
this fold train loss 0.204493118465 val loss 0.324220771379
this fold train loss 0.210131122239 val loss 0.302956643793
this fold train loss 0.217470997353 val loss 0.298650399341
this fold train loss 0.201142933805 val loss 0.333196300262
this fold train loss 0.210257128038 val loss 0.321714240327
all avg 0.20869905998 0.31614767102


In [96]:
from lightgbm import LGBMClassifier
# LightGBM, 5-fold CV with the default split seed.
# NOTE(review): the results are stored under the `lr_*` names used by the
# logistic-regression cell even though they hold LightGBM predictions.
tmp_params = dict(
    max_depth=3,
    n_estimators=500,
    # Options left disabled by the author:
    #'min_child_samples':20,
    #'reg_lambda':0.1,
)
lr_train, lr_pred = cv_feat(
    LGBMClassifier,
    fold_cnt=5,
    params=tmp_params,
)
with open('../features/other_model_lgb5.pkl', 'wb') as fout:
    pickle.dump([lr_train, lr_pred], fout)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=3, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=500,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.00125056944852 val loss 0.338489119731


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.00140465979348 val loss 0.283837113901


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0014321677666 val loss 0.283778423407


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.0013837236445 val loss 0.342397624023


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


this fold train loss 0.00115615611576 val loss 0.323817773956
all avg 0.00132545535377 0.314464011004
