In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from keras.utils.np_utils import to_categorical
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, Flatten, Lambda, Activation
from keras.layers import Conv2D, MaxPooling2D, ZeroPadding2D, GlobalAveragePooling2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

# Load the train and test sets from the JSON files under ../input.
# Of these two frames, only test_df['id'] is read by the visible cells
# (when the submission is built); the image arrays used for training come
# from a separate pickle.
train_df = pd.read_json('../input/train.json')
test_df = pd.read_json('../input/test.json')

In [7]:
import pickle
# Load the preprocessed arrays (train images, labels, test images).
# NOTE(review): 'new_preprocess_x_y.pkl' is not created in this notebook;
# presumably a separate preprocessing step writes it -- confirm, since a
# fresh "Restart & Run All" depends on that file being present.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself.
with open('new_preprocess_x_y.pkl','rb') as fin:
    train_x,y,test_x = pickle.load(fin)

In [8]:
from keras.callbacks import ModelCheckpoint,LearningRateScheduler
def create_model():
    '''Assemble and return the uncompiled Keras CNN trained on each fold.

    The network takes 75x75x3 inputs, stacks 3x3 ReLU convolutions with
    2x2 max-pooling, and finishes with Flatten -> Dense(256) -> Dropout(0.5)
    -> a single sigmoid unit. Because of that dense head it is a regular
    CNN classifier rather than a fully convolutional network.

    Returns:
        A keras ``Sequential`` model; the caller is responsible for
        compiling it.
    '''
    def relu_conv(filters, **extra):
        # Every convolution in this network is 3x3 with a ReLU activation.
        return Conv2D(filters, (3, 3), activation='relu', **extra)

    def halve():
        # 2x2 max-pooling with stride 2.
        return MaxPooling2D((2, 2), strides=(2, 2))

    net = Sequential()
    layer_stack = [
        # Stem: one convolution on the raw input, then pool.
        relu_conv(32, input_shape=(75, 75, 3)),
        halve(),

        # Two 32-filter convolutions behind a 1-pixel zero border.
        ZeroPadding2D((1, 1)),
        relu_conv(32, strides=1),
        relu_conv(32, strides=1),
        halve(),

        # Two 64-filter convolutions behind a 1-pixel zero border.
        ZeroPadding2D((1, 1)),
        relu_conv(64, strides=1),
        relu_conv(64, strides=1),
        halve(),

        # Final unpadded 64-filter convolution.
        relu_conv(64, strides=1),
        halve(),

        # Classification head.
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    for layer in layer_stack:
        net.add(layer)
    return net
# Emit a marker line in the cell output (appears to signal that the model
# definition cell ran -- it does not build or summarize a model).
print('model model')


model model


In [10]:
from sklearn.model_selection import KFold
from keras.preprocessing.image import ImageDataGenerator

def lr_f(epoch):
    '''Step learning-rate schedule keyed on the epoch index.

    Returns 0.001 for epochs below 10, 0.0005 for epochs below 30, and
    0.00005 for every epoch from 30 onwards.
    '''
    # (exclusive upper epoch bound, learning rate), checked in order.
    steps = ((10, 0.001), (30, 0.0005))
    for upper_bound, rate in steps:
        if epoch < upper_bound:
            return rate
    return 0.00005

def kfold_train(fold_cnt=3,rnd=233):
    '''Train the CNN with K-fold cross-validation and collect predictions.

    Reads the notebook globals ``train_x``, ``y`` and ``test_x``. For every
    fold a fresh model from ``create_model()`` is trained for 50 epochs on
    augmented batches; the checkpoint with the lowest validation loss is then
    reloaded and used to predict the held-out fold and the whole test set.

    Side effects:
        Writes ``best_m.h5`` in the working directory; each fold overwrites
        the file left by the previous one.

    Args:
        fold_cnt: number of cross-validation folds.
        rnd: seed base; the shuffled KFold uses ``random_state=2*rnd``.

    Returns:
        (train_pred, test_pred): out-of-fold predictions with shape
        ``(len(train_x), 1)`` and test predictions averaged over the folds
        with shape ``(len(test_x), 1)``.
    '''
    # Size the prediction buffers from the data rather than hard-coding the
    # row counts, so the function keeps working when the dataset changes.
    train_pred = np.zeros((len(train_x),1))
    test_pred = np.zeros((len(test_x),1))
    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)

    # Loop-invariant settings.
    bat_size = 32
    model_p = 'best_m.h5'

    for train_index, test_index in kf.split(train_x):
        curr_x,curr_y = train_x[train_index],y[train_index]
        val_x,val_y = train_x[test_index],y[test_index]

        # Augmentation is applied to the training batches only; validation
        # and test images are fed unmodified.
        # NOTE(review): no seed is passed to the generator or to the model,
        # so only the fold split is reproducible between runs.
        datagen = ImageDataGenerator(
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            horizontal_flip=True,
            vertical_flip=True
        )

        # Floor division: a trailing partial batch does not count as a step.
        steps_train = len(curr_y)//bat_size

        model = create_model()
        # The LearningRateScheduler below sets the rate from lr_f at the
        # start of every epoch, so the 0.0005 given to Adam is overridden.
        model.compile(loss='binary_crossentropy', optimizer=Adam(0.0005), metrics=['accuracy'])
        # A new ModelCheckpoint per fold restarts the "best so far" tracking.
        model_chk = ModelCheckpoint(filepath=model_p, monitor='val_loss', save_best_only=True, verbose=1)
        lr_s = LearningRateScheduler(lr_f)
        model.fit_generator(datagen.flow(curr_x, curr_y, batch_size=bat_size),
                  validation_data=(val_x,val_y),
                  steps_per_epoch = steps_train,
                  epochs=50,
                  verbose=2,
                  callbacks=[model_chk,lr_s]
                 )

        # Predict with the best checkpoint, not the last-epoch weights.
        # NOTE(review): the checkpoint is selected on the same held-out fold
        # it then predicts, so the out-of-fold loss is optimistically biased.
        model = load_model(model_p)
        train_pred[test_index] = model.predict(val_x)
        # Accumulate the fold average of the test predictions.
        test_pred = test_pred + model.predict(test_x)/fold_cnt
        print('============================')
    return train_pred,test_pred

# Run 4-fold cross-validation (overriding the default of 3 folds) and keep
# the out-of-fold and fold-averaged test predictions as notebook globals.
train_pred,test_pred = kfold_train(fold_cnt=4)

Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.64032, saving model to best_m.h5
 - 2s - loss: 0.6674 - acc: 0.5841 - val_loss: 0.6403 - val_acc: 0.6160
Epoch 2/50
Epoch 00002: val_loss improved from 0.64032 to 0.58922, saving model to best_m.h5
 - 1s - loss: 0.6311 - acc: 0.6038 - val_loss: 0.5892 - val_acc: 0.6234
Epoch 3/50
Epoch 00003: val_loss improved from 0.58922 to 0.54435, saving model to best_m.h5
 - 1s - loss: 0.5606 - acc: 0.6868 - val_loss: 0.5443 - val_acc: 0.7207
Epoch 4/50
Epoch 00004: val_loss improved from 0.54435 to 0.52949, saving model to best_m.h5
 - 1s - loss: 0.5319 - acc: 0.7299 - val_loss: 0.5295 - val_acc: 0.7132
Epoch 5/50
Epoch 00005: val_loss improved from 0.52949 to 0.40854, saving model to best_m.h5
 - 1s - loss: 0.4552 - acc: 0.7874 - val_loss: 0.4085 - val_acc: 0.7980
Epoch 6/50
Epoch 00006: val_loss improved from 0.40854 to 0.39018, saving model to best_m.h5
 - 1s - loss: 0.3891 - acc: 0.8336 - val_loss: 0.3902 - val_acc: 0.8354
Epoch 7/50
Epo

Epoch 10/50
Epoch 00010: val_loss did not improve
 - 1s - loss: 0.3966 - acc: 0.8133 - val_loss: 0.3198 - val_acc: 0.8529
Epoch 11/50
Epoch 00011: val_loss improved from 0.29485 to 0.28277, saving model to best_m.h5
 - 1s - loss: 0.3161 - acc: 0.8527 - val_loss: 0.2828 - val_acc: 0.8903
Epoch 12/50
Epoch 00012: val_loss improved from 0.28277 to 0.27351, saving model to best_m.h5
 - 1s - loss: 0.3277 - acc: 0.8448 - val_loss: 0.2735 - val_acc: 0.8878
Epoch 13/50
Epoch 00013: val_loss did not improve
 - 1s - loss: 0.2857 - acc: 0.8708 - val_loss: 0.3071 - val_acc: 0.8878
Epoch 14/50
Epoch 00014: val_loss did not improve
 - 1s - loss: 0.3141 - acc: 0.8597 - val_loss: 0.2772 - val_acc: 0.8554
Epoch 15/50
Epoch 00015: val_loss improved from 0.27351 to 0.25352, saving model to best_m.h5
 - 1s - loss: 0.2818 - acc: 0.8831 - val_loss: 0.2535 - val_acc: 0.8903
Epoch 16/50
Epoch 00016: val_loss did not improve
 - 1s - loss: 0.2893 - acc: 0.8682 - val_loss: 0.3507 - val_acc: 0.8753
Epoch 17/50
Ep

Epoch 22/50
Epoch 00022: val_loss did not improve
 - 1s - loss: 0.2661 - acc: 0.8860 - val_loss: 0.2778 - val_acc: 0.8653
Epoch 23/50
Epoch 00023: val_loss did not improve
 - 1s - loss: 0.2715 - acc: 0.8736 - val_loss: 0.2668 - val_acc: 0.8778
Epoch 24/50
Epoch 00024: val_loss did not improve
 - 1s - loss: 0.3051 - acc: 0.8767 - val_loss: 0.2622 - val_acc: 0.8828
Epoch 25/50
Epoch 00025: val_loss improved from 0.24822 to 0.24296, saving model to best_m.h5
 - 1s - loss: 0.2786 - acc: 0.8758 - val_loss: 0.2430 - val_acc: 0.8903
Epoch 26/50
Epoch 00026: val_loss did not improve
 - 1s - loss: 0.2452 - acc: 0.8859 - val_loss: 0.2452 - val_acc: 0.8928
Epoch 27/50
Epoch 00027: val_loss did not improve
 - 1s - loss: 0.2308 - acc: 0.9006 - val_loss: 0.2825 - val_acc: 0.8803
Epoch 28/50
Epoch 00028: val_loss did not improve
 - 1s - loss: 0.2438 - acc: 0.8921 - val_loss: 0.2513 - val_acc: 0.8828
Epoch 29/50
Epoch 00029: val_loss did not improve
 - 1s - loss: 0.2658 - acc: 0.8947 - val_loss: 0.374

Epoch 33/50
Epoch 00033: val_loss did not improve
 - 1s - loss: 0.2421 - acc: 0.9006 - val_loss: 0.3178 - val_acc: 0.8778
Epoch 34/50
Epoch 00034: val_loss improved from 0.30895 to 0.30555, saving model to best_m.h5
 - 1s - loss: 0.1870 - acc: 0.9211 - val_loss: 0.3056 - val_acc: 0.8803
Epoch 35/50
Epoch 00035: val_loss did not improve
 - 1s - loss: 0.2225 - acc: 0.9090 - val_loss: 0.3179 - val_acc: 0.8803
Epoch 36/50
Epoch 00036: val_loss did not improve
 - 1s - loss: 0.1997 - acc: 0.9217 - val_loss: 0.3445 - val_acc: 0.8753
Epoch 37/50
Epoch 00037: val_loss did not improve
 - 1s - loss: 0.2089 - acc: 0.9093 - val_loss: 0.3112 - val_acc: 0.8803
Epoch 38/50
Epoch 00038: val_loss did not improve
 - 1s - loss: 0.1918 - acc: 0.9186 - val_loss: 0.3208 - val_acc: 0.8853
Epoch 39/50
Epoch 00039: val_loss did not improve
 - 1s - loss: 0.2067 - acc: 0.9150 - val_loss: 0.3272 - val_acc: 0.8853
Epoch 40/50
Epoch 00040: val_loss did not improve
 - 1s - loss: 0.1971 - acc: 0.9214 - val_loss: 0.319

In [11]:
import os
import pickle
from sklearn.metrics import log_loss

# Output locations for the saved prediction features and the submission.
FEAT_PATH = '../features/cnn_2_aug1_feat_new_preprocess.pkl'
SUB_PATH = '../results/cnn_2_aug1_sub_new_preprocess.csv'

# Create the output directories up front so a missing folder cannot raise
# after the expensive training run has already finished.
for out_path in (FEAT_PATH, SUB_PATH):
    out_dir = os.path.dirname(out_path)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

# Persist the out-of-fold and test predictions for later reuse.
# (Original author's note on this feature set: "no help".)
with open(FEAT_PATH,'wb') as fout:
    pickle.dump([train_pred,test_pred],fout)

# Log loss of the out-of-fold predictions against the training labels.
print(log_loss(y,train_pred))

# test_pred has shape (N, 1); flatten it explicitly so the column receives a
# 1-D array instead of relying on pandas accepting a single-column 2-D array.
submission = pd.DataFrame()
submission['id']=test_df['id']
submission['is_iceberg']=np.ravel(test_pred)
print(submission.head())
submission.to_csv(SUB_PATH, index=False)

0.256762398449
         id    is_iceberg
0  5941774d  1.517783e-02
1  4023181e  3.707070e-01
2  b20200e4  2.806452e-08
3  e7f018bb  9.553727e-01
4  4371c8c3  3.394186e-02
