In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from keras.utils.np_utils import to_categorical
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, Flatten, Lambda, Activation
from keras.layers import Conv2D, MaxPooling2D, ZeroPadding2D, GlobalAveragePooling2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

# Read the train and test JSON files into DataFrames (train first, then test).
train_df, test_df = [
    pd.read_json(json_path)
    for json_path in ('../input/train.json', '../input/test.json')
]

Using TensorFlow backend.


In [2]:
def std_img(x):
    '''Standardise the first three channels of an image, in place.

    Each channel x[:, :, c] (c = 0, 1, 2) has its own mean subtracted and is
    then divided by its own standard deviation plus 1e-7 (the epsilon keeps
    the division finite for a constant channel).

    The array passed in is modified and the same object is returned. The
    updates are done in place, so x needs a floating-point dtype.
    '''
    for channel in (0, 1, 2):
        # Basic slicing yields a view, so in-place ops on it write into x.
        plane = x[:, :, channel]
        plane -= np.mean(plane.flatten())
        plane /= np.std(plane.flatten()) + 1e-7
    return x

def get_image(df):
    '''Turn the band columns of df into a stack of 3-channel 75x75 images.

    For every row, 'band_1' and 'band_2' are each reshaped to 75x75 and a
    third channel is formed as their sum. The three channels are stacked
    along the last axis and passed through std_img before being collected.

    Returns a numpy array built from the per-row images, in row order.
    '''
    def _row_to_image(flat_1, flat_2):
        first = np.array(flat_1).reshape(75, 75)
        second = np.array(flat_2).reshape(75, 75)
        # Sum of the bands: log(x*y) = log(x) + log(y)
        combined = first + second
        return std_img(np.dstack((first, second, combined)))

    return np.array([
        _row_to_image(flat_1, flat_2)
        for flat_1, flat_2 in zip(df['band_1'], df['band_2'])
    ])


# Convert both DataFrames to image tensors (train first, then test).
train_x, test_x = get_image(train_df), get_image(test_df)

# Sanity check on the resulting array shapes.
print(train_x.shape,test_x.shape)

(1604, 75, 75, 3) (8424, 75, 75, 3)


In [3]:
# Training labels taken from the 'is_iceberg' column; peek at the first five.
y = train_df['is_iceberg'].values
print(y[:5])

[0 0 1 0 0]


In [4]:
from keras.callbacks import ModelCheckpoint,LearningRateScheduler
def ConvBlock(model, layers, filters):
    '''Append a convolutional block to model.

    The block is [layers] repetitions of: 1-pixel zero padding, a 3x3
    convolution with [filters] filters and ReLU activation, and batch
    normalization over axis 3. A single 2x2 max pooling with stride 2
    closes the block. The model is modified in place; nothing is returned.
    '''
    for _ in range(layers):
        stage = (
            ZeroPadding2D((1, 1)),
            Conv2D(filters, (3, 3), activation='relu'),
            BatchNormalization(axis=3),
        )
        for layer in stage:
            model.add(layer)
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

def create_model():
    '''Build and return the (uncompiled) keras Sequential classifier.

    Four single-layer ConvBlocks with 32, 64, 128 and 128 filters are applied
    to a 75x75x3 input, followed by Flatten and one sigmoid output unit.
    '''
    net = Sequential()

    # Identity layer whose only job is to declare the 75x75x3 input shape.
    net.add(Lambda(lambda x: x, input_shape=(75, 75, 3)))

    # Spatial size after each block's pooling: 37x37 -> 18x18 -> 9x9 -> 4x4.
    for n_filters in (32, 64, 128, 128):
        ConvBlock(net, 1, n_filters)

    # 4x4x128 feature map -> flat vector -> single sigmoid output
    net.add(Flatten())
    net.add(Dense(1,activation='sigmoid'))

    return net
# Marker output showing that the model-definition cell has been executed.
print('model model')


model model


In [10]:
from sklearn.model_selection import KFold
from keras.preprocessing.image import ImageDataGenerator

def lr_f(epoch):
    '''Step learning-rate schedule keyed on the epoch index.

    Returns 0.001 for epoch < 10, 0.0005 for epoch < 30 and 0.0001 otherwise.
    '''
    # (exclusive upper epoch bound, learning rate), checked in order
    steps = ((10, 0.001), (30, 0.0005))
    for upper_bound, rate in steps:
        if epoch < upper_bound:
            return rate
    return 0.0001

def kfold_train(fold_cnt=3,rnd=42):
    '''Train one CNN per K-fold split and collect its predictions.

    Reads the module-level arrays train_x, y and test_x. For every fold a
    fresh model from create_model() is trained for 50 epochs on augmented
    batches of the training part, with two checkpoints kept on disk: the
    epoch with the lowest val_loss ('best_m.h5') and the epoch with the
    highest val_acc ('best_acc_m.h5'). Both files are overwritten by each
    new fold.

    Parameters
    ----------
    fold_cnt : int
        Number of folds passed to KFold.
    rnd : int
        Seed base; KFold is shuffled with random_state=2*rnd. The
        augmentation stream and the weight initialisation are not seeded here.

    Returns
    -------
    train_pred : ndarray, shape (len(train_x), 1)
        Out-of-fold predictions of the best-val_loss checkpoints.
    test_pred : ndarray, shape (len(test_x), 1)
        Test predictions of the best-val_loss checkpoints, averaged over folds.
    best_acc_pred : ndarray, shape (len(test_x), 1)
        Test predictions of the best-val_acc checkpoints, averaged over folds.

    NOTE(review): each checkpoint is selected on the same held-out fold that
    is then predicted for train_pred, so the out-of-fold predictions are
    somewhat optimistic.
    '''
    # Size the buffers from the data instead of hard-coding 1604 / 8424 rows,
    # so the function keeps working when the dataset size changes.
    train_pred = np.zeros((len(train_x), 1))
    test_pred = np.zeros((len(test_x), 1))
    best_acc_pred = np.zeros((len(test_x), 1))

    bat_size = 16
    model_p = 'best_m.h5'
    model_p2 = 'best_acc_m.h5'

    # The augmenter configuration is identical for all folds; build it once.
    datagen = ImageDataGenerator(
        width_shift_range=0.05,
        height_shift_range=0.05,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        vertical_flip=True
    )

    kf = KFold(n_splits=fold_cnt, shuffle=True, random_state=2*rnd)
    for train_index, val_index in kf.split(train_x):
        curr_x, curr_y = train_x[train_index], y[train_index]
        val_x, val_y = train_x[val_index], y[val_index]
        steps_train = len(curr_y)//bat_size

        model = create_model()
        model.compile(loss='binary_crossentropy', optimizer=Adam(0.0005), metrics=['accuracy'])

        # New callbacks per fold so the "best so far" tracking starts over.
        model_chk = ModelCheckpoint(filepath=model_p, monitor='val_loss', save_best_only=True, verbose=1)
        model_chk_2 = ModelCheckpoint(filepath=model_p2, monitor='val_acc', save_best_only=True, verbose=1)
        lr_s = LearningRateScheduler(lr_f)
        model.fit_generator(datagen.flow(curr_x, curr_y, batch_size=bat_size),
                  validation_data=(val_x,val_y),
                  steps_per_epoch = steps_train,
                  epochs=50,
                  verbose=2,
                  callbacks=[model_chk,model_chk_2,lr_s]
                 )

        # Best-val_loss checkpoint: out-of-fold + test predictions.
        model = load_model(model_p)
        train_pred[val_index] = model.predict(val_x)
        test_pred = test_pred + model.predict(test_x)/fold_cnt

        # Best-val_acc checkpoint: test predictions only.
        model = load_model(model_p2)
        best_acc_pred = best_acc_pred + model.predict(test_x)/fold_cnt
        print('============================')
    return train_pred,test_pred,best_acc_pred

# Run 4-fold cross-validation (overrides the function's default of 3 folds).
train_pred, test_pred, best_acc_pred = kfold_train(fold_cnt=4)

Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.46249, saving model to best_m.h5
Epoch 00001: val_acc improved from -inf to 0.83042, saving model to best_acc_m.h5
 - 4s - loss: 0.5897 - acc: 0.7220 - val_loss: 0.4625 - val_acc: 0.8304
Epoch 2/50
Epoch 00002: val_loss did not improve
Epoch 00002: val_acc did not improve
 - 2s - loss: 0.4120 - acc: 0.8072 - val_loss: 0.6816 - val_acc: 0.6035
Epoch 3/50
Epoch 00003: val_loss did not improve
Epoch 00003: val_acc did not improve
 - 2s - loss: 0.4033 - acc: 0.8039 - val_loss: 0.7368 - val_acc: 0.6633
Epoch 4/50
Epoch 00004: val_loss improved from 0.46249 to 0.40211, saving model to best_m.h5
Epoch 00004: val_acc did not improve
 - 2s - loss: 0.4098 - acc: 0.8206 - val_loss: 0.4021 - val_acc: 0.8155
Epoch 5/50
Epoch 00005: val_loss did not improve
Epoch 00005: val_acc did not improve
 - 2s - loss: 0.3523 - acc: 0.8433 - val_loss: 0.4736 - val_acc: 0.8180
Epoch 6/50
Epoch 00006: val_loss improved from 0.40211 to 0.23926, saving model t

Epoch 00048: val_loss did not improve
Epoch 00048: val_acc did not improve
 - 2s - loss: 0.1706 - acc: 0.9275 - val_loss: 0.1845 - val_acc: 0.9177
Epoch 49/50
Epoch 00049: val_loss did not improve
Epoch 00049: val_acc did not improve
 - 2s - loss: 0.1733 - acc: 0.9333 - val_loss: 0.1807 - val_acc: 0.9277
Epoch 50/50
Epoch 00050: val_loss did not improve
Epoch 00050: val_acc did not improve
 - 2s - loss: 0.1616 - acc: 0.9325 - val_loss: 0.1831 - val_acc: 0.9277
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.68997, saving model to best_m.h5
Epoch 00001: val_acc improved from -inf to 0.65337, saving model to best_acc_m.h5
 - 4s - loss: 0.6189 - acc: 0.7181 - val_loss: 0.6900 - val_acc: 0.6534
Epoch 2/50
Epoch 00002: val_loss improved from 0.68997 to 0.60734, saving model to best_m.h5
Epoch 00002: val_acc improved from 0.65337 to 0.75561, saving model to best_acc_m.h5
 - 3s - loss: 0.4092 - acc: 0.8200 - val_loss: 0.6073 - val_acc: 0.7556
Epoch 3/50
Epoch 00003: val_loss did not i

Epoch 43/50
Epoch 00043: val_loss did not improve
Epoch 00043: val_acc did not improve
 - 2s - loss: 0.1791 - acc: 0.9250 - val_loss: 0.2123 - val_acc: 0.9052
Epoch 44/50
Epoch 00044: val_loss did not improve
Epoch 00044: val_acc did not improve
 - 2s - loss: 0.1415 - acc: 0.9381 - val_loss: 0.2252 - val_acc: 0.9077
Epoch 45/50
Epoch 00045: val_loss did not improve
Epoch 00045: val_acc did not improve
 - 2s - loss: 0.1513 - acc: 0.9356 - val_loss: 0.2412 - val_acc: 0.8803
Epoch 46/50
Epoch 00046: val_loss did not improve
Epoch 00046: val_acc did not improve
 - 2s - loss: 0.1619 - acc: 0.9339 - val_loss: 0.2752 - val_acc: 0.8803
Epoch 47/50
Epoch 00047: val_loss did not improve
Epoch 00047: val_acc did not improve
 - 2s - loss: 0.1679 - acc: 0.9375 - val_loss: 0.2178 - val_acc: 0.9127
Epoch 48/50
Epoch 00048: val_loss did not improve
Epoch 00048: val_acc did not improve
 - 2s - loss: 0.1602 - acc: 0.9333 - val_loss: 0.2336 - val_acc: 0.8853
Epoch 49/50
Epoch 00049: val_loss did not impr

Epoch 41/50
Epoch 00041: val_loss did not improve
Epoch 00041: val_acc did not improve
 - 2s - loss: 0.1913 - acc: 0.9195 - val_loss: 0.2217 - val_acc: 0.9127
Epoch 42/50
Epoch 00042: val_loss did not improve
Epoch 00042: val_acc did not improve
 - 2s - loss: 0.1736 - acc: 0.9283 - val_loss: 0.2101 - val_acc: 0.9102
Epoch 43/50
Epoch 00043: val_loss did not improve
Epoch 00043: val_acc did not improve
 - 2s - loss: 0.1776 - acc: 0.9250 - val_loss: 0.2170 - val_acc: 0.9152
Epoch 44/50
Epoch 00044: val_loss did not improve
Epoch 00044: val_acc did not improve
 - 2s - loss: 0.1654 - acc: 0.9300 - val_loss: 0.2163 - val_acc: 0.9102
Epoch 45/50
Epoch 00045: val_loss did not improve
Epoch 00045: val_acc did not improve
 - 2s - loss: 0.1536 - acc: 0.9364 - val_loss: 0.2103 - val_acc: 0.9177
Epoch 46/50
Epoch 00046: val_loss did not improve
Epoch 00046: val_acc did not improve
 - 3s - loss: 0.1766 - acc: 0.9217 - val_loss: 0.2279 - val_acc: 0.9027
Epoch 47/50
Epoch 00047: val_loss did not impr

Epoch 39/50
Epoch 00039: val_loss did not improve
Epoch 00039: val_acc did not improve
 - 2s - loss: 0.1367 - acc: 0.9467 - val_loss: 0.3067 - val_acc: 0.8828
Epoch 40/50
Epoch 00040: val_loss did not improve
Epoch 00040: val_acc did not improve
 - 2s - loss: 0.1658 - acc: 0.9292 - val_loss: 0.2886 - val_acc: 0.8878
Epoch 41/50
Epoch 00041: val_loss did not improve
Epoch 00041: val_acc did not improve
 - 2s - loss: 0.1489 - acc: 0.9300 - val_loss: 0.2874 - val_acc: 0.8803
Epoch 42/50
Epoch 00042: val_loss did not improve
Epoch 00042: val_acc did not improve
 - 2s - loss: 0.1384 - acc: 0.9458 - val_loss: 0.2871 - val_acc: 0.8853
Epoch 43/50
Epoch 00043: val_loss did not improve
Epoch 00043: val_acc did not improve
 - 2s - loss: 0.1490 - acc: 0.9375 - val_loss: 0.2844 - val_acc: 0.8828
Epoch 44/50
Epoch 00044: val_loss did not improve
Epoch 00044: val_acc did not improve
 - 2s - loss: 0.1432 - acc: 0.9383 - val_loss: 0.3128 - val_acc: 0.8753
Epoch 45/50
Epoch 00045: val_loss did not impr

In [12]:
import os
import pickle
from sklearn.metrics import log_loss

# Make sure the output folders exist; open() and to_csv() do not create
# missing parent directories and would fail on a fresh checkout.
for out_dir in ('../features', '../results'):
    os.makedirs(out_dir, exist_ok=True)

# Persist the out-of-fold and test predictions of the best-val_loss models.
with open('../features/cnn_4_aug1_feat.pkl','wb') as fout:
    pickle.dump([train_pred,test_pred],fout)

# train feat loss
print(log_loss(y,train_pred))

# One submission per checkpoint type: best val_loss first, then best val_acc.
submission_jobs = (
    (test_pred, '../results/cnn_4_aug1_sub.csv'),
    (best_acc_pred, '../results/cnn_4_aug1_best_acc_sub.csv'),
)
for job_idx, (pred, csv_path) in enumerate(submission_jobs):
    if job_idx:
        print('---------------')
    submission = pd.DataFrame()
    submission['id']=test_df['id']
    # Predictions have shape (n, 1); flatten to a plain 1-D column.
    submission['is_iceberg']=np.ravel(pred)
    print(submission.head())
    submission.to_csv(csv_path, index=False)

0.209987771801
         id  is_iceberg
0  5941774d    0.009021
1  4023181e    0.965640
2  b20200e4    0.238042
3  e7f018bb    0.994940
4  4371c8c3    0.494432
---------------
         id  is_iceberg
0  5941774d    0.066792
1  4023181e    0.937345
2  b20200e4    0.238906
3  e7f018bb    0.992169
4  4371c8c3    0.356168
