### Intel MobileODT Kaggle competition

https://www.kaggle.com/c/intel-mobileodt-cervical-cancer-screening


In [1]:
import os, sys
from __future__ import print_function, division
from importlib import reload 
import utils_p3; reload(utils_p3)
from utils_p3 import *
%matplotlib inline
from IPython.display import FileLink
import tensorflow as tf
import six
import numpy as np
import pandas as pd
import cv2
import glob
import random
from PIL import ImageFile
from keras import applications
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
#print("TensorFlow version: %s" % tf.__version__)
print("Theano version: %s" % theano.__version__)
print("Keras version: %s" % keras.__version__)

Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 1080 Ti (0000:01:00.0)
Using Theano backend.


Theano version: 0.9.0
Keras version: 2.0.3


### Global declarations

In [2]:
HOME_DIR = os.getcwd()
#path = "data/imgs/"
path = "data/testing/"
#path = "data/testing/sample/"
batch_size = 1 # for pre-computation of last conv layer's output and the 5 augmented copies of training (OOM issue)
#batch_size = 32 # for FCL and submission computation
ImageFile.LOAD_TRUNCATED_IMAGES = True
keras.backend.image_data_format() #verify image_data_format for theano vs TF

'channels_first'

### Setup batches

In [3]:
batches = get_batches(path+'train', batch_size=batch_size, shuffle=False)
val_batches = get_batches(path+'valid', batch_size=batch_size, shuffle=False)
test_batches = get_batches(path+'test', batch_size=batch_size, shuffle=False)
steps_per_epoch = int(np.ceil(batches.samples/batch_size))
validation_steps = int(np.ceil(val_batches.samples/batch_size))

Found 5048 images belonging to 3 classes.
Found 1683 images belonging to 3 classes.
Found 4018 images belonging to 1 classes.


In [4]:
(val_classes, trn_classes, val_labels, trn_labels, 
    val_filenames, filenames, test_filenames) = get_classes(path)

Found 5048 images belonging to 3 classes.
Found 1683 images belonging to 3 classes.
Found 4018 images belonging to 1 classes.


## Imagenet conv features with VGG16
Based on code from Statefarm_original notebook of @Jeremy, lesson 4.

Since we have so little data, and it is similar to imagenet images (full color photos), using pre-trained VGG weights is likely to be helpful - in fact it seems likely that we won't need to fine-tune the convolutional layer weights much, if at all.

So we can pre-compute the output of the last convolutional layer, as we did in lesson 3 when we experimented with dropout. (However this means that we can't use full data augmentation, since we can't pre-compute something that changes every image.)

In [5]:
# Import our class, using VGG16 with BatchNorm
import vgg16bn_p3; reload(vgg16bn_p3) # *_p3 version code for Python 3.6 and Keras 2.0
from vgg16bn_p3 import Vgg16BN

### Step 1: pre-compute the output of the last convolutional layer

In [6]:
# Grab VGG16 and find the last convolutional layer.
vgg = Vgg16BN()
model=vgg.model
last_conv_idx = [i for i,l in enumerate(model.layers) if type(l) is Convolution2D][-1]
conv_layers = model.layers[:last_conv_idx+1]

In [7]:
# Build a new model that includes everything up to that last convolutional layer
conv_model = Sequential(conv_layers)

In [None]:
model.summary()

In [None]:
# Predict the outputs of that model by calculating the activations of that last convolutional layer
conv_feat = conv_model.predict_generator(batches, int(np.ceil(batches.samples/batch_size)), workers=3)

In [None]:
# As this takes time, save it to load it in the future
save_array(path+'results/bn_conv_feat.dat', conv_feat)

In [None]:
conv_val_feat = conv_model.predict_generator(val_batches, int(np.ceil(val_batches.samples/batch_size)), workers=3)

In [None]:
save_array(path+'results/bn_conv_val_feat.dat', conv_val_feat)

In [None]:
conv_test_feat = conv_model.predict_generator(test_batches, int(np.ceil(test_batches.samples/batch_size)), workers=3)

In [None]:
save_array(path+'results/bn_conv_test_feat.dat', conv_test_feat)

In [None]:
# We can look at the original model and find the last convo layer "conv2d_13" with output shape (none, 512, 14, 14)
# and compare it with the shape of of our new model's output.
model.summary()

In [None]:
# It's the same than "conv2d_13" !
conv_val_feat.shape

In [None]:
# If this notebook was fully run once, we can directly reload the activations
conv_feat = load_array(path+'results/bn_conv_feat.dat')
conv_val_feat = load_array(path+'results/bn_conv_val_feat.dat')
#conv_test_feat = load_array(path+'results/bn_conv_test_feat.dat')
conv_val_feat.shape

### Step 2: Build new model on top, with dense layers
Since we've pre-computed the output of the last convolutional layer, we need to create a network that takes that as input, and predicts our 3 classes.

Let's try using a simplified version of VGG's dense layers.

In [None]:
# we make 'p' a parameter to try different Dropout amounts
def get_bn_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dropout(p),
        Dense(4096, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(4096, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(3, activation='softmax')
        ]

In [None]:
p=0.5

In [None]:
bn_model = Sequential(get_bn_layers(p))
bn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# quick test, verify batch_size
bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, epochs=1, 
             validation_data=(conv_val_feat, val_labels))

In [None]:
# full run on 15 epochs
bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, epochs=15, 
             validation_data=(conv_val_feat, val_labels))

In [None]:
bn_model.save_weights(path+'models/bn_conv22.h5')

### Step 3: Pre-computed data augmentation by making 5 augmented copies of training set

We'll use our usual data augmentation parameters:

In [8]:
# Warning: uses HUGE amount of RAM (up to 60gb) and takes 90 mins for a simple 'batches.samples*2'
# with batch_size=64. Check forum at
# http://forums.fast.ai/t/state-farm-full-how-not-to-run-out-of-memory-with-vgg-da-batches-samples-5/3469/2
# Maybe reduce batch_size and workers to 1 ? YES !!!
# Also kernel needs a 'Restart and clear output' to clear RAM (now 15gb RAM + 12gb SWAP used),
# so run Step 1 + 2 first, then reset and run Step 3.
gen_t = image.ImageDataGenerator(rotation_range=30, height_shift_range=0.025, horizontal_flip=True,
                                shear_range=0.05, width_shift_range=0.1, zoom_range=0.5)
da_batches = get_batches(path+'train', gen_t, batch_size=1, shuffle=False)

Found 5048 images belonging to 3 classes.


In [9]:
da_conv_feat = conv_model.predict_generator(da_batches, int(np.ceil(da_batches.samples*5)), workers=3)

In [10]:
save_array(path+'results/bn_da5_conv_feat24.dat', da_conv_feat)

Let's include the real training data as well in its non-augmented form.

In [11]:
conv_feat = load_array(path+'results/bn_conv_feat.dat')

In [None]:
da_conv_feat = load_array(path+'results/bn_da5_conv_feat24.dat')

In [12]:
da_conv_feat_update = np.concatenate([da_conv_feat, conv_feat])

In [13]:
save_array(path+'results/bn_da5_conv_feat_update24.dat', da_conv_feat_update)

In [None]:
da_conv_feat_update = load_array(path+'results/bn_da5_conv_feat_update24.dat')

Since we've now got a dataset 6x bigger than before, we'll need to copy our labels 6 times too.

In [14]:
da_trn_labels = np.concatenate([trn_labels]*6)

In [15]:
save_array(path+'results/bn_da5_trn_labels24.dat', da_trn_labels)

In [None]:
da_trn_labels = load_array(path+'results/bn_da5_trn_labels24.dat')

Based on some experiments the previous model works well, maybe with bigger dense layers like 512 later ?

In [53]:
# we make 'p' a parameter to try different Dropout amounts
def get_bn_da_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dropout(p),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(3, activation='softmax')
        ]

In [17]:
p=0.5

In [18]:
bn_da_model = Sequential(get_bn_da_layers(p))
bn_da_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [19]:
conv_val_feat = load_array(path+'results/bn_conv_val_feat.dat')

In [20]:
# quick test, verify batch_size
bn_da_model.fit(da_conv_feat_update, da_trn_labels, batch_size=32, epochs=1, 
             validation_data=(conv_val_feat, val_labels))

Train on 30288 samples, validate on 1683 samples
Epoch 1/1


<keras.callbacks.History at 0x7fe332130be0>

In [21]:
# full run on 50 epochs
bn_da_model.fit(da_conv_feat_update, da_trn_labels, batch_size=32, epochs=50, 
             validation_data=(conv_val_feat, val_labels))

Train on 30288 samples, validate on 1683 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe3321272b0>

In [22]:
# Let's save those weights.
bn_da_model.save_weights(path+'models/bn_da5_dense256_p05_lr001_conv241_50e.h5')

In [23]:
# Let's load those weights.
bn_da_model.load_weights(path+'models/bn_da5_dense256_p05_lr001_conv241_50e.h5')

OSError: Unable to open file (Unable to open file: name = 'data/testing/models/bn_da5_conv241_50e.h5', errno = 2, error message = 'no such file or directory', flags = 0, o_flags = 0)

In [54]:
p=0.6

In [55]:
bn_da_model = Sequential(get_bn_da_layers(p))
bn_da_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [56]:
# full run on 50 epochs
bn_da_model.fit(da_conv_feat_update, da_trn_labels, batch_size=32, epochs=50, 
             validation_data=(conv_val_feat, val_labels))

Train on 30288 samples, validate on 1683 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe32283a7b8>

In [27]:
# Let's save those weights.
bn_da_model.save_weights(path+'models/bn_da5_dense3_64_p06_lr001_conv241_50e.h5')

In [28]:
p=0.3

In [29]:
bn_da_model = Sequential(get_bn_da_layers(p))
bn_da_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [30]:
# full run on 50 epochs
bn_da_model.fit(da_conv_feat_update, da_trn_labels, batch_size=32, epochs=50, 
             validation_data=(conv_val_feat, val_labels))

Train on 30288 samples, validate on 1683 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe32865ae80>

In [31]:
# Let's save those weights.
bn_da_model.save_weights(path+'models/bn_da5_dense256_p03_lr001_conv241_50e.h5')

In [32]:
p=0.7

In [33]:
bn_da_model = Sequential(get_bn_da_layers(p))
bn_da_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [34]:
# full run on 50 epochs
bn_da_model.fit(da_conv_feat_update, da_trn_labels, batch_size=32, epochs=50, 
             validation_data=(conv_val_feat, val_labels))

Train on 30288 samples, validate on 1683 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe326f3ea20>

In [35]:
# Let's save those weights.
bn_da_model.save_weights(path+'models/bn_da5_dense256_p07_lr001_conv241_50e.h5')

In [36]:
p=0.8

In [37]:
bn_da_model = Sequential(get_bn_da_layers(p))
bn_da_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [38]:
# full run on 50 epochs
bn_da_model.fit(da_conv_feat_update, da_trn_labels, batch_size=32, epochs=50, 
             validation_data=(conv_val_feat, val_labels))

Train on 30288 samples, validate on 1683 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe3255cc550>

In [40]:
# Let's save those weights.
bn_da_model.save_weights(path+'models/bn_da5_dense256_p08_lr001_conv241_50e.h5')

### Submissions

Don't forget to add clipping for Kaggle submissions as it's very important to get the best cross_entropy loss function.

In [57]:
def do_clip(arr, mx): return np.clip(arr, (1-mx)/9, mx)

In [58]:
val_preds = bn_da_model.predict(conv_val_feat, batch_size=batch_size)

In [59]:
np.mean(keras.metrics.categorical_crossentropy(val_labels, do_clip(val_preds, 0.93)).eval())

0.74608827417631163

In [60]:
conv_test_feat = load_array(path+'results/bn_conv_test_feat.dat')

In [45]:
bn_da_model.save_weights(path+'models/bn_da5_dense256_p06_lr001_conv241_50e.h5')

In [61]:
preds = bn_da_model.predict(conv_test_feat, batch_size=batch_size)

In [62]:
subm = do_clip(preds,0.93)

In [63]:
subm_name = path+'results/subm_24_bn_da_vgg_clip093_1.csv'

In [64]:
classes = sorted(batches.class_indices, key=batches.class_indices.get)

In [65]:
submission = pd.DataFrame(subm, columns=classes)
submission.insert(0, 'image_name', [a[8:] for a in test_filenames])
submission.head()

Unnamed: 0,image_name,Type_1,Type_2,Type_3
0,0.jpg,0.108437,0.195085,0.696478
1,1.jpg,0.051419,0.93,0.007778
2,10.jpg,0.030744,0.921474,0.047782
3,100.jpg,0.018633,0.895548,0.085818
4,10000.jpg,0.037839,0.93,0.009036


In [66]:
submission.to_csv(subm_name, index=False)

In [67]:
FileLink(subm_name)