### Intel MobileODT Kaggle competition

https://www.kaggle.com/c/intel-mobileodt-cervical-cancer-screening


In [1]:
import os, sys
from __future__ import print_function, division
from importlib import reload 
import utils_p3; reload(utils_p3)
from utils_p3 import *
%matplotlib inline
from IPython.display import FileLink
import tensorflow as tf
import six
import numpy as np
import pandas as pd
import cv2
import glob
import random
from PIL import ImageFile
from keras import applications
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
#print("TensorFlow version: %s" % tf.__version__)
print("Keras version: %s" % keras.__version__)

Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 1080 Ti (0000:01:00.0)
Using Theano backend.


Keras version: 2.0.3


### Global declarations

In [2]:
HOME_DIR = os.getcwd()
#path = "data/imgs/"
path = "data/testing/"
#path = "data/testing/sample/"
train_valid_fraction = 0.75
image_shape = (224,224)
patience = 3
batch_size = 64
ImageFile.LOAD_TRUNCATED_IMAGES = True
keras.backend.image_data_format() #verify image_data_format for theano vs TF

'channels_first'

In [None]:
%pwd

In [None]:
HOME_DIR

### Setup batches

In [3]:
batches = get_batches(path+'train', batch_size=batch_size, shuffle=False)
val_batches = get_batches(path+'valid', batch_size=batch_size*2, shuffle=False)
test_batches = get_batches(path+'test', batch_size=batch_size*2, shuffle=False)
steps_per_epoch = int(np.ceil(batches.samples/batch_size))
validation_steps = int(np.ceil(val_batches.samples/(batch_size*2)))

Found 5048 images belonging to 3 classes.
Found 1683 images belonging to 3 classes.
Found 4018 images belonging to 1 classes.


In [4]:
(val_classes, trn_classes, val_labels, trn_labels, 
    val_filenames, filenames, test_filenames) = get_classes(path)

Found 5048 images belonging to 3 classes.
Found 1683 images belonging to 3 classes.
Found 4018 images belonging to 1 classes.


## Imagenet conv features with VGG16
from Statefarm_original notebook

Since we have so little data, and it is similar to imagenet images (full color photos), using pre-trained VGG weights is likely to be helpful - in fact it seems likely that we won't need to fine-tune the convolutional layer weights much, if at all.

So we can pre-compute the output of the last convolutional layer, as we did in lesson 3 when we experimented with dropout. (However this means that we can't use full data augmentation, since we can't pre-compute something that changes every image.)

In [5]:
# Import our class
import vgg16_p3; reload(vgg16_p3)
from vgg16_p3 import Vgg16

### Step 1: pre-compute the output of the last convolutional layer

In [11]:
# Grab VGG16 and find the last convolutional layer
vgg = Vgg16()
model=vgg.model
last_conv_idx = [i for i,l in enumerate(model.layers) if type(l) is Convolution2D][-1]
conv_layers = model.layers[:last_conv_idx+1]

In [12]:
# Build a new model that includes everything up to that last convolutional layer
conv_model = Sequential(conv_layers)

In [15]:
# Predict the outputs of that model by calculating the activations of that last convolutional layer
conv_feat = conv_model.predict_generator(batches, int(np.ceil(batches.samples/batch_size)), workers=3)

In [16]:
# As this takes time, save it to load it in the future
save_array(path+'results/conv_feat.dat', conv_feat)

In [17]:
conv_val_feat = conv_model.predict_generator(val_batches, int(np.ceil(val_batches.samples/(batch_size*2))), workers=3)

In [18]:
save_array(path+'results/conv_val_feat.dat', conv_val_feat)

In [19]:
conv_test_feat = conv_model.predict_generator(test_batches, int(np.ceil(test_batches.samples/(batch_size*2))), workers=3)

In [20]:
save_array(path+'results/conv_test_feat.dat', conv_test_feat)

In [None]:
# Data Augmentation: at 1:15:55 in L4 video, we can see Jeremy had a cell for gen_t then computed a da_conv_feat with nb_sample*5

In [21]:
# We can look at the original model and find the last convo layer "conv2d_13" with output shape (none, 512, 14, 14)
# and compare it with the shape of of our new model's output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_1 (Lambda)            (None, 3, 224, 224)       0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 3, 226, 226)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 224, 224)      1792      
_________________________________________________________________
zero_padding2d_2 (ZeroPaddin (None, 64, 226, 226)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 224, 224)      36928     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 112, 112)      0         
_________________________________________________________________
zero_padding2d_3 (ZeroPaddin (None, 64, 114, 114)      0         
__________

In [22]:
# It's the same than "conv2d_13" !
conv_val_feat.shape

(1683, 512, 14, 14)

### Step 2: Build new model on top, with dense layers
Since we've pre-computed the output of the last convolutional layer, we need to create a network that takes that as input, and predicts our 3 classes. Let's try using a simplified version of VGG's dense layers.

In [23]:
# we make 'p' a parameter to try different Dropout amounts
def get_bn_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dropout(p/2),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dropout(p/2),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(3, activation='softmax')
        ]

In [24]:
p=0.5

In [25]:
bn_model = Sequential(get_bn_layers(p))
bn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [26]:
bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, epochs=1, 
             validation_data=(conv_val_feat, val_labels))

Train on 5048 samples, validate on 1683 samples
Epoch 1/1


<keras.callbacks.History at 0x7f51f7078f28>

In [27]:
bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, epochs=10, 
             validation_data=(conv_val_feat, val_labels))

Train on 5048 samples, validate on 1683 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f51ee11a0b8>

In [28]:
bn_model = Sequential(get_bn_layers(p))
bn_model.compile(Adam(lr=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

In [29]:
bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, epochs=25, 
             validation_data=(conv_val_feat, val_labels))

Train on 5048 samples, validate on 1683 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f51d44de898>

### Submissions

Don't forget to add clipping for Kaggle submissions as it's very important to get the best cross_entropy loss function.

In [35]:
def do_clip(arr, mx): return np.clip(arr, (1-mx)/9, mx)

In [37]:
val_preds = bn_model.predict(conv_val_feat, batch_size=batch_size*2)

In [38]:
np.mean(keras.metrics.categorical_crossentropy(val_labels, do_clip(val_preds, 0.93)).eval())

1.4063674341234258

In [None]:
conv_test_feat = load_array(path+'results/conv_test_feat.dat')

In [39]:
preds = bn_model.predict(conv_test_feat, batch_size=batch_size*2)

In [40]:
subm = do_clip(preds,0.93)

In [41]:
subm_name = path+'results/subm_14_vgg.csv'

In [42]:
classes = sorted(batches.class_indices, key=batches.class_indices.get)

In [48]:
submission = pd.DataFrame(subm, columns=classes)
submission.insert(0, 'img', [a[8:] for a in test_filenames])
submission.head()

Unnamed: 0,img,Type_1,Type_2,Type_3
0,0.jpg,0.007778,0.93,0.007778
1,1.jpg,0.007778,0.007778,0.93
2,10.jpg,0.007778,0.93,0.007778
3,100.jpg,0.057717,0.93,0.007778
4,10000.jpg,0.007778,0.93,0.007778


In [49]:
submission.to_csv(subm_name, index=False)

In [50]:
FileLink(subm_name)