In [1]:
%matplotlib inline

import os
import numpy as np
from skimage import data, io, filters, transform
from skimage.transform import resize
import pandas as pd
import shutil
import matplotlib
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.models import load_model
from keras import backend as K
from keras import applications
from keras import optimizers
import warnings
warnings.filterwarnings('ignore')

# Load functions to read images
from keras.preprocessing import image as image_utils
import time

Using TensorFlow backend.


# Statement of mission

In the notebook "Example pictures Simple Network" we generated augmented image output beforehand of training. The purpose was to be able to have look at the image data after validation, in particular to be able to have a look at wrongly classified pictures. Here, we want to combine this with the pretrained VGG16 network, and also using bottleneck features. This refers to output of networks (not weights!) stored in a file on disk. Running the VGG16 network is very costly, so using bottleneck features could save some time that instead goes to training the dense layers we put on top of VGG16. However, we must make sure we have enough augmented pictures available.

The code is mostly based on what can be found in this blog entry:

https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html

In [2]:
# Define parameters:

# Parameters for the transformations in Data Augmentation
idg_width_shift_range = 0.2
idg_height_shift_range = 0.2
idg_shear_range = 0.2
idg_zoom_range = 0.2
idg_horflip = True

# Parameters for data output (image size, number of trafos)
target_size = (224, 224)
num_versions = 3 # How many versions of each picture to generate
input_shape = target_size + (3, )
test_num_images = 40 # Do set to large value to process all images

# Fraction of data for validation
val_fraction = 0.5

# Parameters for training
batch_size = 20

# threshold for classification
threshold_class = 0.5

In [3]:
# Read labels of the training data
train_labels = pd.read_csv("data/train_labels.csv")

In [4]:
# Make to image augmentation data generators. One without transformation (for validation data)
no_transform_datagen = ImageDataGenerator(rescale = 1./255)

image_prep_gen = ImageDataGenerator(rescale = 1./255,
                                   width_shift_range = idg_width_shift_range,
                                   height_shift_range = idg_height_shift_range,
                                   shear_range = idg_shear_range,
                                   zoom_range = idg_zoom_range,
                                   horizontal_flip = idg_horflip)

# Generate folder structure

The first preprocessing step is to generate the pictures. We will save them in folders data/train_vgg and data/val_vgg to keep them separate from the other network runs.

In [None]:
input_shape


In [7]:
# I am getting weird error messages sometimes when running this cell. Don't know why.
# I suspect it is probably a bad idea running file io in loops; seems to do things
# out of the order as specified here, what's with 

data_root_dir = "data/vgg_impregen_bottleneck_vgg/"

subdirlist = ["train/0/", "train/1/",
              "val/0/", "val/1/",
              "test_0", "test_1"]


dirlist = []
for subdir in subdirlist:
    dirlist.append(data_root_dir + subdir)


# The test folders are not really test data. Instead, they keep a mirror of the validation
# data. The pictures will then be classified once again by our network and depending on
# correctness of result put in a subfolder. This somewhat convoluted scheme is necessary
# because of the syntax and limitation of keras.

for element in dirlist:
    if os.path.exists(element):
        shutil.rmtree(element)

for element in dirlist:
    os.makedirs(element)

In [8]:


for file_counter in range(0, min(test_num_images, train_labels.shape[0])):
    file_id = str(train_labels["name"][file_counter])
    
    # Load image, make it an array so the flow()-method can work with it.
    img = image_utils.load_img("data/train/" + file_id + ".jpg", target_size = target_size)
    img = image_utils.img_to_array(img)
    img = img.reshape((1, ) + img.shape)
    
    # Make sure the file names start with the number of the source file and the label
    file_prefix = file_id.rjust(4, "0") + "_" + str(train_labels["invasive"][file_counter]) + "_"
    
    if np.random.rand() < val_fraction:
        save_to_dir = data_root_dir + "val/" + str(train_labels["invasive"][file_counter]) + "/"
        for batch in no_transform_datagen.flow(x = img, batch_size = 1, save_to_dir = save_to_dir,
                                            save_prefix = file_prefix, save_format = "jpg"):
            break # Stop the image generation after the first image
    else:
        save_to_dir = data_root_dir + "train/" + str(train_labels["invasive"][file_counter]) + "/"
        counter = 0
        for batch in image_prep_gen.flow(x = img, batch_size = 1, save_to_dir = save_to_dir,
                                            save_prefix = file_prefix, save_format = "jpg"):
            counter += 1
            if counter > (num_versions - 1):
                break

In [11]:
num_images_train_0 = len(os.listdir(data_root_dir + "train/0"))
num_images_train_1 = len(os.listdir(data_root_dir + "train/1"))
num_images_val_0 = len(os.listdir(data_root_dir + "val/0"))
num_images_val_1 = len(os.listdir(data_root_dir + "val/1"))

num_images_train_0
#num_images_train_1


                       


30

In [13]:
# Get the number of images in each subfolder
num_images_train_0 = len(os.listdir(data_root_dir + "train/0"))
num_images_train_1 = len(os.listdir(data_root_dir + "train/1"))
num_images_val_0 = len(os.listdir(data_root_dir + "val/0"))
num_images_val_1 = len(os.listdir(data_root_dir + "val/1"))

images_to_train = num_images_train_0 + num_images_train_1

imagenet_model = applications.VGG16(weights="imagenet", include_top=False, input_shape=input_shape)


generator_for_bottleneck = no_transform_datagen.flow_from_directory(data_root_dir + "train",
                                                                   target_size = target_size,
                                                                   batch_size = batch_size,
                                                                   class_mode = None,
                                                                   shuffle = False)
startzeit = time.time()
bottleneck_train = imagenet_model.predict_generator(generator_for_bottleneck, images_to_train)
endzeit = time.time()

print("Duration of training " + str(images_to_train) + " images was " + str(round(endzeit - startzeit, 2)))

# The write command in the blog does not work. Somehow, the write mode must
# be set to "wb" instead of only "w"
np.save(open('bottleneck_030617.npy', 'wb'), bottleneck_train)

Found 48 images belonging to 2 classes.
Duration of training 48 images was 810.42


# Prepare data for convolutional network

In [None]:
batch_size = 16

# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(
        rescale=1./255,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

# this is the augmentation configuration we will use for testing:
# only rescaling
test_datagen = ImageDataGenerator(rescale=1./255)

# this is a generator that will read pictures found in
# subfolers of 'data/train', and indefinitely generate
# batches of augmented image data
train_generator = train_datagen.flow_from_directory(
        'data/train',  # this is the target directory
        target_size=(224, 224),  # all images will be resized to 150x150
        batch_size=batch_size,
        class_mode='binary')  # since we use binary_crossentropy loss, we need binary labels

# this is a similar generator, for validation data
validation_generator = test_datagen.flow_from_directory(
        'data/val',
        target_size=(224, 224),
        batch_size=batch_size,
        class_mode='binary')

# Test generator
test_generator = test_datagen.flow_from_directory(
        'data/test',
        target_size=(224, 224),
        batch_size = 1,
        class_mode=None,
        shuffle=False)

# Build and convolutional network on top of image net

In [None]:
img_width, img_height = 224, 224
input_shape = (img_width, img_height, 3)

In [None]:
imagenet_model = applications.VGG16(weights='imagenet', include_top=False, input_shape=(img_width, img_height, 3))

top_model = Sequential()
top_model.add(Flatten(input_shape=imagenet_model.output_shape[1:]))
top_model.add(Dense(256, activation='relu'))
top_model.add(Dense(1, activation='sigmoid'))

model = Model(inputs=imagenet_model.input, outputs=top_model(imagenet_model.output))

model.compile(loss='binary_crossentropy', optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
              metrics=['accuracy'])

# Fix pretrained layers from imagenet
for layer in model.layers[:17]:
    layer.trainable = False

model.summary()

In [None]:
# Load model which was trained before
model.load_weights('top_model_2017.05.24.h5')

In [None]:
hist = model.fit_generator(
        train_generator,
        steps_per_epoch=2000 // batch_size,
        epochs=5,
        validation_data=validation_generator,
        validation_steps=800 // batch_size)
model.save_weights('top_model_2017.05.25.h5')

# Evaluate

In [None]:
score = model.evaluate_generator(validation_generator, steps = 20)

In [None]:
print(score[1])

# Predict

In [None]:
test_predictions = model.predict_generator(test_generator, test_generator.n, verbose=1)

In [None]:
name = list()
for name_str in test_generator.filenames:
    name_str=name_str[5:-4]
    name.append(int(name_str))

In [None]:
prediction_results = pd.DataFrame(name, columns=["name"])

In [None]:
prediction_results["invasive"] = test_predictions

In [None]:
prediction_results.sort("name", inplace=True)

In [None]:
prediction_results.to_csv("data\\Submission_5_CNN_Pretrained-2017.05.25.csv", index = False)