In [1]:
%matplotlib inline

import os
import numpy as np
from skimage import data, io, filters, transform
from skimage.transform import resize
import pandas as pd
import shutil
import matplotlib
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from keras import optimizers
import warnings
warnings.filterwarnings('ignore')
# Load functions to read images
from keras.preprocessing import image as image_utils

Using TensorFlow backend.


# Statement of mission

In this notebook a simple (default) network is trained on images of size (150, 150).
The training data is generated before training and written to disk.
The idea behind this is to be able to have a look at images that are not classified correctly.

In [2]:
# Define parameters:

# Parameters for the transformations in Data Augmentation
idg_width_shift_range = 0.2
idg_height_shift_range = 0.2
idg_shear_range = 0.2
idg_zoom_range = 0.2
idg_horflip = True

# Parameters for data output (image size, number of trafos)
target_size = (150, 150)
num_versions = 10
input_shape = target_size + (3, )

# Fraction of data for validation
val_fraction = 0.2

# Parameters for training
batch_size = 10 
file_weights = "weights_examplePicsNetwork.h5"

# threshold for classicfication
threshold_class = 0.5

In [None]:
# Read labels of the training data
train_labels = pd.read_csv("data/train_labels.csv")

In [3]:
# Make to image augmentation data generators. One without transformation (for validation data)
no_transform_datagen = ImageDataGenerator(rescale = 1./255)

image_prep_gen = ImageDataGenerator(rescale = 1./255,
                                   width_shift_range = idg_width_shift_range,
                                   height_shift_range = idg_height_shift_range,
                                   shear_range = idg_shear_range,
                                   zoom_range = idg_zoom_range,
                                   horizontal_flip = idg_horflip)

# Folder structure

The training and validation data is written to folders train\_small and val\_small in order to
keep it separate from other network runs.

In [10]:
# I am getting weird error messages sometimes when running this cell. Don't know why;
# I suspect it is probably a bad idea running file io in loops.

dirlist = ["data/train_small/0/",
           "data/train_small/1/",
           "data/val_small/0/",
           "data/val_small/1/",
           "data/test_0/",
           "data/test_1/"]

for element in dirlist:
    if os.path.exists(element):
        shutil.rmtree(element)

for element in dirlist:
    os.makedirs(element)


# Generate training- and validation data

The training data consists of a subset of the original training images augmented by a number of
transformed images (set by num\_versions). The validation data is not transformed; the intent is
to get a glimpse of the pictures that won't get classified correctly.

In [11]:


for file_counter in range(0, train_labels.shape[0]):
    file_id = str(train_labels["name"][file_counter])
    
    # Load image, make it an array so the flow()-method can work with it.
    img = image_utils.load_img("data_save/train/" + file_id + ".jpg", target_size = target_size)
    img = image_utils.img_to_array(img)
    img = img.reshape((1, ) + img.shape)
    
    # Make sure the file names start with the number of the source file and the label
    file_prefix = file_id.rjust(4, "0") + "_" + str(train_labels["invasive"][file_counter]) + "_"
    
    if np.random.rand() < val_fraction:
        save_to_dir = "data/val_small/" + str(train_labels["invasive"][file_counter]) + "/"
        for batch in no_transform_datagen.flow(x = img, batch_size = 1, save_to_dir = save_to_dir,
                                            save_prefix = file_prefix, save_format = "jpg"):
            break # Stop the image generation after the first image
    else:
        save_to_dir = "data/train_small/" + str(train_labels["invasive"][file_counter]) + "/"
        counter = 0
        for batch in image_prep_gen.flow(x = img, batch_size = 1, save_to_dir = save_to_dir,
                                            save_prefix = file_prefix, save_format = "jpg"):
            counter += 1
            if counter > (num_versions - 1):
                break
    

In [12]:
# Since the image generation has been outsourced to the previous section, I can now
# create data generators without transformations
# However, I am still not sure what batch size is in the context of an image generator
# Ok, seems to be the number of pics looked at before a gradient computations or something.

train_generator = no_transform_datagen.flow_from_directory("data/train_small", target_size = target_size,
                                                     batch_size = batch_size, class_mode = "binary")

val_generator = no_transform_datagen.flow_from_directory("data/val_small", target_size = target_size,
                                                   batch_size = batch_size, class_mode = "binary")



Found 18621 images belonging to 2 classes.
Found 432 images belonging to 2 classes.


In [13]:

model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=input_shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
metrics=['accuracy'])

model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=input_shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Above are the conv layers, below the dense layers

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
metrics=['accuracy'])


If the weights are already available, load them instead of training the network anew.

In [14]:


# Next on is model fitting. Check if the weights are already saved, in
# this case no fitting is necessary
if os.path.exists(file_weights):
    model.load_weights(file_weights)
else:
    model.fit_generator(
            train_generator,
            steps_per_epoch=18000 // batch_size,
            epochs=5,
            validation_data=val_generator,
            validation_steps=4000 // batch_size)
    model.save_weights(file_weights)
    

In [15]:
# Copy images from val/0 and val/1 into test_0/test and test_1/test for the generators
# (Hmm, the interface makes standard stuff easy, but to me it seems it is just a tad to inflexible)

shutil.copytree(src = "data/val_small/0", dst = "data/test_0/test")
shutil.copytree(src = "data/val_small/1", dst = "data/test_1/test")

'data/test_1/test'

In [16]:
# Make two generators, one for each folder in val_small in order to see when
# the model fails in each class

val_0_generator = no_transform_datagen.flow_from_directory(directory = "data/test_0/",
                                                      target_size = target_size,
                                                      batch_size = 1,
                                                      class_mode = None,
                                                      shuffle = False)
val_1_generator = no_transform_datagen.flow_from_directory(directory = "data/test_1/",
                                                      target_size = target_size,
                                                      batch_size = 1,
                                                      class_mode = None,
                                                      shuffle = False)


Found 173 images belonging to 1 classes.
Found 259 images belonging to 1 classes.


In [17]:
val_0_predictions = model.predict_generator(val_0_generator, val_0_generator.n, verbose = 1)

val_1_predictions = model.predict_generator(val_1_generator, val_1_generator.n, verbose = 1)



In [18]:
if not os.path.exists("data/test_0/test/correct"):
    os.makedirs("data/test_0/test/correct")
if not os.path.exists("data/test_0/test/incorrect"):
    os.makedirs("data/test_0/test/incorrect")

if not os.path.exists("data/test_1/test/correct"):
    os.makedirs("data/test_1/test/correct")
if not os.path.exists("data/test_1/test/incorrect"):
    os.makedirs("data/test_1/test/incorrect")
    
    
# I know there must be a way to iterate over two lists at the same time, but I don't know how
# it can be done
counter = 0
for prediction in val_0_predictions:
    filename = val_0_generator.filenames[counter][5:]
    
    if prediction > threshold_class:
        shutil.move("data/test_0/test/" + filename, "data/test_0/test/incorrect/" + filename)
    else:
        shutil.move("data/test_0/test/" + filename, "data/test_0/test/correct/" + filename)
        
    counter += 1

counter = 0
for prediction in val_1_predictions:
    filename = val_1_generator.filenames[counter][5:]
    
    if prediction <= threshold_class:
        shutil.move("data/test_1/test/" + filename, "data/test_1/test/incorrect/" + filename)
    else:
        shutil.move("data/test_1/test/" + filename, "data/test_1/test/correct/" + filename)
        
    counter += 1