In [4]:
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from os import listdir
import pandas as pd
import numpy as np
from skimage import io

## Splitting data.

First, I am going to split the data in train set and test set. Notice that the train set will be late split again for validation set. 

Right now, I will use 1000 images for testing, and the rest for training/validation

In [5]:
import random
train_data = []  # This will later be split in validation too
test_data = []
for file in listdir("train/train"):
    some_number = random.randint(1,100)
    label = "1" if "dog" in file else "0" 
    if len(test_data) >= 1000 or some_number < 85:
        train_data.append([file, label])
    else:
        test_data.append([file, label])
        
train = pd.DataFrame(train_data, columns=["filename", "class"])
test = pd.DataFrame(test_data, columns = ["filename", "class"])

In [6]:
train.head()

Unnamed: 0,filename,class
0,cat.0.jpg,0
1,cat.1.jpg,0
2,cat.1000.jpg,0
3,cat.10000.jpg,0
4,cat.10001.jpg,0


In [7]:
test.head()

Unnamed: 0,filename,class
0,cat.10.jpg,0
1,cat.100.jpg,0
2,cat.10009.jpg,0
3,cat.10011.jpg,0
4,cat.10012.jpg,0


In [8]:
print("Train size", len(train))
print("Test size", len(test))

for label in ["0", "1"]:
    print("------------")
    print("\tTrain has", len(train[train["class"]==label]), label)
    print("\tTest has", len(test[test["class"]==label]), label)

Train size 24000
Test size 1000
------------
	Train has 11500 0
	Test has 1000 0
------------
	Train has 12500 1
	Test has 0 1


## The data is quite balanced, ~50% are dogs (class 1), ~50% are cats (class 0). As this is a binary problem, we can output a sigmoid as the output function

In [17]:
IMG_SIZE = (224, 224)
BATCH_SIZE = 64
train_image_generator = ImageDataGenerator(rescale=1./255, 
                                           rotation_range=90, 
                                           horizontal_flip=True, 
                                           vertical_flip=True,
                                           validation_split=0.2)

In [18]:
train_generator = train_image_generator.flow_from_dataframe(train, "train/train", seed=42,
                                                    target_size=(IMG_SIZE[0], IMG_SIZE[1]),
                                                    batch_size=BATCH_SIZE,
                                                    class_mode="binary",
                                                    subset="training",
                                                    shuffle=True,      
                                                    save_format="jpeg")

validation_generator = train_image_generator.flow_from_dataframe(train, "train/train", seed=42,
                                                    target_size=(IMG_SIZE[0], IMG_SIZE[1]),
                                                    batch_size=BATCH_SIZE,
                                                    class_mode="binary",
                                                    subset="validation",
                                                    shuffle=False,                  
                                                    save_format="jpeg")

Found 19802 images belonging to 2 classes.
Found 4198 images belonging to 2 classes.


In [19]:
from keras.applications import vgg16
model = vgg16.VGG16(weights='imagenet', 
                    include_top=False, 
                    input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3), 
                    pooling="avg")

### Now, we are going to only train the last 5 layers.

In [20]:
for layer in model.layers[:-8]:
        layer.trainable = False

## Finally, we are going to add a Dense layer of 512 units and then the output layer (a sigmoid function) at the end.

In [21]:
from keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from keras.models import Model, Sequential

# Although this part can be done also with the functional API, I found that for this simple models, this becomes more intuitive
transfer_model_vgg16 = Sequential()

for layer in model.layers:
    transfer_model_vgg16.add(layer)
transfer_model_vgg16.add(Dense(512, activation="relu"))  # Very important to use relu as activation function, search for "vanishing gradiends" :)
transfer_model_vgg16.add(Dense(1, activation="sigmoid")) # Finally our activation layer! we use 2 outputs as we have either cats or dogs

## Lets display our model 

In [22]:
transfer_model_vgg16.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 56, 56, 256)       295168    
__________

In [24]:
adam = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.00001)
transfer_model_vgg16.compile(adam,loss="binary_crossentropy",metrics=["accuracy"] )

vgg16_model_history = transfer_model_vgg16.fit_generator(train_generator, 
                                             steps_per_epoch = train_generator.n // BATCH_SIZE,
                                             validation_data = validation_generator,
                                             validation_steps = validation_generator.n // BATCH_SIZE,
                                            epochs=2)

Epoch 1/2
Epoch 2/2


## Lets define a small function to plot our predictions

In [29]:
from IPython.display import Image, display
def plot_prediction(image_path, label):
    display(Image(filename=image_path, width=IMG_SIZE[0], height=IMG_SIZE[1]))
    prediction = "dog"
    confidence = label
    if label < 0.5:
        prediction = "cat"
        confidence = (1-label)
    legend = "The image %s above is a %s with a confidence of %.2f%% %f" % (image_path, prediction, confidence*100, label)
    print(legend)

## And another function to efficiently yield batches of images and (optionally) labels to predict

In [42]:
import cv2
from skimage import io

def build_batches(df, has_labels=True, limit=500, batch_size=BATCH_SIZE, produce="images"):
    """
    produce: Can be either "images" in which case an array of normalized images is returned or 
             "paths" in which case, a string with the full dir is returned
    """
    X = []
    y = []
    paths = []
    i = 0
    for _, row in df.iterrows():
        if has_labels:
            y.append(row["class"])
        raw_image_path = "train/train/" if has_labels else "test/test/"
        raw_image_path += row["filename"]
        raw_image = io.imread(raw_image_path)
        raw_image = cv2.resize(raw_image, (IMG_SIZE[0], IMG_SIZE[1]), interpolation=cv2.INTER_CUBIC)
        X.append(raw_image)
        paths.append(raw_image_path)
        i += 1
        if i == limit:
            break
        if i > 0 and i % batch_size == 0:
            X = np.array(X)
            y = np.array(y)
            X = X / 255
            
            if produce == "images":
                yield X, y
            else:
                yield paths, y
            paths = []
            X = []
            y = []

    X = np.array(X)
    y = np.array(y)
    
    X = X / 255
    
    if produce == "images":
        yield X, y         
    else:
        yield paths, y

In [31]:
samples = 1000
transfer_model_vgg16.evaluate_generator(build_batches(test, limit=samples), steps=samples/BATCH_SIZE, verbose=True)



[0.25907485604286196, 0.9009999990463257]

### Not a bad result, lets plot a couple of those images...

In [None]:
some_predictions = transfer_model_vgg16.predict_generator(build_batches(test, limit=12, batch_size=1), steps=12, verbose=True)

In [None]:
idx = 0
for mini_batch_files, mini_batch_labels in build_batches(test, limit=samples, batch_size=1, produce="paths"):
    mini_batch_file = mini_batch_files[0]
    mini_batch_label = mini_batch_labels[0]
    predicted_label = some_predictions[idx][0]
    idx += 1
    #print(mini_batch_file, mini_batch_label, predicted_label)
    plot_prediction(mini_batch_file, predicted_label)

### Now, it would be interesting to plot images that are NOT correctly predicted... lets do that too.

In [None]:
samples = 1000
some_predictions = transfer_model_vgg16.predict_generator(build_batches(test, limit=samples, batch_size=1), steps=samples, verbose=True)

In [None]:

print("Total predictions", some_predictions.shape)
idx = 0
errors = 0
for mini_batch_files, mini_batch_labels in build_batches(test, limit=samples, batch_size=1, produce="paths"):
    mini_batch_file = mini_batch_files[0]
    mini_batch_label = mini_batch_labels[0]
    predicted_label = some_predictions[idx][0]
    if abs(float(mini_batch_label) - float(predicted_label)) > 0.5:
        errors += 1
        if errors < 10:
            plot_prediction(mini_batch_file, predicted_label)
    idx += 1
print("Total errors...", errors)

In [43]:
my_limit = 12500
i = 0
output_df = []
for file in listdir("test/test/"):    
    output_df.append([file, file.split(".")[0]])
    i += 1
    if i == my_limit:
        break
output = pd.DataFrame(output_df, columns=["filename", "id"])
print(len(output))
output.head()

12500


Unnamed: 0,filename,id
0,1.jpg,1
1,10.jpg,10
2,100.jpg,100
3,1000.jpg,1000
4,10000.jpg,10000


In [None]:
results = transfer_model_vgg16.predict_generator(build_batches(output, limit=-1, has_labels=False, batch_size=64), steps=12500/64, verbose=True)

 28/195 [===>..........................] - ETA: 19:44

In [None]:
results.shape

In [None]:
output["label"] = results

output.head(15)

## Lets plot a couple of predictions...

In [None]:
stop = 10
for idx, row in output.iterrows():
    path = "test/test/" + row["id"] + ".jpg"
    plot_prediction(path, row["label"])
    stop -= 1
    if stop == 0:
        break

## And finally prepare the output file

In [None]:
del output["filename"]

In [None]:
output.head(10)

In [None]:
len(output)

In [None]:
output.to_csv("submission_file.csv", index=False)