In [38]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import pickle
import csv
import numpy as np
from scipy.misc import imread
import shutil
import matplotlib.pyplot as plt

import h5py
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K

from utilities import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# global variables
train_dir = "data/model_train"
test_dir = "data/model_test"
test_csv = "data/model_test.csv"
num_classes = 7    # number of whales to be considered (in order of occuurence)
max_pred = 5       # number of ranked predictions (default 5)
batch_size = 16    # used for training as well as validation

# create training environment for training data
num_train_imgs = create_small_case(sel_whales = np.arange(1,num_classes+1),          # whales to be considered
                                   small_dir = train_dir, 
                                   sub_dirs = True) 

old directory removed data/model_train
copy 34 images for whale # 1 in ordered list, called w_1287fbc
copy 27 images for whale # 2 in ordered list, called w_98baff9
copy 26 images for whale # 3 in ordered list, called w_7554f44
copy 23 images for whale # 4 in ordered list, called w_1eafe46
copy 22 images for whale # 5 in ordered list, called w_693c9ee
copy 22 images for whale # 6 in ordered list, called w_ab4cae2
copy 22 images for whale # 7 in ordered list, called w_fd1cb9d
176  images of  7  whales copied


In [27]:
# create training environment for validation data
# for first testing make test directory same as train directory (no real solution !!)
num_test_imgs = create_small_case(sel_whales = np.arange(1,num_classes+1),   # whales to be considered
                                  small_dir = test_dir, 
                                  small_csv = test_csv,       # create csv file for later use                                  
                                  sub_dirs = False) 

old directory removed data/model_test
copy 34 images for whale # 1 in ordered list, called w_1287fbc
copy 27 images for whale # 2 in ordered list, called w_98baff9
copy 26 images for whale # 3 in ordered list, called w_7554f44
copy 23 images for whale # 4 in ordered list, called w_1eafe46
copy 22 images for whale # 5 in ordered list, called w_693c9ee
copy 22 images for whale # 6 in ordered list, called w_ab4cae2
copy 22 images for whale # 7 in ordered list, called w_fd1cb9d
write csv file: data/model_test.csv
176  images of  7  whales copied


In [32]:
# create the base pre-trained model
base_model = InceptionV3(weights='imagenet', include_top=False)

# add a global spatial average pooling layer
x = base_model.output
x = GlobalAveragePooling2D()(x)
# let's add a fully-connected layer
x = Dense(1024, activation='relu')(x)
# and a logistic layer
predictions = Dense(num_classes, activation='softmax')(x)

# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# first: train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional InceptionV3 layers
for layer in base_model.layers:
    layer.trainable = False

# compile the model (should be done *after* setting layers to non-trainable)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# define image generator
train_gen = image.ImageDataGenerator()

# train the model on the new data for a few epochs
train_flow = train_gen.flow_from_directory(train_dir, batch_size = batch_size)
whale_class_map = (train_flow.class_indices)           # get dict mapping whalenames --> class_no
class_whale_map = make_label_dict(directory=train_dir) # get dict mapping class_no --> whalenames
print(whale_class_map)
print(class_whale_map)

Found 176 images belonging to 7 classes.
{'w_7554f44': 3, 'w_1eafe46': 1, 'w_98baff9': 4, 'w_693c9ee': 2, 'w_ab4cae2': 5, 'w_fd1cb9d': 6, 'w_1287fbc': 0}
{0: 'w_1287fbc', 1: 'w_1eafe46', 2: 'w_693c9ee', 3: 'w_7554f44', 4: 'w_98baff9', 5: 'w_ab4cae2', 6: 'w_fd1cb9d'}


In [29]:
model.fit_generator(train_flow, verbose = 2, steps_per_epoch=num_train_imgs//batch_size, epochs=5)                        

# let's predict the test set to see a rough score
labels = make_label_dict(directory=train_dir)

Epoch 1/5
 - 79s - loss: 6.0587
Epoch 2/5
 - 84s - loss: 6.6013
Epoch 3/5
 - 79s - loss: 3.5672
Epoch 4/5
 - 84s - loss: 4.1117
Epoch 5/5
 - 81s - loss: 3.4059


In [30]:
valid_gen = image.ImageDataGenerator(preprocessing_function=preprocess_input)
valid_flow = valid_gen.flow_from_directory(test_dir, target_size = (299,299), class_mode=None)

Found 176 images belonging to 1 classes.


In [31]:
preds = model.predict_generator(valid_flow, verbose = 1)
print(preds.shape)
print(preds[:10])

(176, 7)
[[  1.43390670e-01   1.20498249e-02   9.96106640e-02   1.70189682e-02
    2.39478111e-01   3.71767074e-01   1.16684653e-01]
 [  3.59858155e-01   5.44713298e-03   6.46971539e-02   4.19636928e-02
    1.20005965e-01   3.64625931e-01   4.34020013e-02]
 [  7.32351840e-01   6.97384530e-04   9.58802830e-03   8.39053129e-04
    1.41878715e-02   6.60437346e-02   1.76291972e-01]
 [  1.36970937e-01   1.69535000e-02   8.93628374e-02   1.72086097e-02
    1.72094017e-01   4.66754287e-01   1.00655846e-01]
 [  1.88263923e-01   1.35726733e-02   1.01994358e-01   2.51630675e-02
    1.50598407e-01   3.19591492e-01   2.00816125e-01]
 [  2.46684030e-01   1.36084538e-02   1.70694068e-02   5.85768605e-03
    2.94588115e-02   2.99378067e-01   3.87943566e-01]
 [  2.92610496e-01   1.54833402e-02   4.02200632e-02   1.01385666e-02
    4.00396556e-01   1.20670527e-01   1.20480441e-01]
 [  1.86274096e-01   3.46491598e-02   8.29836577e-02   2.01654173e-02
    1.50789559e-01   3.28331083e-01   1.96807086e-01]

In [33]:
# ge list of model predictions: one ordered list of maxpred whalenames per image
top_k = preds.argsort()[:, -max_preds:][:, ::-1]    
model_preds = [([class_whale_map[i] for i in line]) for line in top_k]  

# get list of true labels: one whalename per image
valid_list = read_csv(file_name = test_csv)    # list with (filename, whalename)
true_labels = []
for fn in valid_flow.filenames:
    offset, filename = fn.split('/')
    whale = [line[1] for line in valid_list if line[0]==filename][0]
    true_labels.append(whale)

print("true labels: \n", np.array(model_preds)[:10])
print("model predictions: \n", np.array(true_labels)[:10])

true labels: 
 [['w_ab4cae2' 'w_98baff9' 'w_1287fbc' 'w_fd1cb9d' 'w_693c9ee']
 ['w_ab4cae2' 'w_1287fbc' 'w_98baff9' 'w_693c9ee' 'w_fd1cb9d']
 ['w_1287fbc' 'w_fd1cb9d' 'w_ab4cae2' 'w_98baff9' 'w_693c9ee']
 ['w_ab4cae2' 'w_98baff9' 'w_1287fbc' 'w_fd1cb9d' 'w_693c9ee']
 ['w_ab4cae2' 'w_fd1cb9d' 'w_1287fbc' 'w_98baff9' 'w_693c9ee']
 ['w_fd1cb9d' 'w_ab4cae2' 'w_1287fbc' 'w_98baff9' 'w_693c9ee']
 ['w_98baff9' 'w_1287fbc' 'w_ab4cae2' 'w_fd1cb9d' 'w_693c9ee']
 ['w_ab4cae2' 'w_fd1cb9d' 'w_1287fbc' 'w_98baff9' 'w_693c9ee']
 ['w_1287fbc' 'w_fd1cb9d' 'w_ab4cae2' 'w_693c9ee' 'w_98baff9']
 ['w_1287fbc' 'w_fd1cb9d' 'w_ab4cae2' 'w_98baff9' 'w_693c9ee']]
model predictions: 
 ['w_1287fbc' 'w_fd1cb9d' 'w_fd1cb9d' 'w_1287fbc' 'w_7554f44' 'w_1eafe46'
 'w_98baff9' 'w_98baff9' 'w_7554f44' 'w_693c9ee']


In [45]:
MAP = mean_average_precision(model_preds, true_labels, max_preds)
print("MAP", MAP)

Dummy_map = Dummy_MAP(probs = 'weighted', distributed_as = test_csv, image_no = len(valid_list))
print("Dummy MAP weighted", Dummy_map)

MAP 0.320359848485
Dummy MAP uniform 0.258712121212
Dummy MAP weighted 0.260700757576
