# Train a CNN-only classifier on separate images, using the InceptionV3 output as features

In [1]:
import h5py
import numpy as np
import os
import random
from os.path import join
from keras import layers
from keras.models import Model
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Project-local helper imported from keras_utils (its definition is not part of this notebook).
# NOTE(review): presumably configures the TensorFlow session that Keras will use -- confirm in keras_utils.
from keras_utils import set_keras_session
set_keras_session()

In [3]:
# Dataset identifier. The digits that follow the 'UCF' prefix are parsed
# as the number of classes (11 for 'UCF11').
dataset = 'UCF11'
nb_classes = int(dataset[len('UCF'):])

In [4]:
def preprocess_images(images):
    """Linearly rescale pixel values so that 0 maps to -1 and 255 maps to 1.

    InceptionV3 requires images to be in the range -1 to 1.
    """
    unit_scaled = images / 255.
    centered = unit_scaled - 0.5
    return centered * 2

In [5]:
def train_generator(dataset_dir, batch_size):
    """Endlessly yield random training batches of single frames.

    Every sample is drawn by picking a random HDF5 file from ``dataset_dir``
    and then a random frame from that file's ``'X'`` dataset. The label is
    the first entry of the file's ``'Y'`` dataset, which is the same
    convention ``valid_generator`` uses.

    Args:
        dataset_dir: Directory whose entries are the HDF5 files to sample from.
        batch_size: Number of frames in each yielded batch.

    Yields:
        A tuple ``(images, labels)``: the sampled frames stacked into one
        array and passed through ``preprocess_images``, and the labels
        one-hot encoded with ``nb_classes`` classes.

    Raises:
        ValueError: If ``dataset_dir`` contains no files.
    """
    all_files = os.listdir(dataset_dir)
    if not all_files:
        raise ValueError('No files found in {!r}'.format(dataset_dir))

    while True:
        images = []
        labels = []

        for _ in range(batch_size):
            random_filename = random.choice(all_files)

            with h5py.File(join(dataset_dir, random_filename), 'r') as hf:
                frames = hf['X']
                random_idx = np.random.randint(frames.shape[0])

                # Index the HDF5 dataset directly so that only the sampled
                # frame is read from disk instead of the whole video.
                images.append(frames[random_idx])
                # Keep a single label per sample rather than the whole 'Y' array.
                labels.append(hf['Y'][0])

        assert len(labels) == len(images)

        images = np.array(images)
        labels = to_categorical(np.array(labels), nb_classes)

        yield preprocess_images(images), labels
        
        

def valid_generator(dataset_dir, batch_size):
    """Endlessly cycle over the files of ``dataset_dir``, one batch per file.

    Each batch contains every frame of one file's ``'X'`` dataset, passed
    through ``preprocess_images``, together with one-hot labels obtained by
    repeating the first entry of the file's ``'Y'`` dataset once per frame.

    Args:
        dataset_dir: Directory whose entries are the HDF5 files to iterate over.
        batch_size: Unused; accepted so the signature matches
            ``train_generator``. The size of each batch is the number of
            frames stored in the corresponding file.

    Yields:
        A tuple ``(frames, labels)`` for one file.

    Raises:
        ValueError: If ``dataset_dir`` contains no files.
    """
    all_files = os.listdir(dataset_dir)
    if not all_files:
        # Without this guard the infinite loop below would spin forever
        # without ever yielding a batch.
        raise ValueError('No files found in {!r}'.format(dataset_dir))

    while True:

        for filename in all_files:

            with h5py.File(join(dataset_dir, filename), 'r') as hf:
                frames = hf['X'][:]
                single_label = hf['Y'][:][0]

            # Both values are in memory now; yielding outside the ``with``
            # block means no file handle stays open while the generator is
            # suspended (or abandoned by the consumer).
            fr_labels = np.array([single_label] * frames.shape[0])

            yield preprocess_images(frames), to_categorical(fr_labels, nb_classes)

In [6]:
# Location of the pre-extracted frames: <separate_dataset_dir>/{train,valid}
separate_dataset_dir = join('datasets', dataset, 'separate_frames_50_h_240_w_320')
train_dir = join(separate_dataset_dir, 'train')
valid_dir = join(separate_dataset_dir, 'valid')

# One directory entry per sample file.
train_files = os.listdir(train_dir)
train_samples_count = len(train_files)
valid_samples_count = len(os.listdir(valid_dir))

# Open explicitly read-only (older h5py versions default to a writable mode)
# and query the dataset's shape metadata instead of loading every frame.
with h5py.File(join(train_dir, train_files[0]), 'r') as hf:
    image_shape = hf['X'].shape[1:]
    print('Image shape is', image_shape)

Image shape is (240, 320, 3)


In [7]:
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model
from keras import backend as K

# Pre-trained InceptionV3 base with ImageNet weights and without its top layers
base_model = InceptionV3(weights='imagenet', include_top=False)

# New head applied on top of the base output, in this order:
# global spatial average pooling -> 256-unit ReLU dense layer -> dropout (rate 0.5)
head_layers = [
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)

# Softmax output layer with one unit per class (nb_classes)
predictions = layers.Dense(nb_classes, activation='softmax')(x)

# The model that will be trained: InceptionV3 input -> class probabilities
model = Model(inputs=base_model.input, outputs=predictions)

# Only the newly added head is trained at first:
# every layer of the InceptionV3 base is frozen
for base_layer in base_model.layers:
    base_layer.trainable = False

In [8]:
from functools import partial
from keras import metrics

# Batch size passed to the training and validation generators
batch_size = 128

# Top-3 categorical accuracy, built by fixing k=3 on the generic top-k metric.
# A partial object has no __name__ of its own, so one is assigned explicitly
# (presumably this is the name Keras uses to label the metric).
top_3_k_categorical_accuracy = partial(metrics.top_k_categorical_accuracy, k=3)
top_3_k_categorical_accuracy.__name__ = 'top_3'

tracked_metrics = ['accuracy', top_3_k_categorical_accuracy]

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=tracked_metrics,
)

In [9]:
# Number of training steps per epoch: (sample files * 50) // batch size.
# NOTE(review): 50 presumably is the number of frames stored per file, as
# suggested by the 'separate_frames_50_...' directory name -- confirm.
frames_per_video = 50
train_steps = train_samples_count * frames_per_video // batch_size

# The validation generator yields one batch per file, so one validation
# step per file covers the whole validation directory once.
history = model.fit_generator(
    train_generator(train_dir, batch_size),
    steps_per_epoch=train_steps,
    epochs=3,
    validation_data=valid_generator(valid_dir, batch_size),
    validation_steps=valid_samples_count,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
def take(generator, how_many):
    """
    Lazily yield at most how_many items from the start of generator.

    Stops early if the generator runs out first, and never pulls more
    items from it than it yields. This function is itself a generator.
    """

    remaining = range(how_many)
    source = iter(generator)

    for _ in remaining:
        try:
            item = next(source)
        except StopIteration:
            return
        yield item

The model classifies single images, so video-level accuracy is obtained by aggregating its per-frame predictions:
* Approach 1: Compute the mean of the predictions over all frames of a video, then take the class with the highest mean.
* Approach 2: Take the best predicted class for each frame (argmax) and count how many times each class is predicted as the best one across the video's frames. Then take the class with the highest count.
* Approach 3 (not implemented here, but tested): Instead of the mean, compute the product of the per-frame predictions (since we work with probabilities). This yields a lower accuracy than using the mean.

In [11]:
from collections import Counter

# Running tallies over the validation videos
count_videos = 0
count_top_1 = 0
count_top_3 = 0
count_top_1_argmax = 0

for frames, labels in take(valid_generator(valid_dir, batch_size), valid_samples_count):

    # The validation generator repeats one label for every frame of a batch,
    # so the first one-hot row identifies the true class of the video.
    true_label_idx = np.argmax(labels[0])

    # Model output for each frame of the video
    predicted_labels = model.predict(frames)

    # Approach 1 (mean): average the predictions over the frames,
    # then check the best class and the three best classes.
    mean_scores = np.mean(predicted_labels, axis=0)
    top_3_classes = np.argsort(mean_scores)[-3:]

    count_top_3 += int(true_label_idx in top_3_classes)
    count_top_1 += int(true_label_idx == np.argmax(mean_scores))

    # Approach 2 (highest count): each frame votes for its best class
    # and the most frequently voted class wins.
    votes = Counter(np.argmax(predicted_labels, axis=1))
    winning_class = votes.most_common(1)[0][0]

    count_top_1_argmax += int(winning_class == true_label_idx)

    # One more video evaluated
    count_videos += 1

In [12]:
# Report each hit counter as a fraction of the evaluated videos
for description, hits in (
    ('Top 1 accuracy (using mean):', count_top_1),
    ('Top 3 accuracy (using mean):', count_top_3),
    ('Top 1 accuracy (using highest count):', count_top_1_argmax),
):
    print(description, hits / count_videos)

Top 1 accuracy (using mean): 0.7003154574132492
Top 3 accuracy (using mean): 0.9526813880126183
Top 1 accuracy (using highest count): 0.6971608832807571


2018/03/01: Training with images with full resolution (240 x 320) results in better accuracy