### Load training, validation and testing data¶

In [2]:
from data_helper import load_dataset

In [3]:
IMAGE_FOLDER_PATH = 'dataset/resized/frames/'
LABEL_FOLDER_PATH = 'dataset/labels/'

In [4]:
# Load object recognition labels
print('-'*100)
print('Load object recoginition inputs and labels...')
train_head_image_paths, train_hand_image_paths, train_obj_labels, \
val_head_image_paths, val_hand_image_paths, val_obj_labels, \
test_head_image_paths, test_hand_image_paths, test_obj_labels = load_dataset(image_folder_path=IMAGE_FOLDER_PATH,
                                                                             label_folder_path=LABEL_FOLDER_PATH,
                                                                             label_type='obj',
                                                                             hand_types=['left', 'right'],
                                                                             with_head=True,
                                                                             validation_split_ratio=0.15)
# Load gesture recognition labels
# Since the splitting train/val set doesn't involve randomness,
# the order of data is same as the ones loaded from the above `load_dataset(..., label_type='obj', ...)
print('-'*100)
print('Load gesture recoginition inputs and labels...')
train_head_image_paths, train_hand_image_paths, train_ges_labels, \
val_head_image_paths, val_hand_image_paths, val_ges_labels, \
test_head_image_paths, test_hand_image_paths, test_ges_labels = load_dataset(image_folder_path=IMAGE_FOLDER_PATH,
                                                                             label_folder_path=LABEL_FOLDER_PATH,
                                                                             label_type='ges',
                                                                             hand_types=['left', 'right'],
                                                                             with_head=True,
                                                                             validation_split_ratio=0.15)

# Only take hand image paths for baseline
train_image_paths =  train_hand_image_paths
val_image_paths = val_hand_image_paths
test_image_paths = test_hand_image_paths

----------------------------------------------------------------------------------------------------
Load object recoginition inputs and labels...
----------------------------------------------------------------------------------------------------
[Train (Head)] number of image paths: 12744
[Train (Hand)] number of image paths: 12744
[Train (Label)] number of labels: 12744
----------------------------------------------------------------------------------------------------
[Validation (Head)] number of image paths: 2248
[Validation (Hand)] number of image paths: 2248
[Validation (Label)] number of labels: 2248
----------------------------------------------------------------------------------------------------
[Test (Head)] number of image paths: 12776
[Test (Hand)] number of image paths: 12776
[Test (Label)] number of labels: 12776
----------------------------------------------------------------------------------------------------
Load gesture recoginition inputs and labels...
---------

### Use Tensorflow to build computational graph

In [5]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.slim.nets
import vgg_preprocessing

#### Path configs and hyperparameters

In [6]:
PRETRAINED_HAND_GESTURE_MODEL_PATH = 'model/hand_gesture_vgg_16/hand_gesture_vgg_16_model'
PRETRAINED_HAND_OBJ_MODEL_PATH = 'model/hand_obj_vgg_16/hand_obj_vgg_16_model'
MODEL_PATH = 'model/two_stream_vgg_16_multi_loss/two_stream_vgg_16_multi_loss'

num_classes_obj = 24 # object classes
num_classes_ges = 13 # gesture classes
batch_size = 16 # batch_size=32 is not enough for GPU with only 11170 MiB memory when optimizing 2 VGG nets.
num_workers = 20
max_epochs1 = 30
max_epochs2 = 30
max_patience = 5 # For early stopping
learning_rate1 = 1e-3
learning_rate2 = 1e-5
dropout_keep_prob = 0.5
weight_decay = 5e-4

#### Building blocks of  two-stream CNN model

In [7]:
def stream_vgg_16(inputs,
                  is_training=True,
                  dropout_keep_prob=0.5,
                  spatial_squeeze=False,
                  scope='stream_vgg_16',
                  fc_conv_padding='VALID'):
    
    """ Reference from "https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py"
        A VGG16 net excluding 'fc7' and 'fc8' layers.
        
        Returns: 
            A shape=(?, 4096) deep features if spatial_squeeze == True, else shape=(?, 1, 1, 4096).
            and end_points dict.
    """
    
    with tf.variable_scope(scope, 'stream_vgg_16', [inputs]) as sc:
            end_points_collection = sc.name + '_end_points'
            
            with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                                outputs_collections=end_points_collection):
                
                net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], scope='pool1')
                net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
                net = slim.max_pool2d(net, [2, 2], scope='pool2')
                net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
                net = slim.max_pool2d(net, [2, 2], scope='pool3')
                net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
                net = slim.max_pool2d(net, [2, 2], scope='pool4')
                net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
                net = slim.max_pool2d(net, [2, 2], scope='pool5')
                # Use conv2d instead of fully_connected layers.
                net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
                net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout6')
                # Convert end_points_collection into a end_point dict.
                end_points = slim.utils.convert_collection_to_dict(end_points_collection)
                
                if spatial_squeeze:
                    net = tf.squeeze(net, [1, 2], name='fc6/squeezed')
                    end_points[sc.name + '/fc6'] = net
                return net, end_points
            
def muli_label_fusion_fc(inputs,
              num_classes_ges=13,
              num_classes_obj=24,
              is_training=True,
              dropout_keep_prob=0.5,
              spatial_squeeze=True,
              scope='muli_label_fusion_fc'):
    
    """ Reference from "https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py"
        A VGG16 net only include 'fc7' and 'fc8' layers.
        
        Args:
            inputs: A list of tensor with shape like (?, 1, 1, 4096). (the 2, 3 axis must be "1")
            
        Returns:
            The last op containing the log predictions and end_points dict.
    """
    
    with tf.variable_scope(scope, 'muli_label_fusion_fc', [inputs]) as sc:
        end_points_collection = sc.name + '_end_points'
                
        with slim.arg_scope([slim.conv2d],
                            outputs_collections=end_points_collection):
            # [(?, 1, 1, 4096), (?, 1, 1, 4096)] => (?, 1, 1, 8192)
            net = tf.concat(inputs, axis=3)
            # (?, 1, 1, 8192) => (?, 1, 1, 4096)
            net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
            net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout7')
            # For gesture logits: (?, 1, 1, 4096) => (?, 1, 1, num_classes_ges)
            ges_net = slim.conv2d(net, num_classes_ges, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='ges_fc8')
            # For object logits: (?, 1, 1, 4096) => (?, 1, 1, num_classes_obj)
            obj_net = slim.conv2d(net, num_classes_obj, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='obj_fc8')
            # Convert end_points_collection into a end_point dict.
            end_points = slim.utils.convert_collection_to_dict(end_points_collection)
            
            if spatial_squeeze:
                # (?, 1, 1, num_classes) => (?, num_classes)
                ges_net = tf.squeeze(ges_net, [1, 2], name='ges_fc8/squeezed')
                end_points[sc.name + '/ges_fc8'] = ges_net
                
                obj_net = tf.squeeze(obj_net, [1, 2], name='obj_fc8/squeezed')
                end_points[sc.name + '/obj_fc8'] = obj_net
                
            return ges_net, obj_net, end_points

#### Build our two-stream CNN

In [8]:
%%time

def dataset_map_fn(image_path, ges_label, obj_label, is_training):
    # Load image
    image_string = tf.read_file(image_path)
    image_decoded = tf.image.decode_png(image_string, channels=3)
    image = tf.cast(image_decoded, tf.float32)
    # Preprocess image
    preprocessed_image = tf.cond(is_training,
                                 true_fn=lambda: vgg_preprocessing.preprocess_image(image, 224, 224, is_training=True),
                                 false_fn=lambda: vgg_preprocessing.preprocess_image(image, 224, 224, is_training=False))
    return preprocessed_image, ges_label, obj_label

graph = tf.Graph()
with graph.as_default():
    # ---------------------------------------------------------------------
    # Indicates whether we are in training or in test mode
    # Since VGG16 has applied `dropout`, we need to disable it when testing.
    is_training = tf.placeholder(dtype=tf.bool, name='is_training')
    
    # Training, validation, testing data to feed in.
    image_paths = tf.placeholder(dtype=tf.string, shape=(None,), name='image_paths')
    ges_labels = tf.placeholder(dtype=tf.int32, shape=(None,), name='ges_labels')
    obj_labels = tf.placeholder(dtype=tf.int32, shape=(None,), name='obj_labels')
    
    # Use dataset API to automatically generate batch data by iterator.
    dataset = tf.contrib.data.Dataset.from_tensor_slices((image_paths, ges_labels, obj_labels))
    dataset = dataset.map(lambda image_path, ges_label, obj_label: dataset_map_fn(image_path, ges_label, obj_label, is_training))
    dataset = dataset.shuffle(buffer_size=10000)
    batched_dataset = dataset.batch(batch_size)
    
    # Now we define an iterator that can operator on dataset.
    # The iterator can be reinitialized by calling:
    # sess.run(dataset_init_op, feed_dict={image_paths=train_image_paths, labels=train_labels}) 
    # for 1 epoch on the training set.
    
    # Once this is done, we don't need to feed any value for images and labels
    # as they are automatically pulled out from the iterator queues.

    # A reinitializable iterator is defined by its structure. We could use the
    # `output_types` and `output_shapes` properties of dataset.
    # The dataset will be fed with training, validation or testing data.
    iterator = tf.contrib.data.Iterator.from_structure(batched_dataset.output_types,
                                                       batched_dataset.output_shapes)
    
    # A batch of data to feed into the networks.
    batch_images, batch_ges_labels, batch_obj_labels = iterator.get_next()
    dataset_init_op = iterator.make_initializer(batched_dataset)
    
    # =====================================================================
    # Start to build our two-stream cnn model.
    vgg = tf.contrib.slim.nets.vgg
    # Apply L2 regularization with weight decay.
    with slim.arg_scope(vgg.vgg_arg_scope(weight_decay=weight_decay)):
        # Firstly, build our first stream cnn model -- pretrained hand gesture vgg16 net,
        # excluding 'fc7' and'fc8' layers
        hand_gesture_model_4096_features, _ = stream_vgg_16(batch_images,
                                                            is_training=is_training,
                                                            dropout_keep_prob=dropout_keep_prob,
                                                            spatial_squeeze=False,
                                                            scope='hand_gesture_vgg_16')

        # Secondly, build our second stream cnn model -- pretrained hand obj vgg16 net,
        # excluding 'fc7' and'fc8' layers
        hand_obj_model_4096_features, _ = stream_vgg_16(batch_images,
                                                        is_training=is_training,
                                                        dropout_keep_prob=dropout_keep_prob,
                                                        spatial_squeeze=False,
                                                        scope='hand_obj_vgg_16')

        # Finally, concatenate our 2 stream cnn models with fc layers architecture in vgg16 net.
        ges_logits, obj_logits, _ = muli_label_fusion_fc(inputs=[hand_gesture_model_4096_features, 
                                                                 hand_obj_model_4096_features],
                                                         num_classes_ges=num_classes_ges,
                                                         num_classes_obj=num_classes_obj,
                                                         is_training=is_training,
                                                         dropout_keep_prob=dropout_keep_prob,
                                                         spatial_squeeze=True,
                                                         scope='muli_label_fusion_fc')
        
    # =====================================================================

    # ---------------------------------------------------------------------
    # Restore only the layers up to 'fc6' (included)
    # Calling function `hand_gesture_model_init_fn(sess)` will load all the pretrained weights.
    hand_gesture_model_variables = slim.get_variables(scope='hand_gesture_vgg_16')
    # Since the variable scope name of checkpoint file is 'vgg_16',
    # which is different from our new scope name 'hand_gesture_vgg_16',
    # we need to re-map variable names in order to make `Saver` know which new variable name to restore.
    # If we don't know the variable names of source checkpoint file, we can run the script `inspect_checkpoint.py`, 
    # For example:
    # $ python inspect_checkpoint.py --file_name=model/hand_gesture_vgg_16/hand_gesture_vgg_16_model
    # to inspect the variable names from our source checkpoint file.
    hand_gesture_model_init_fn = tf.contrib.framework.assign_from_checkpoint_fn(
        PRETRAINED_HAND_GESTURE_MODEL_PATH,
        var_list={var.name.replace('hand_gesture_vgg_16', 'vgg_16').split(':')[0]: var for var in hand_gesture_model_variables}
    )
        
    # Same as we done to `hand_gesture_model`.
    hand_obj_model_variables = slim.get_variables(scope='hand_obj_vgg_16')
    hand_obj_model_init_fn = tf.contrib.framework.assign_from_checkpoint_fn(
        PRETRAINED_HAND_OBJ_MODEL_PATH,
        var_list={var.name.replace('hand_obj_vgg_16', 'vgg_16').split(':')[0]: var for var in hand_obj_model_variables}
    )
    
    # ---------------------------------------------------------------------
    # Using tf.losses, any loss is added to the tf.GraphKeys.LOSSES collection
    # We can then call the total loss easily
    ges_loss = tf.losses.sparse_softmax_cross_entropy(labels=batch_ges_labels, logits=ges_logits)
    obj_loss = tf.losses.sparse_softmax_cross_entropy(labels=batch_obj_labels, logits=obj_logits)
    # Same as: ges_loss + obj_loss + regularization losses
    # We will then jointly minimize this loss. (Joint training)
    loss = tf.losses.get_total_loss()
    
    # First we want to train only the reinitialized last layer fc8 for a few epochs.
    # We run minimize the loss only with respect to the `fusion_fc` variables (weight and bias).
    fusion_fc_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate1)
    fusion_fc_train_op = fusion_fc_optimizer.minimize(loss, var_list=slim.get_variables(scope='muli_label_fusion_fc'))
    
    # Then we want to finetune the entire model for a few epochs.
    # We run minimize the loss only with respect to all the variables.
    full_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate2)
    full_train_op = full_optimizer.minimize(loss)
    
    # Evaluation metrics
    ges_prediction = tf.to_int32(tf.argmax(ges_logits, 1))
    obj_prediction = tf.to_int32(tf.argmax(obj_logits, 1))
    
    ges_correct_prediction = tf.equal(ges_prediction, batch_ges_labels)
    obj_correct_prediction = tf.equal(obj_prediction, batch_obj_labels)
    
    ges_accuracy = tf.reduce_mean(tf.cast(ges_correct_prediction, tf.float32))
    obj_accuracy = tf.reduce_mean(tf.cast(obj_correct_prediction, tf.float32))
    
    # Initialize the variables (i.e. assign their default value)
    init_op = tf.global_variables_initializer()
    
    # 'Saver' op to save and restore all the variables
    saver = tf.train.Saver()

CPU times: user 2.37 s, sys: 27.9 ms, total: 2.4 s
Wall time: 2.4 s


### Start training

In [9]:
from tqdm import tqdm

In [10]:
def evaluate(sess, loss, correct_prediction, dataset_init_op, feed_dict):
    """
        Evaluation in training loop.
        Check the performance of the model on either train, val or test (depending on `dataset_init_op`)
        Note: The arguments are tensorflow operators defined in the graph.
    """
    
    # Initialize the correct dataset.
    sess.run(dataset_init_op, feed_dict=feed_dict)

    data_loss = 0
    num_correct = 0
    num_samples = 0
    
    # Evaluate on every batch.
    while True:
        try:
            # Disable `is_training` since we have `dropout` in VGG net.
            _loss, _correct_prediction = sess.run([loss, correct_prediction], feed_dict={is_training: False})

            data_loss += _loss
            num_correct += _correct_prediction.sum() # e.g: [True, False, True].sum() = 2
            num_samples += _correct_prediction.shape[0] # Batch size
            
        except tf.errors.OutOfRangeError:
            break

    data_loss = data_loss / num_samples
    acc = num_correct / num_samples

    return data_loss, acc

In [11]:
# --------------------------------------------------------------------------
# Now that we have built the graph and finalized it, we define the session.
# The session is the interface to *run* the computational graph.
# We can call our training operations with `sess.run(train_op)` for instance
sess = tf.Session(graph=graph)

### Initialize variables or restore variables from checkpoint

In [12]:
RESTORE = True
max_acc = 0.0

if RESTORE:
    print('Restore variables from checkpoint...')
    # If checkpoint exists, restore it to session.
    saver.restore(sess, MODEL_PATH)
    # Regain max validation accuracy from model
    print('Regaining max validation accuracy...')
    %time _, val_acc = evaluate(sess, loss, obj_correct_prediction, dataset_init_op, \
                                feed_dict={image_paths: val_image_paths, \
                                           ges_labels: val_ges_labels, \
                                           obj_labels: val_obj_labels, \
                                           is_training: False})
    max_acc = val_acc
    print('Max validation accuracy: {}'.format(max_acc))
else:
    print('Initialize variables from scratch...')
    # Initialize all variables
    sess.run(init_op)
    # Load the pretrained weights for 2-stream model
    hand_gesture_model_init_fn(sess) 
    hand_obj_model_init_fn(sess)

Restore variables from checkpoint...
INFO:tensorflow:Restoring parameters from model/two_stream_vgg_16_multi_loss/two_stream_vgg_16_multi_loss
Regaining max validation accuracy...
CPU times: user 33.3 s, sys: 2.63 s, total: 36 s
Wall time: 35.3 s
Max validation accuracy: 0.5026690391459074


### Only train `fusion_fc` layers.

In [None]:
patience = 0

# Update only the last layer for a few epochs.
for epoch in tqdm(range(max_epochs1)):
    # Run an epoch over the training data.
    print('-'*110)
    print('Starting epoch {}/{}'.format(epoch+1, max_epochs1))
    # Here we initialize the iterator with the training set.
    # This means that we can go through an entire epoch until the iterator becomes empty.
    sess.run(dataset_init_op, feed_dict={image_paths: train_image_paths,
                                         ges_labels: train_ges_labels,
                                         obj_labels: train_obj_labels,
                                         is_training: True})
    while True:
        try:
            _ = sess.run(fusion_fc_train_op, feed_dict={is_training: True})
        except tf.errors.OutOfRangeError:
            break

    # Check performance every epoch
    train_loss, train_acc = evaluate(sess, loss, obj_correct_prediction, dataset_init_op,
                                     feed_dict={image_paths: train_image_paths,
                                                ges_labels: train_ges_labels,
                                                obj_labels: train_obj_labels,
                                                is_training: True})
    
    val_loss, val_acc = evaluate(sess, loss, obj_correct_prediction, dataset_init_op,
                                 feed_dict={image_paths: val_image_paths,
                                            ges_labels: val_ges_labels,
                                            obj_labels: val_obj_labels,
                                            is_training: False})
    
    print('[Train] loss: {} | accuracy: {}'.format(train_loss, train_acc))
    print('[Validation] loss: {} | accuracy: {}'.format(val_loss, val_acc))
    
    # Save checkpoint
    if val_acc > max_acc:
        patience = 0
        max_acc = val_acc
        save_path = saver.save(sess, MODEL_PATH)
        print("Model updated and saved in file: %s" % save_path)
    else:
        patience += 1
        print('Model not improved at epoch {}/{}. Patience: {}/{}'.format(epoch+1, max_epochs1, patience, max_patience))
    # Early stopping.
    if patience > max_patience:
        print('Max patience exceeded. Early stopping.')
        break

  0%|          | 0/30 [00:00<?, ?it/s]

--------------------------------------------------------------------------------------------------------------
Starting epoch 1/30
[Train] loss: 0.2788430961025487 | accuracy: 0.6783584431889517
[Validation] loss: 0.3968479046830079 | accuracy: 0.44795373665480426


  3%|▎         | 1/30 [09:36<4:38:47, 576.82s/it]

Model updated and saved in file: model/two_stream_cnn_multi_loss/two_stream_cnn_multi_loss
--------------------------------------------------------------------------------------------------------------
Starting epoch 2/30


  7%|▋         | 2/30 [18:40<4:24:30, 566.79s/it]

[Train] loss: 0.28337458681551375 | accuracy: 0.6321406151914627
[Validation] loss: 0.43667007319867823 | accuracy: 0.36610320284697506
Model not improved at epoch 2/30. Patience: 1/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 3/30


 10%|█         | 3/30 [27:35<4:10:50, 557.43s/it]

[Train] loss: 0.2702017653609549 | accuracy: 0.7161016949152542
[Validation] loss: 0.417505411490851 | accuracy: 0.4332740213523132
Model not improved at epoch 3/30. Patience: 2/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 4/30


 13%|█▎        | 4/30 [36:36<3:59:20, 552.35s/it]

[Train] loss: 0.28550542899444276 | accuracy: 0.689579409918393
[Validation] loss: 0.46263894873581746 | accuracy: 0.4359430604982206
Model not improved at epoch 4/30. Patience: 3/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 5/30
[Train] loss: 0.27254550895149127 | accuracy: 0.7187696170747018
[Validation] loss: 0.43909855393752506 | accuracy: 0.44973309608540923


 17%|█▋        | 5/30 [46:03<3:51:59, 556.78s/it]

Model updated and saved in file: model/two_stream_cnn_multi_loss/two_stream_cnn_multi_loss
--------------------------------------------------------------------------------------------------------------
Starting epoch 6/30


 20%|██        | 6/30 [55:11<3:41:37, 554.05s/it]

[Train] loss: 0.28126385108300594 | accuracy: 0.7239485247959824
[Validation] loss: 0.42033609737280847 | accuracy: 0.44395017793594305
Model not improved at epoch 6/30. Patience: 1/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 7/30


 23%|██▎       | 7/30 [1:04:13<3:31:05, 550.68s/it]

[Train] loss: 0.275807647026341 | accuracy: 0.7295197740112994
[Validation] loss: 0.42454573331778583 | accuracy: 0.4297153024911032
Model not improved at epoch 7/30. Patience: 2/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 8/30


 27%|██▋       | 8/30 [1:13:15<3:20:56, 548.02s/it]

[Train] loss: 0.28270302208728637 | accuracy: 0.7039391086001255
[Validation] loss: 0.4525430399752172 | accuracy: 0.4199288256227758
Model not improved at epoch 8/30. Patience: 3/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 9/30
[Train] loss: 0.26118825236656945 | accuracy: 0.7387005649717514
[Validation] loss: 0.42843583661042073 | accuracy: 0.4555160142348754


 30%|███       | 9/30 [1:22:46<3:14:09, 554.72s/it]

Model updated and saved in file: model/two_stream_cnn_multi_loss/two_stream_cnn_multi_loss
--------------------------------------------------------------------------------------------------------------
Starting epoch 10/30


 33%|███▎      | 10/30 [1:31:46<3:03:31, 550.58s/it]

[Train] loss: 0.2575963624266058 | accuracy: 0.7289704959196485
[Validation] loss: 0.4138512411999957 | accuracy: 0.4386120996441281
Model not improved at epoch 10/30. Patience: 1/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 11/30


 37%|███▋      | 11/30 [1:40:52<2:53:53, 549.13s/it]

[Train] loss: 0.24177933450173225 | accuracy: 0.7392498430634024
[Validation] loss: 0.4235157073604679 | accuracy: 0.4470640569395018
Model not improved at epoch 11/30. Patience: 2/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 12/30


 40%|████      | 12/30 [1:49:54<2:44:03, 546.85s/it]

[Train] loss: 0.24713817287449258 | accuracy: 0.71939736346516
[Validation] loss: 0.4542589709427857 | accuracy: 0.4181494661921708
Model not improved at epoch 12/30. Patience: 3/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 13/30


 43%|████▎     | 13/30 [1:59:00<2:34:51, 546.56s/it]

[Train] loss: 0.23675097464317804 | accuracy: 0.7376020087884495
[Validation] loss: 0.408826076899559 | accuracy: 0.45062277580071175
Model not improved at epoch 13/30. Patience: 4/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 14/30


 47%|████▋     | 14/30 [2:08:04<2:25:34, 545.88s/it]

[Train] loss: 0.23609283428033492 | accuracy: 0.7411330822347771
[Validation] loss: 0.4025343847444473 | accuracy: 0.4354982206405694
Model not improved at epoch 14/30. Patience: 5/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 15/30
[Train] loss: 0.2397894070572024 | accuracy: 0.723556183301946
[Validation] loss: 0.3837939893223637 | accuracy: 0.4372775800711744
Model not improved at epoch 15/30. Patience: 6/5
Max patience exceeded. Early stopping.


### Train all layers

In [17]:
patience = 0

# Train the entire model for a few more epochs, continuing with the *same* weights.
for epoch in tqdm(range(max_epochs2)):
    # Run an epoch over the training data.
    print('-'*110)
    print('Starting epoch {}/{}'.format(epoch+1, max_epochs2))
    # Here we initialize the iterator with the training set.
    # This means that we can go through an entire epoch until the iterator becomes empty.
    sess.run(dataset_init_op, feed_dict={image_paths: train_image_paths,
                                         ges_labels: train_ges_labels,
                                         obj_labels: train_obj_labels,
                                         is_training: True})
    while True:
        try:
            _ = sess.run(full_train_op, feed_dict={is_training: True})    
        except tf.errors.OutOfRangeError:
            break

    # Check performance every epoch
    train_loss, train_acc = evaluate(sess, loss, obj_correct_prediction, dataset_init_op,
                                     feed_dict={image_paths: train_image_paths,
                                                ges_labels: train_ges_labels,
                                                obj_labels: train_obj_labels,
                                                is_training: True})
    
    val_loss, val_acc = evaluate(sess, loss, obj_correct_prediction, dataset_init_op,
                                 feed_dict={image_paths: val_image_paths,
                                            ges_labels: val_ges_labels,
                                            obj_labels: val_obj_labels,
                                            is_training: False})
    
    print('[Train] loss: {} | accuracy: {}'.format(train_loss, train_acc))
    print('[Validation] loss: {} | accuracy: {}'.format(val_loss, val_acc))
    
    # Save checkpoint
    if val_acc > max_acc:
        patience = 0
        max_acc = val_acc
        save_path = saver.save(sess, MODEL_PATH)
        print("Model updated and saved in file: %s" % save_path)
    else:
        patience += 1
        print('Model not improved at epoch {}/{}. Patience: {}/{}'.format(epoch+1, max_epochs1, patience, max_patience))
    # Early stopping.
    if patience > max_patience:
        print('Max patience exceeded. Early stopping.')
        break

  0%|          | 0/30 [00:00<?, ?it/s]

--------------------------------------------------------------------------------------------------------------
Starting epoch 1/30


  3%|▎         | 1/30 [12:54<6:14:28, 774.79s/it]

[Train] loss: 0.23845204673543788 | accuracy: 0.732736974262398
[Validation] loss: 0.4419526137491138 | accuracy: 0.44261565836298933
Model not improved at epoch 1/30. Patience: 1/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 2/30
[Train] loss: 0.22054174677126825 | accuracy: 0.7616133082234777
[Validation] loss: 0.3758830139645478 | accuracy: 0.46307829181494664


  7%|▋         | 2/30 [26:19<6:05:41, 783.63s/it]

Model updated and saved in file: model/two_stream_cnn_multi_loss/two_stream_cnn_multi_loss
--------------------------------------------------------------------------------------------------------------
Starting epoch 3/30
[Train] loss: 0.21093015048164132 | accuracy: 0.7744036409290647
[Validation] loss: 0.3608174811902844 | accuracy: 0.4870996441281139


 10%|█         | 3/30 [39:48<5:56:07, 791.40s/it]

Model updated and saved in file: model/two_stream_cnn_multi_loss/two_stream_cnn_multi_loss
--------------------------------------------------------------------------------------------------------------
Starting epoch 4/30
[Train] loss: 0.20130126538399398 | accuracy: 0.7957470182046453
[Validation] loss: 0.36882651021896307 | accuracy: 0.4919928825622776


 13%|█▎        | 4/30 [53:17<5:45:09, 796.50s/it]

Model updated and saved in file: model/two_stream_cnn_multi_loss/two_stream_cnn_multi_loss
--------------------------------------------------------------------------------------------------------------
Starting epoch 5/30


 17%|█▋        | 5/30 [1:06:08<5:28:45, 789.01s/it]

[Train] loss: 0.19344496074070067 | accuracy: 0.8064187068424357
[Validation] loss: 0.4113207931408254 | accuracy: 0.46619217081850534
Model not improved at epoch 5/30. Patience: 1/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 6/30


 20%|██        | 6/30 [1:19:09<5:14:38, 786.60s/it]

[Train] loss: 0.19042110719062663 | accuracy: 0.8125392341494037
[Validation] loss: 0.36272104238275954 | accuracy: 0.4697508896797153
Model not improved at epoch 6/30. Patience: 2/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 7/30


 23%|██▎       | 7/30 [1:32:02<5:00:00, 782.61s/it]

[Train] loss: 0.18490752223259385 | accuracy: 0.8247802887633396
[Validation] loss: 0.3816025112871598 | accuracy: 0.4719750889679715
Model not improved at epoch 7/30. Patience: 3/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 8/30


 27%|██▋       | 8/30 [1:45:03<4:46:43, 781.98s/it]

[Train] loss: 0.1857325349936153 | accuracy: 0.8231324544883867
[Validation] loss: 0.36322162488601384 | accuracy: 0.49110320284697506
Model not improved at epoch 8/30. Patience: 4/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 9/30
[Train] loss: 0.17731385222973498 | accuracy: 0.8434557438794726
[Validation] loss: 0.4357446225300378 | accuracy: 0.5026690391459074


 30%|███       | 9/30 [1:58:32<4:36:34, 790.24s/it]

Model updated and saved in file: model/two_stream_cnn_multi_loss/two_stream_cnn_multi_loss
--------------------------------------------------------------------------------------------------------------
Starting epoch 10/30


 33%|███▎      | 10/30 [2:11:33<4:22:25, 787.28s/it]

[Train] loss: 0.17575880532472746 | accuracy: 0.8462806026365348
[Validation] loss: 0.37600329454683323 | accuracy: 0.5008896797153025
Model not improved at epoch 10/30. Patience: 1/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 11/30


 37%|███▋      | 11/30 [2:24:28<4:08:12, 783.83s/it]

[Train] loss: 0.17373526521022412 | accuracy: 0.8453389830508474
[Validation] loss: 0.42352240047420897 | accuracy: 0.4724199288256228
Model not improved at epoch 11/30. Patience: 2/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 12/30


 40%|████      | 12/30 [2:37:29<3:54:51, 782.84s/it]

[Train] loss: 0.17099376196802898 | accuracy: 0.8557752667922159
[Validation] loss: 0.40206928227719885 | accuracy: 0.4835409252669039
Model not improved at epoch 12/30. Patience: 3/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 13/30


 43%|████▎     | 13/30 [2:50:28<3:41:29, 781.74s/it]

[Train] loss: 0.1700003144880248 | accuracy: 0.8556183301946014
[Validation] loss: 0.45908505053282633 | accuracy: 0.4893238434163701
Model not improved at epoch 13/30. Patience: 4/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 14/30


 47%|████▋     | 14/30 [3:03:27<3:28:14, 780.93s/it]

[Train] loss: 0.16696886105576328 | accuracy: 0.8620527306967984
[Validation] loss: 0.3857013393339313 | accuracy: 0.498220640569395
Model not improved at epoch 14/30. Patience: 5/5
--------------------------------------------------------------------------------------------------------------
Starting epoch 15/30
[Train] loss: 0.16666279400173253 | accuracy: 0.8572661644695543
[Validation] loss: 0.4500828131874261 | accuracy: 0.48087188612099646
Model not improved at epoch 15/30. Patience: 6/5
Max patience exceeded. Early stopping.


### Testing

In [12]:
%%time

saver.restore(sess, MODEL_PATH)

test_loss, test_acc = evaluate(sess, loss, obj_correct_prediction, dataset_init_op,
                               feed_dict={image_paths: test_image_paths,
                                          ges_labels: test_ges_labels,
                                          obj_labels: test_obj_labels,
                                          is_training: False})

print('[Test] loss: {} | accuracy: {}'.format(test_loss, test_acc))

INFO:tensorflow:Restoring parameters from model/two_stream_cnn_multi_loss/two_stream_cnn_multi_loss
[Test] loss: 0.30849285870697174 | accuracy: 0.6404195366311835
CPU times: user 3min 8s, sys: 20.6 s, total: 3min 29s
Wall time: 3min 22s


### Save estimated probabilities of decisions (logits) to plot precision-recall curve

In [13]:
# Load testing dataset
print('Loading testing dataset...')
%time sess.run(dataset_init_op, feed_dict={ \
    image_paths: test_image_paths, \
    ges_labels: test_ges_labels, \
    obj_labels: test_obj_labels, \
    is_training: False \
})

logits_list = []
num_batches = len(test_image_paths) // batch_size + 1

# Evaluate on every batch.
print('Predicting on every batch...')
with tqdm(total=num_batches) as progress:
    while True:
        try:
            # Disable `is_training` since we have `dropout` in VGG net.
            _logits = sess.run(obj_logits, feed_dict={is_training: False})
            logits_list.append(_logits)
            progress.update(1)
        except tf.errors.OutOfRangeError:
            break
    logits_list = np.concatenate(logits_list)

  0%|          | 0/799 [00:00<?, ?it/s]

Loading testing dataset...
CPU times: user 16.7 ms, sys: 16 µs, total: 16.7 ms
Wall time: 14.7 ms
Predicting on every batch...


100%|██████████| 799/799 [03:10<00:00,  4.18it/s] 


In [14]:
with open('{}_logits.npy'.format(MODEL_PATH), 'wb') as file:
    np.save(file, logits_list)