# YOLO-based face detector

## Configuration

In [1]:
# --- Backbone -----------------------------------------------------------------
# Name of the module under `detector.models` that provides the backbone.
MODEL = 'resnet_v2_50'

# --- Schedule -----------------------------------------------------------------
TRAINING_STEPS = 300000
STEPS_PER_SUMMARY = 500
STEPS_PER_CHECKPOINT = 2500

# --- Input pipeline -----------------------------------------------------------
DATASET_N_WORKERS = 32            # parallel calls used by the dataset map stages
BATCH_SIZE = 48
BUFFER_SIZE = 10 * BATCH_SIZE     # shuffle buffer, in samples
INPUT_IMAGE_SIZE = [416, 416]
# One detection head per grid, listed from the finest to the coarsest grid.
OUTPUT_GRIDS = [[52, 52], [26, 26], [13, 13]]
EXCLUDE_CLASSES = True            # True forces N_CLASSES to 0 (detection only)
N_AUGMENTED = 9                   # augmented copies generated per source image

# --- Thresholds ---------------------------------------------------------------
CONFIDENCE_TRESH = 0.5            # cut-off used to binarise confidence maps
IOU_TRESH = 0.6                   # presumably the NMS overlap threshold - TODO confirm

# --- Optimisation -------------------------------------------------------------
DROPOUT_KEEP_PROB = 0.5
LEARNING_RATE = 1e-4
LEARNING_RATE_DECAY = 0.99
LEARNING_RATE_DECAY_STEPS = 2000
GRAD_CLIP_VALUE = 20              # passed as the gradient-norm clip; None disables it
SUMMARIZE_GRADIENTS = True

# --- Runtime / filesystem -----------------------------------------------------
GPU_MEMORY_FRACTION = 0.8
ENABLED_GPUS = [0]
TRAINING_DIR = './training'
ALLOW_RESTORING = True
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
DATASETS_PATH = '/home/facialrec/notebooks/VideointfacialRec/data/'

## Imports

In [2]:
import os

import imgaug
import importlib
import json
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import time
import random
import shutil
import warnings
from imageio import imread
from detector.utils.data import smart_resize, convert_sample_to_YOLO_preresized, convert_YOLO_result_to_normal, non_max_suppression, smart_resize
from detector.utils.plot import plot_confusion_matrix

# Resolve the backbone implementation by name: MODEL selects a module under `detector.models`.
_model_py_module = importlib.import_module('detector.models.%s' % MODEL)
# Suppress every Python warning from this point on (this also hides deprecation notices).
warnings.filterwarnings("ignore")

In [3]:
# Keep each backbone's checkpoints/summaries in its own sub-directory.
# The guard makes the cell idempotent: re-running it must not append MODEL a
# second time (e.g. './training/resnet_v2_50/resnet_v2_50').
# Caveat: a base directory whose last component already equals MODEL is left as is.
if os.path.basename(os.path.normpath(TRAINING_DIR)) != MODEL:
    TRAINING_DIR = os.path.join(TRAINING_DIR, MODEL)

## Training and Evaluation Data

### Dataset compilation

In [4]:
import detector.datasets
from detector.datasets import REGISTERED_CLASSES, reset_classes, get_class_id

# Fix Python's RNG seed before any loader is queried.
random.seed(42)

# Loader callables of the enabled datasets; the list index doubles as the dataset id.
# Disabled: detector.datasets.FaceScrub_get_loader(os.path.join(DATASETS_PATH, 'faceScrub'))
DATASETS = [
    detector.datasets.WIDER_get_loader(os.path.join(DATASETS_PATH, 'WIDER')),
]

reset_classes()

# dataset id -> annotation getter, looked up by the data pipeline for every sample.
DATASET_GETTERS = dict(
    (dataset_id, loader('annotation_getter')) for dataset_id, loader in enumerate(DATASETS)
)
for loader in DATASETS:
    loader('info')

# Class prediction is switched off entirely when EXCLUDE_CLASSES is set.
N_CLASSES = 0 if EXCLUDE_CLASSES else len(REGISTERED_CLASSES)

# Samples are (dataset id, entry) pairs; every dataset's training split is
# collected before any validation split is requested.
TRAIN_SAMPLES = []
for dataset_id, loader in enumerate(DATASETS):
    TRAIN_SAMPLES.extend((dataset_id, entry) for entry in loader('training'))

VALID_SAMPLES = []
for dataset_id, loader in enumerate(DATASETS):
    VALID_SAMPLES.extend((dataset_id, entry) for entry in loader('valid'))

print('The number of training samples:  ', len(TRAIN_SAMPLES))
print('The number of validation samples:', len(VALID_SAMPLES))
print('The number of classes:           ', N_CLASSES)

The number of training samples:   12880
The number of validation samples: 3226
The number of classes:            0


### Data pipeline

In [5]:
# Everything built inside this block is grouped under the 'dataset' name scope.
with tf.name_scope('dataset'):
    # Stochastic imgaug pipeline, applied in order:
    #   1. horizontal flip with probability 0.5,
    #   2. with probability 0.5, an affine warp (scale / translate / rotate / shear),
    #   3. between 0 and 3 of the listed blur / noise / contrast / brightness ops.
    _seq = imgaug.augmenters.Sequential([
        imgaug.augmenters.Fliplr(0.5),
        imgaug.augmenters.Sometimes(0.5,
            imgaug.augmenters.Affine(
                scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
                translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
                rotate=(-5, 5),
                shear=(-8, 8),
                order=[0, 1],
                cval=(0, 255),
                mode=imgaug.ALL
            )
        ),
        imgaug.augmenters.SomeOf((0, 3),
            [
                # Exactly one of the three blur variants.
                imgaug.augmenters.OneOf([
                    imgaug.augmenters.GaussianBlur((0, 1.0)),
                    imgaug.augmenters.AverageBlur(k=(2, 5)),
                    imgaug.augmenters.MedianBlur(k=(3, 7)),
                ]),
                imgaug.augmenters.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5),
                imgaug.augmenters.ContrastNormalization((0.75, 1.25)),
                imgaug.augmenters.Multiply((0.75, 1.25), per_channel=0.2),
                imgaug.augmenters.Add((-10, 10), per_channel=0.5)
            ]
        )
    ])
    # Deterministic copy of the pipeline.
    # NOTE(review): imgaug documents that a deterministic augmenter replays the same
    # sampled parameters on every call and recommends deriving a fresh one per batch;
    # this one is created a single time here.
    _seq_det = _seq.to_deterministic()
    
    def _map_fn(entry, dataset_id):
        """Turn one dataset entry into a stack of YOLO training samples.

        Called through ``tf.py_func``. Loads the image at ``entry``, fetches its
        annotations from ``DATASET_GETTERS[dataset_id]``, appends ``N_AUGMENTED``
        augmented copies to the original, then resizes every variant to
        ``INPUT_IMAGE_SIZE`` and encodes its boxes for each grid in ``OUTPUT_GRIDS``.

        Args:
            entry: Image path (``str`` or ``bytes``; bytes are decoded).
            dataset_id: Key into ``DATASET_GETTERS``.

        Returns:
            A tuple with one element per sample field (image, optional
            ``has_classes`` flag, then the per-grid targets). Each element is a
            tuple holding that field for the original image followed by its
            augmented copies. ``None`` is returned when the image layout is
            unsupported, the getter has no annotation, or a YOLO conversion
            yields ``None``.
            NOTE(review): a ``None`` result presumably makes ``tf.py_func`` fail,
            and the pipeline's ``ignore_errors()`` stage then drops the entry -
            confirm.

        Raises:
            Any exception from image decoding, the annotation getter or the
            helpers propagates unchanged.
        """
        if isinstance(entry, bytes):
            entry = entry.decode()

        # Normalise the image to H x W x 3.
        image = imread(entry)
        if image.ndim == 2:
            # Single-channel image: replicate it over three channels.
            image = image[..., np.newaxis]
            image = np.tile(image, [1, 1, 3])
        elif image.ndim == 3 and image.shape[-1] == 4:
            # Four channels: drop the last one.
            image = image[..., :-1]
        elif image.ndim != 3 or image.shape[-1] != 3:
            return None

        getter = DATASET_GETTERS[dataset_id]
        result = getter(entry)
        if result is None:
            return None

        # Boxes are unpacked below as (x, y, w, h).
        bboxes, classes = result
        if classes is None or EXCLUDE_CLASSES:
            has_classes = False
        else:
            has_classes = True
            classes = [get_class_id(c) for c in classes]

        # The un-augmented image is always the first variant.
        instances = [[image, has_classes, bboxes, classes]]

        bboxes_on_image = imgaug.BoundingBoxesOnImage(
            [imgaug.BoundingBox(x1=x, y1=y, x2=x+w, y2=y+h) for x, y, w, h in bboxes],
            shape=image.shape)

        # Derive a fresh deterministic augmenter for THIS entry. A deterministic
        # augmenter replays identical parameters on every call, which is what keeps
        # images and boxes aligned below; sharing one instance across all entries
        # would also give every image the same N_AUGMENTED transforms and share
        # mutable random state between parallel workers.
        seq_det = _seq.to_deterministic()
        images_aug = seq_det.augment_images([image for j in range(N_AUGMENTED)])
        bboxes_list_aug = seq_det.augment_bounding_boxes([bboxes_on_image for j in range(N_AUGMENTED)])

        for image_aug, bboxes_aug in zip(images_aug, bboxes_list_aug):
            # Clamp box corners to the image bounds ...
            bboxes_aug = [(min(max(b.x1, 0), image_aug.shape[1]), min(max(b.y1, 0), image_aug.shape[0]),
                           min(max(b.x2, 0), image_aug.shape[1]), min(max(b.y2, 0), image_aug.shape[0])) for b in bboxes_aug.bounding_boxes]

            # ... and convert them back to (x, y, w, h).
            bboxes_aug = [(x1, y1, x2-x1, y2-y1) for x1, y1, x2, y2 in bboxes_aug]

            instances.append([image_aug, has_classes, bboxes_aug, classes])

        samples = []
        for instance in instances:
            image, has_classes, bboxes, classes = instance

            result = smart_resize(image, INPUT_IMAGE_SIZE[0], INPUT_IMAGE_SIZE[1], ret_image_shifts=True)
            image, resize_params = result

            sample = [image]

            if N_CLASSES > 0:
                sample = sample + [np.array([has_classes])]

            for output_grid in OUTPUT_GRIDS:
                if has_classes:
                    result = convert_sample_to_YOLO_preresized(image, resize_params, output_grid, bboxes, classes)
                else:
                    result = convert_sample_to_YOLO_preresized(image, resize_params, output_grid, bboxes)

                if result is None:
                    # One failed conversion discards the entry with all its variants.
                    return None

                if has_classes:
                    conf_probs_map, bboxes_map, classes_map = result
                    sample = sample + [conf_probs_map, bboxes_map, classes_map]
                else:
                    conf_probs_map, bboxes_map = result
                    sample = sample + [conf_probs_map, bboxes_map]
                    if N_CLASSES > 0:
                        # Placeholder class map so every sample has the same fields.
                        sample = sample + [np.zeros_like(conf_probs_map, dtype=np.int32)]
            samples.append(sample)

        # Transpose: list of samples -> tuple of per-field tuples.
        return tuple(zip(*samples))

    #            image
    _dtypes = [tf.uint8]
    _shapes = [INPUT_IMAGE_SIZE+[3]]
    
    if N_CLASSES > 0:
        #                  has_classes
        _dtypes = _dtypes + [tf.bool]
        _shapes = _shapes + [[1]]
    
    for output_grid in OUTPUT_GRIDS:
        #                      obj conf |  bbox 
        _dtypes = _dtypes + [tf.float32, tf.float32]
        _shapes = _shapes + [output_grid+[1], output_grid+[4]]
    
        if N_CLASSES > 0:
            #                has_classes |  classes
            _dtypes = _dtypes + [tf.int32]
            _shapes = _shapes + [output_grid+[1]]

    dataset_samples_tf_phr = tf.placeholder(tf.string, name='samples')
    dataset_ids_tf_phr = tf.placeholder(tf.int32, name='ids')

    dataset = tf.data.Dataset().from_tensor_slices((dataset_samples_tf_phr, dataset_ids_tf_phr))
    dataset = dataset.map(lambda entry, dataset_id: tf.py_func(_map_fn, [entry, dataset_id], _dtypes), DATASET_N_WORKERS)
    dataset = dataset.apply(tf.contrib.data.ignore_errors())
#     dataset = dataset.cache()
    dataset = dataset.flat_map(lambda *samples: tf.data.Dataset.from_tensor_slices(samples))
    dataset = dataset.map(lambda *sample: tuple(tf.reshape(item, shape) for item, shape in zip(sample, _shapes)), DATASET_N_WORKERS)
    dataset = dataset.shuffle(buffer_size=BUFFER_SIZE)
    dataset = dataset.batch(batch_size=BATCH_SIZE)
    dataset = dataset.repeat()
    dataset = dataset.prefetch(buffer_size=1)

train_iterator = dataset.make_initializable_iterator('train')
valid_iterator = dataset.make_initializable_iterator('valid')

train_batch = train_iterator.get_next()
valid_batch = valid_iterator.get_next()

## Network

### Computational graph

In [6]:
# Channels predicted per grid cell: 1 objectness logit + 4 box values,
# plus class logits (a single logit when there are exactly two classes).
_out_channels = 1+4
if N_CLASSES > 0:
    if N_CLASSES == 2:
        _out_channels += 1
    else:
        _out_channels += N_CLASSES

# Width of the 1x1 / 3x3 embedding convolutions in front of every detection head.
_feature_embedd_channels = 256

with tf.name_scope('detector'):
    # Placeholders with defaults: dropout keep probability, training-mode flag,
    # and the selector of the input iterator.
    keep_prob = tf.placeholder_with_default(1., [], name='keep_prob')
    is_training_mode = tf.placeholder_with_default(False, [], name='is_training_mode')
    data_loader_mode = tf.placeholder_with_default('train-pipe', [], name='data_loader_mode')
    
    # Pick the batch from the training or the validation iterator at run time.
    _batch = tf.case([(tf.equal(data_loader_mode, 'train-pipe'), lambda: train_batch),
                      (tf.equal(data_loader_mode, 'valid-pipe'), lambda: valid_batch)],
                     exclusive=True)
    # Re-attach the per-sample static shapes (batch axis left dynamic).
    _batch = tuple(tf.reshape(_batch[i], [-1] + shape[1:].as_list()) for i, shape in enumerate(dataset.output_shapes))
    
    with tf.name_scope('inputs'):
        inputs_image = tf.identity(_batch[0], name='image')
        
    with tf.name_scope('targets'):
        # `counter` walks over the batch fields that follow the image.
        if N_CLASSES > 0:
            targets_has_classes = tf.identity(_batch[1], name='has_classes')
            counter = 2
        else:
            counter = 1

        # One tuple of target tensors per output grid, in OUTPUT_GRIDS order.
        targets_nodes = []
        for i, grid in enumerate(OUTPUT_GRIDS):
            with tf.name_scope('grid-%ix%i' % tuple(grid)):
                targets_conf_probs = tf.identity(_batch[counter], name='conf_probs')
                counter+=1
                
                targets_bboxes = tf.identity(_batch[counter], name='bboxes_xy')
                counter+=1
                
                if N_CLASSES > 0:
                    targets_classes = tf.identity(_batch[counter], name='classes')
                    counter+=1

                    targets_nodes.append((targets_conf_probs, targets_bboxes, targets_classes))
                else:
                    targets_nodes.append((targets_conf_probs, targets_bboxes))
        
    with tf.name_scope('model'):
        # The backbone returns one feature map per entry of OUTPUT_GRIDS.
        _nodes = _model_py_module.model(tf.cast(inputs_image, tf.float32), OUTPUT_GRIDS, is_training_mode, keep_prob)
        
        _output_nodes = []
        _prev_features_map = None
        # Walk the grids in reverse order so that each map can be merged with the
        # 2x-upsampled embedding of the grid processed just before it.
        for _net, output_grid in reversed(list(zip(_nodes, OUTPUT_GRIDS))):
            with tf.variable_scope('grid-%ix%i' % tuple(output_grid)):
                with slim.arg_scope([slim.conv2d], padding='SAME'):
                    _net = slim.conv2d(_net, _feature_embedd_channels, [1, 1])
                    
                    if _prev_features_map is not None:
                        _net = _net + tf.image.resize_nearest_neighbor(_prev_features_map, [_prev_features_map.shape[1]*2, _prev_features_map.shape[2]*2])
                        _net = slim.conv2d(_net, _feature_embedd_channels, [3, 3])
                    
                    _prev_features_map = _net
                    
                    # Linear prediction head; its weights start at zero.
                    _net = slim.conv2d(_net, _out_channels, [1, 1], activation_fn=None, weights_initializer=tf.zeros_initializer())
                    
                    _output_nodes.append(_net)

                # The head must match the target grid resolution exactly.
                assert _net.shape[1] == output_grid[0] and _net.shape[2] == output_grid[1], \
                    'Incorrect ouput grid shape: must be [%d, %d], but [%d, %d] found.' % tuple(output_grid + [_net.shape[1], _net.shape[2]])
        # Restore OUTPUT_GRIDS order.
        _output_nodes = list(reversed(_output_nodes))
    
    with tf.name_scope('outputs'):
        # One tuple of (logits, activations) groups per grid, in OUTPUT_GRIDS order.
        output_nodes = []
        for _net, grid in zip(_output_nodes, OUTPUT_GRIDS):
            with tf.name_scope('grid-%ix%i' % tuple(grid)):
                # Channel 0: objectness.
                outputs_conf_probs_logits = _net[:, :, :, 0:1]
                outputs_conf_probs = tf.identity(tf.nn.sigmoid(outputs_conf_probs_logits), name='conf_probs')

                # Channels 1..4: box centre (through a sigmoid) and log-size (through exp).
                outputs_bboxes = _net[:, :, :, 1:5]
                
                outputs_bboxes_xy = tf.sigmoid(outputs_bboxes[:, :, :, :2])
                outputs_bboxes_wh_log = outputs_bboxes[:, :, :, 2:]
                outputs_bboxes_wh = tf.exp(outputs_bboxes_wh_log)
                
                outputs_bboxes = tf.identity(tf.concat([outputs_bboxes_xy, outputs_bboxes_wh], axis=-1), name='bboxes')

                if N_CLASSES > 0:
                    # Remaining channels: class logits.
                    outputs_classes_logits = _net[:, :, :, 5:]
                    if N_CLASSES == 1 or N_CLASSES == 2:
                        # Single logit: sigmoid, class id obtained by thresholding at 0.5.
                        outputs_classes_probs = tf.sigmoid(outputs_classes_logits)

                        outputs_classes = tf.identity(tf.cast(outputs_classes_probs >= 0.5, tf.int32)[..., 0], name='classes')
                    else:
                        # Multi-class: softmax, class id obtained by argmax.
                        outputs_classes_probs = tf.nn.softmax(outputs_classes_logits, dim=-1)

                        outputs_classes = tf.identity(tf.argmax(outputs_classes_probs, axis=-1), name='classes')

                    output_nodes.append(((outputs_conf_probs_logits, outputs_conf_probs),
                                         (outputs_bboxes_wh_log, outputs_bboxes),
                                         (outputs_classes_logits, outputs_classes_probs, outputs_classes)))
                else:
                    output_nodes.append(((outputs_conf_probs_logits, outputs_conf_probs),
                                         (outputs_bboxes_wh_log, outputs_bboxes)))

### Pretrained parameters

In [7]:
# Fetch the backbone's pretrained weights and build the op that restores them.
# Best effort: on any failure the loader is set to None and the error is printed.
print("Getting model's pretrained weights ", end='', flush=True)
try:
    _model_py_module.get_weights()
    _restore_op = _model_py_module.get_restore_op()
except Exception as ex:
    model_initial_weights_loader = None
    print('[Failed]', flush=True)
    print(repr(ex))
else:
    model_initial_weights_loader = _restore_op
    print('[OK]', flush=True)

Getting model's pretrained weights [OK]


## Training

### Loss function

In [8]:
# Name scope shared by the loss, optimizer and metrics cells.
training_scope = tf.name_scope('training')
with training_scope:
    with tf.name_scope('losses'):
        # Every grid contributes its own confidence / xy / wh (/ class) terms to the
        # tf.losses collection; non-finite terms are replaced by 0 before being added.
        for output_grid, _targets_nodes, _outputs_nodes in zip(OUTPUT_GRIDS, targets_nodes, output_nodes):
            with tf.name_scope('grid-%ix%i' % tuple(output_grid)):
                if N_CLASSES > 0:
                    targets_conf_probs, targets_bboxes, targets_classes = _targets_nodes

                    ((outputs_conf_probs_logits, outputs_conf_probs),
                     (outputs_bboxes_wh_log, outputs_bboxes),
                     (outputs_classes_logits, outputs_classes_probs, outputs_classes)) = _outputs_nodes

                    _targets_classes = tf.cast(targets_classes, tf.int32)

                    _targets_classes = _targets_classes[..., 0]
                    _targets_classes_one_hot = tf.one_hot(_targets_classes, N_CLASSES)
                    if N_CLASSES == 2:
                        # Two classes are predicted with a single sigmoid output.
                        _targets_classes_probs = tf.cast(tf.expand_dims(_targets_classes, axis=-1), tf.float32)
                    else:
                        _targets_classes_probs = tf.cast(_targets_classes_one_hot, tf.float32)
                else:
                    targets_conf_probs, targets_bboxes = _targets_nodes
                    ((outputs_conf_probs_logits, outputs_conf_probs),
                     (outputs_bboxes_wh_log, outputs_bboxes)) = _outputs_nodes

                _targets_conf_probs = tf.cast(targets_conf_probs, tf.float32)
                _targets_bboxes = tf.cast(targets_bboxes, tf.float32)
                _targets_bboxes_xy = _targets_bboxes[..., :2]
                _targets_bboxes_wh = _targets_bboxes[..., 2:]

                _targets_conf_probs_bin = tf.cast(_targets_conf_probs >= CONFIDENCE_TRESH, tf.float32)
                _outputs_conf_probs_bin = tf.cast(outputs_conf_probs >= CONFIDENCE_TRESH, tf.float32)

                # Cells where BOTH the target and the prediction pass the confidence
                # threshold; only those cells contribute to the box (and class) terms.
                _detectors_mask = tf.equal(tf.minimum(_targets_conf_probs_bin, _outputs_conf_probs_bin), 1)
                _detectors_mask = _detectors_mask[..., 0]

                _targets_bboxes_xy_actual = tf.boolean_mask(_targets_bboxes_xy, _detectors_mask)
                _targets_bboxes_wh_actual = tf.boolean_mask(_targets_bboxes_wh, _detectors_mask)

                _outputs_bboxes_xy_actual = tf.boolean_mask(outputs_bboxes[..., :2], _detectors_mask)
                _outputs_bboxes_wh_log_actual = tf.boolean_mask(outputs_bboxes_wh_log, _detectors_mask)

                # conf_probs_loss
                # Cross-entropy with the object / no-object parts re-weighted by
                # (count of the larger group) / (count of the own group).

                _n_objects = tf.reduce_mean(tf.reduce_sum(_targets_conf_probs_bin, axis=[1, 2, 3]))
                _n_no_objects = tf.reduce_mean(tf.reduce_sum(1-_targets_conf_probs_bin, axis=[1, 2, 3]))

                _n_max_class = tf.reduce_max([_n_objects, _n_no_objects])

                _object_scale = (_n_max_class / _n_objects)
                _no_object_scale = (_n_max_class / _n_no_objects)

                _no_objects_loss = -(1 - _targets_conf_probs) * tf.log(tf.maximum(1-outputs_conf_probs, 1e-6))
                _no_objects_loss = tf.reduce_sum(_no_objects_loss, axis=[1, 2, 3])

                _objects_loss = -_targets_conf_probs * tf.log(tf.maximum(outputs_conf_probs, 1e-6))
                _objects_loss = tf.reduce_sum(_objects_loss, axis=[1, 2, 3])

                # Per-sample sum, normalised by the mean grid side length.
                conf_probs_loss = (_object_scale * _objects_loss + _no_object_scale * _no_objects_loss) / np.mean(output_grid)
                conf_probs_loss = tf.reduce_mean(conf_probs_loss)

                tf.losses.add_loss(tf.cond(tf.is_finite(conf_probs_loss), lambda: conf_probs_loss, lambda: tf.constant(0, tf.float32)))

                # coordinates_loss

                _coordinates_scale = 5

                xy_loss = _coordinates_scale * tf.losses.mean_squared_error(_targets_bboxes_xy_actual, _outputs_bboxes_xy_actual, loss_collection=None)
                tf.losses.add_loss(tf.cond(tf.is_finite(xy_loss), lambda: xy_loss, lambda: tf.constant(0, tf.float32)))

                # Sizes are regressed in log space.
                wh_loss = _coordinates_scale * tf.losses.mean_squared_error(tf.log(_targets_bboxes_wh_actual), _outputs_bboxes_wh_log_actual, loss_collection=None)
                tf.losses.add_loss(tf.cond(tf.is_finite(wh_loss), lambda: wh_loss, lambda: tf.constant(0, tf.float32)))

                # classes_loss

                if N_CLASSES > 0:
                    # Step 1: keep only the batch entries that carry class labels.
                    _has_classes_mask = targets_has_classes[:, 0]
                    _detectors_mask_actual = tf.boolean_mask(_detectors_mask, _has_classes_mask)
                    _targets_classes_probs_actual = tf.boolean_mask(_targets_classes_probs, _has_classes_mask)
                    outputs_classes_probs_actual = tf.boolean_mask(outputs_classes_probs, _has_classes_mask)
                    _targets_classes_one_hot_actual = tf.boolean_mask(_targets_classes_one_hot, _has_classes_mask)

                    # Step 2: within those entries keep only the active detector cells.
                    # The mask was already reduced in step 1, so it has to be applied to
                    # the equally reduced tensors; applying it to the full-batch tensors
                    # would pick rows belonging to other samples.
                    _targets_classes_probs_actual = tf.boolean_mask(_targets_classes_probs_actual, _detectors_mask_actual)
                    outputs_classes_probs_actual = tf.boolean_mask(outputs_classes_probs_actual, _detectors_mask_actual)

                    # Median-frequency balancing over the classes present in the selection.
                    _n_classes = tf.reduce_sum(tf.boolean_mask(_targets_classes_one_hot_actual, _detectors_mask_actual), axis=0, keepdims=True)
                    _n_median_class = tf.contrib.distributions.percentile(_n_classes, 50, axis=[-1], keep_dims=True)

                    _classes_scale = (_n_median_class / tf.maximum(_n_classes, 1e-6))

                    if N_CLASSES == 2: # bce
                        classes_loss = -(_classes_scale[..., 1:] * _targets_classes_probs_actual * tf.log(tf.maximum(outputs_classes_probs_actual, 1e-6)) +
                                         _classes_scale[..., 0:1] * (1-_targets_classes_probs_actual) * tf.log(tf.maximum(1-outputs_classes_probs_actual, 1e-6)))
                    else: # cce
                        classes_loss = -tf.reduce_sum(_classes_scale * _targets_classes_probs_actual * tf.log(tf.maximum(outputs_classes_probs_actual, 1e-6)), axis=-1)

                    classes_loss = tf.reduce_mean(classes_loss)

                    tf.losses.add_loss(tf.cond(tf.is_finite(classes_loss), lambda: classes_loss, lambda: tf.constant(0, tf.float32)))

        # `loss` leaves regularization losses out; `_train_loss` includes them.
        loss = tf.losses.get_total_loss(False)
        _train_loss = tf.losses.get_total_loss()

### Getting a training operation

In [9]:
with training_scope:
    # Step counter owned by this notebook, with an explicit increment op.
    step_var = tf.Variable(0, trainable=False)
    step_inc_op = step_var.assign(step_var + 1)

    _params = tf.trainable_variables()

    # Name fragments of variables that must not be trained.
    _excludes = _model_py_module.exclude_params()
    if _excludes:
        # Keep a variable only when NONE of the fragments occurs in its name.
        # (Testing `any(fragment not in name)` would keep a variable unless it
        # matched every fragment, so nothing is excluded once there are several.)
        _params = [param for param in _params
                   if not any(item in param.name for item in _excludes)]

    with tf.name_scope('optimizer'):
        with tf.name_scope('params'):
            lr_var = tf.Variable(LEARNING_RATE, trainable=False)

            # Optional smooth (non-staircase) exponential decay driven by step_var.
            if LEARNING_RATE_DECAY and LEARNING_RATE_DECAY_STEPS:
                lr_var = tf.train.exponential_decay(lr_var, step_var, LEARNING_RATE_DECAY_STEPS, LEARNING_RATE_DECAY, staircase=False)

        # Ops registered in the UPDATE_OPS collection are run with every training step.
        _update_ops = tf.get_collection(slim.ops.GraphKeys.UPDATE_OPS)
        _optimizer = tf.train.AdamOptimizer(lr_var)
        # A clip norm of 0 disables gradient clipping in slim.
        train_op = slim.learning.create_train_op(_train_loss, _optimizer,
                                                 clip_gradient_norm=(GRAD_CLIP_VALUE if GRAD_CLIP_VALUE is not None else 0),
                                                 update_ops=_update_ops,
                                                 variables_to_train=_params,
                                                 summarize_gradients=SUMMARIZE_GRADIENTS)
            

INFO:tensorflow:Var resnet_v2_50/block3/unit_6/bottleneck_v2/conv3/biases has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_1/bottleneck_v2/preact/gamma has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_1/bottleneck_v2/preact/beta has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_1/bottleneck_v2/shortcut/biases has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_1/bottleneck_v2/conv1/BatchNorm/gamma has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_1/bottleneck_v2/conv1/BatchNorm/beta has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_1/bottleneck_v2/conv2/BatchNorm/gamma has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_1/bottleneck_v2/conv2/BatchNorm/beta has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_1/bottleneck_v2/conv3/biases has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_2/bottleneck_v2/preact/gamma has no gradient
INFO:tensorflow:Var resnet_v2_50/block4/unit_2/bottleneck_v2/pre

### Accuracy metrics calculation

In [10]:
with training_scope:
    with tf.name_scope('metrics'):
        # Per-grid metric tensors, in OUTPUT_GRIDS order.
        conf_probs_accuracy = []
        conf_probs_precision = []
        conf_probs_recall = []
        conf_probs_f1_score = []

        bboxes_IoU = []

        if N_CLASSES > 0:
            classes_accuracy = []

            classes_confusion_matrix = []

            classes_confusion_matrix_img = []

            # Defined once and shared by the per-grid and the overall confusion-matrix images.
            def _plot_confusion_matrix_wrapper(confusion_matrix):
                """Render a confusion matrix with the registered class names (runs in tf.py_func)."""
                return plot_confusion_matrix(confusion_matrix, REGISTERED_CLASSES)

        for output_grid, _targets_nodes, _outputs_nodes in zip(OUTPUT_GRIDS, targets_nodes, output_nodes):
            with tf.name_scope('grid-%ix%i' % tuple(output_grid)):
                if N_CLASSES > 0:
                    targets_conf_probs, targets_bboxes, targets_classes = _targets_nodes

                    ((outputs_conf_probs_logits, outputs_conf_probs),
                     (outputs_bboxes_wh_log, outputs_bboxes),
                     (outputs_classes_logits, outputs_classes_probs, outputs_classes)) = _outputs_nodes

                    _targets_classes = tf.cast(targets_classes, tf.int32)

                    _targets_classes = _targets_classes[..., 0]
                else:
                    targets_conf_probs, targets_bboxes = _targets_nodes
                    ((outputs_conf_probs_logits, outputs_conf_probs),
                     (outputs_bboxes_wh_log, outputs_bboxes)) = _outputs_nodes

                _targets_conf_probs = tf.cast(targets_conf_probs, tf.float32)
                _targets_bboxes = tf.cast(targets_bboxes, tf.float32)
                _targets_bboxes_xy = _targets_bboxes[..., :2]
                _targets_bboxes_wh = _targets_bboxes[..., 2:]

                _targets_conf_probs_bin = tf.cast(_targets_conf_probs >= CONFIDENCE_TRESH, tf.float32)
                _outputs_conf_probs_bin = tf.cast(outputs_conf_probs >= CONFIDENCE_TRESH, tf.float32)

                # Cells where both the target and the prediction pass the confidence
                # threshold; box and class metrics are computed on these cells only.
                _detectors_mask = tf.equal(tf.minimum(_targets_conf_probs_bin, _outputs_conf_probs_bin), 1)
                _detectors_mask = _detectors_mask[..., 0]

                _targets_bboxes_xy_actual = tf.boolean_mask(_targets_bboxes_xy, _detectors_mask)
                _targets_bboxes_wh_actual = tf.boolean_mask(_targets_bboxes_wh, _detectors_mask)

                _outputs_bboxes_xy_actual = tf.boolean_mask(outputs_bboxes[..., :2], _detectors_mask)
                _outputs_bboxes_wh_actual = tf.boolean_mask(outputs_bboxes[..., 2:], _detectors_mask)

                # Confidence accuracy
                # The binarised maps are 0/1 floats, so min(a, b) acts as a logical AND.

                _targets_conf_probs_bin_flatten = tf.reshape(_targets_conf_probs_bin, [-1])
                outputs_conf_probs_bin_flatten = tf.reshape(_outputs_conf_probs_bin, [-1])

                _true_positives = tf.reduce_sum(tf.minimum(_targets_conf_probs_bin_flatten, outputs_conf_probs_bin_flatten))
                _true_negatives = tf.reduce_sum(tf.minimum(1-_targets_conf_probs_bin_flatten, 1-outputs_conf_probs_bin_flatten))
                _false_positives = tf.reduce_sum(tf.minimum(1-_targets_conf_probs_bin_flatten, outputs_conf_probs_bin_flatten))
                _false_negatives = tf.reduce_sum(tf.minimum(_targets_conf_probs_bin_flatten, 1-outputs_conf_probs_bin_flatten))

                # Denominators are floored at 1e-9 to avoid division by zero.
                conf_probs_accuracy.append(tf.reduce_mean((_true_positives+_true_negatives)/tf.maximum(_true_positives+_false_positives+_false_negatives+_true_negatives, 1e-9)))
                conf_probs_precision.append(tf.reduce_mean(_true_positives/tf.maximum(_true_positives+_false_positives, 1e-9)))
                conf_probs_recall.append(tf.reduce_mean(_true_positives/tf.maximum(_true_positives+_false_negatives, 1e-9)))
                conf_probs_f1_score.append(2*(conf_probs_precision[-1] * conf_probs_recall[-1]) / tf.maximum(conf_probs_precision[-1] + conf_probs_recall[-1], 1e-9))

                # IoU

                # intersection-over-union

                # correction of negative values of bboxes

                _targets_bboxes_xy_actual_corrected = tf.maximum(_targets_bboxes_xy_actual, 0)
                _targets_bboxes_wh_actual_corrected = tf.maximum(_targets_bboxes_wh_actual, 0)
                _outputs_bboxes_xy_actual_corrected = tf.maximum(_outputs_bboxes_xy_actual, 0)
                _outputs_bboxes_wh_actual_corrected = tf.maximum(_outputs_bboxes_wh_actual, 0)

                # (centre, size) -> (min corner, max corner)
                _targets_bboxes_wh_actual_half = _targets_bboxes_wh_actual_corrected / 2.
                _targets_mins  = _targets_bboxes_xy_actual_corrected - _targets_bboxes_wh_actual_half
                _targets_maxes = _targets_bboxes_xy_actual_corrected + _targets_bboxes_wh_actual_half

                _outputs_bboxes_wh_actual_half = _outputs_bboxes_wh_actual_corrected / 2.
                _outputs_mins  = _outputs_bboxes_xy_actual_corrected - _outputs_bboxes_wh_actual_half
                _outputs_maxes = _outputs_bboxes_xy_actual_corrected + _outputs_bboxes_wh_actual_half

                _intersect_mins = tf.maximum(_targets_mins, _outputs_mins)
                _intersect_maxes = tf.minimum(_targets_maxes, _outputs_maxes)
                _intersect_wh = tf.maximum(0., _intersect_maxes - _intersect_mins)

                _intersect_areas = _intersect_wh[..., 0] * _intersect_wh[..., 1]

                _targets_areas = _targets_bboxes_wh_actual_corrected[..., 0] * _targets_bboxes_wh_actual_corrected[..., 1]
                _outputs_areas = _outputs_bboxes_wh_actual_corrected[..., 0] * _outputs_bboxes_wh_actual_corrected[..., 1]

                _union_areas = _targets_areas + _outputs_areas - _intersect_areas

                _IoU_scores = tf.expand_dims(_intersect_areas / tf.maximum(_union_areas, 1e-6), axis=-1)
                _IoU_scores = tf.maximum(tf.minimum(_IoU_scores, 1), 0)

                # The mean over an empty selection is not finite; report 0 in that case.
                _bboxes_IoU = tf.reduce_mean(_IoU_scores)
                bboxes_IoU.append(tf.cond(tf.is_finite(_bboxes_IoU), lambda: _bboxes_IoU, lambda: tf.constant(0, tf.float32)))

                # Classes accuracy
                # NOTE(review): unlike the class loss, this selection is not restricted
                # to samples flagged by `targets_has_classes` - confirm that is intended.

                if N_CLASSES > 0:
                    _targets_classes_actual = tf.cast(tf.boolean_mask(_targets_classes, _detectors_mask), tf.int32)
                    _outputs_classes_actual = tf.cast(tf.boolean_mask(outputs_classes, _detectors_mask), tf.int32)

                    classes_accuracy.append(tf.reduce_mean(tf.cast(tf.equal(_targets_classes_actual, _outputs_classes_actual), tf.float32)))
                    classes_confusion_matrix.append(tf.confusion_matrix(_targets_classes_actual, _outputs_classes_actual, num_classes=N_CLASSES))

                    _img = tf.py_func(_plot_confusion_matrix_wrapper, [classes_confusion_matrix[-1]], tf.uint8)
                    classes_confusion_matrix_img.append(tf.expand_dims(_img, axis=0))

        # Averages over the grids.
        conf_probs_accuracy_mean = tf.reduce_mean(conf_probs_accuracy, axis=0)
        conf_probs_precision_mean = tf.reduce_mean(conf_probs_precision, axis=0)
        conf_probs_recall_mean = tf.reduce_mean(conf_probs_recall, axis=0)
        conf_probs_f1_score_mean = tf.reduce_mean(conf_probs_f1_score, axis=0)

        bboxes_IoU_mean = tf.reduce_mean(bboxes_IoU, axis=0)

        if N_CLASSES > 0:
            classes_accuracy_mean = tf.reduce_mean(classes_accuracy, axis=0)

            # Confusion matrices add up across grids.
            classes_confusion_matrix_overal = tf.reduce_sum(classes_confusion_matrix, axis=0)

            classes_confusion_matrix_img_overal = tf.py_func(_plot_confusion_matrix_wrapper, [classes_confusion_matrix_overal], tf.uint8)
            classes_confusion_matrix_img_overal = tf.expand_dims(classes_confusion_matrix_img_overal, axis=0)

### Training summary

In [11]:
# Every summary op created in this cell is collected in `_summaries`, so that
# `valid_summary_op` can merge exactly this list (see the end of the cell).

# --- Losses -----------------------------------------------------------------
_summaries = [
    tf.summary.scalar('loss/total', loss),
    tf.summary.scalar('loss/conf-probs', conf_probs_loss),
    tf.summary.scalar('loss/xy', xy_loss),
    tf.summary.scalar('loss/wh', wh_loss),
]

if N_CLASSES > 0:
    _summaries += [tf.summary.scalar('loss/classes', classes_loss)]

# --- Learning rate and grid-averaged confidence metrics -----------------------
_summaries += [
    tf.summary.scalar('learning-rate', lr_var),
    # (disabled) tf.summary.scalar('conf-probs-accuracy', conf_probs_accuracy),
    tf.summary.scalar('conf-probs/precision', conf_probs_precision_mean),
    tf.summary.scalar('conf-probs/recall', conf_probs_recall_mean),
    tf.summary.scalar('conf-probs/f1-score', conf_probs_f1_score_mean),
]

# --- Confidence metrics, one set of scalars per output grid -------------------
for output_grid, _precision, _recall, _f1_score in zip(OUTPUT_GRIDS, conf_probs_precision, conf_probs_recall, conf_probs_f1_score):
    _grid = tuple(output_grid)
    _summaries += [
        tf.summary.scalar('conf-probs/grid-%ix%i/precision' % _grid, _precision),
        tf.summary.scalar('conf-probs/grid-%ix%i/recall' % _grid, _recall),
        tf.summary.scalar('conf-probs/grid-%ix%i/f1-score' % _grid, _f1_score),
    ]

# --- Bounding-box IoU: averaged value first, then one scalar per grid ---------
_summaries += [tf.summary.scalar('bboxes/IoU', bboxes_IoU_mean)]

for output_grid, _iou in zip(OUTPUT_GRIDS, bboxes_IoU):
    _grid = tuple(output_grid)
    _summaries += [tf.summary.scalar('bboxes/grid-%ix%i/IoU' % _grid, _iou)]

# --- Class metrics (only when class prediction is enabled) --------------------
if N_CLASSES > 0:
    _summaries += [
        tf.summary.scalar('classes/accuracy', classes_accuracy_mean),
        tf.summary.image('classes/confusion-matrix', classes_confusion_matrix_img_overal),
    ]
    for output_grid, _accuracy, _matrix_img in zip(OUTPUT_GRIDS, classes_accuracy, classes_confusion_matrix_img):
        _grid = tuple(output_grid)
        _summaries += [
            tf.summary.scalar('classes/grid-%ix%i/accuracy' % _grid, _accuracy),
            tf.summary.image('classes/grid-%ix%i/confusion-matrix' % _grid, _matrix_img),
        ]

# --- Confidence maps as images: raw output, thresholded output, thresholded target
for output_grid, _grid_targets, _grid_outputs in zip(OUTPUT_GRIDS, targets_nodes, output_nodes):
    _grid = tuple(output_grid)

    _target_conf = tf.cast(_grid_targets[0], tf.float32)
    _target_conf_bin = tf.cast(_target_conf >= CONFIDENCE_TRESH, tf.float32)

    _output_conf = tf.cast(_grid_outputs[0][1], tf.float32)
    _output_conf_bin = tf.cast(_output_conf >= CONFIDENCE_TRESH, tf.float32)

    # Values are scaled by 255 and cast to uint8 before being logged as images.
    _summaries += [
        tf.summary.image('conf-probs/grid-%ix%i/map' % _grid, tf.cast(_output_conf * 255, tf.uint8)),
        tf.summary.image('conf-probs/grid-%ix%i/map-output' % _grid, tf.cast(_output_conf_bin * 255, tf.uint8)),
        tf.summary.image('conf-probs/grid-%ix%i/map-target' % _grid, tf.cast(_target_conf_bin * 255, tf.uint8)),
    ]

_summaries += [tf.summary.image('images', inputs_image)]

# The training op merges every summary registered in the graph (which may include
# summaries created in other cells), while the validation op merges only the
# summaries listed explicitly above.
train_summary_op = tf.summary.merge_all()
valid_summary_op = tf.summary.merge(_summaries)

### Training loop

In [12]:
# Saver covering all global variables of the graph; checkpoints are written
# under TRAINING_DIR using the 'model.ckpt' prefix.
saver = tf.train.Saver(var_list=tf.global_variables())
checkpoint_path = os.path.join(TRAINING_DIR, 'model.ckpt')

In [None]:
# Make sure the run directory exists before anything is read from or written to it.
os.makedirs(TRAINING_DIR, exist_ok=True)

gpu_options = tf.GPUOptions(allow_growth=False, per_process_gpu_memory_fraction=GPU_MEMORY_FRACTION)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    # --- Variable and input-pipeline initialisation ---------------------------
    try:
        print('Initializing parameters ', flush=True, end='')

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # Split the (dataset id, entry) pairs and feed them to both iterators.
        _train_ids, _train_entries = zip(*TRAIN_SAMPLES)
        _valid_ids, _valid_entries = zip(*VALID_SAMPLES)
        sess.run(train_iterator.initializer, {dataset_ids_tf_phr: _train_ids, dataset_samples_tf_phr: _train_entries})
        sess.run(valid_iterator.initializer, {dataset_ids_tf_phr: _valid_ids, dataset_samples_tf_phr: _valid_entries})

        print('[OK]', flush=True)
    except BaseException:
        # Only terminates the progress line; the error itself is re-raised unchanged.
        print('[Failed]', flush=True)
        raise

    if model_initial_weights_loader is not None:
        model_initial_weights_loader(sess)

    # --- Restore the latest checkpoint or start a fresh run -------------------
    ckpt = tf.train.get_checkpoint_state(TRAINING_DIR)
    if ALLOW_RESTORING and ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        # Fresh start: drop everything from previous runs, then recreate the
        # directory explicitly, because the checkpoint, classes.json and graph.pb
        # are all written into it right below.
        shutil.rmtree(TRAINING_DIR)
        os.makedirs(TRAINING_DIR)
        saver.save(sess, checkpoint_path)

    with open(os.path.join(TRAINING_DIR, 'classes.json'), 'w') as f:
        json.dump(REGISTERED_CLASSES, f)
    tf.train.write_graph(sess.graph_def, TRAINING_DIR, 'graph.pb', as_text=False)

    _train_summary_writer = tf.summary.FileWriter(os.path.join(TRAINING_DIR, 'summary', 'train'), sess.graph)
    _valid_summary_writer = tf.summary.FileWriter(os.path.join(TRAINING_DIR, 'summary', 'valid'), sess.graph)

    # Stays None until the step counter has been read, so the cleanup code below
    # never touches an unbound name.
    _step = None
    try:
        _step = sess.run(step_var)
        if _step == 0:
            # Baseline metrics before the first update; both pipes are evaluated
            # with `valid_summary_op` here.
            _train_loss, _train_summary = sess.run([loss, valid_summary_op], {data_loader_mode: 'train-pipe'})
            _valid_loss, _valid_summary = sess.run([loss, valid_summary_op], {data_loader_mode: 'valid-pipe'})
            _train_summary_writer.add_summary(_train_summary, _step)
            _valid_summary_writer.add_summary(_valid_summary, _step)

            print('Initial train loss = %.6f, valid loss = %.6f.' % (_train_loss, _valid_loss), flush=True)

        print('Start trainging.', flush=True)

        start = time.time()
        for _ in range(_step, TRAINING_STEPS):
            # The step counter is advanced before the update is applied.
            sess.run(step_inc_op)
            _step = sess.run(step_var)

            sess.run([train_op], {is_training_mode: True,
                                  data_loader_mode: 'train-pipe',
                                  keep_prob: DROPOUT_KEEP_PROB})

            if _step % STEPS_PER_SUMMARY == 0:
                _train_loss, _train_summary = sess.run([loss, train_summary_op], {data_loader_mode: 'train-pipe'})
                _valid_loss, _valid_summary = sess.run([loss, valid_summary_op], {data_loader_mode: 'valid-pipe'})
                _train_summary_writer.add_summary(_train_summary, _step)
                _valid_summary_writer.add_summary(_valid_summary, _step)

                # Wall-clock time since the previous report (or since training started).
                elapsed = time.time() - start
                start = time.time()
                print('Step #%i: train loss = %.6f, valid loss = %.6f, elapsed %.3f sec.' % (_step, _train_loss, _valid_loss, elapsed), flush=True)

            if _step % STEPS_PER_CHECKPOINT == 0:
                saver.save(sess, checkpoint_path, global_step=_step)

        print('Training process is finished.', flush=True)
    except Exception as ex:
        # Keep the exception around for inspection in a later cell, then propagate it.
        last_error = ex
        raise
    finally:
        try:
            # Always leave a checkpoint behind, also on interruption or failure.
            if _step is not None:
                saver.save(sess, checkpoint_path, global_step=_step)
            tf.train.write_graph(sess.graph_def, TRAINING_DIR, 'graph.pb', as_text=False)
        finally:
            # Flush pending events and release the event files.
            _train_summary_writer.close()
            _valid_summary_writer.close()

Initializing parameters [OK]
INFO:tensorflow:Restoring parameters from ./pretrained/resnet_v2_50/resnet_v2_50.ckpt
INFO:tensorflow:Restoring parameters from ./training/resnet_v2_50/model.ckpt-75000
Start trainging.
Step #75500: train loss = 33.575405, valid loss = 47.858009, elapsed 890.270 sec.
Step #76000: train loss = 38.721603, valid loss = 30.411934, elapsed 816.105 sec.
Step #76500: train loss = 28.355383, valid loss = 57.116844, elapsed 956.569 sec.
Step #77000: train loss = 17.806036, valid loss = 37.367558, elapsed 805.750 sec.
Step #77500: train loss = 25.175249, valid loss = 31.464621, elapsed 799.229 sec.
Step #78000: train loss = 27.120445, valid loss = 41.684307, elapsed 771.507 sec.
Step #78500: train loss = 33.074959, valid loss = 48.926884, elapsed 829.357 sec.
Step #79000: train loss = 30.642330, valid loss = 32.543900, elapsed 773.247 sec.
Step #79500: train loss = 19.716675, valid loss = 36.876942, elapsed 731.665 sec.
Step #80000: train loss = 36.518223, valid loss