In [1]:
import argparse
import os
import logging
import sys
import itertools

import torch
from torch.utils.data import DataLoader, ConcatDataset
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiStepLR
import torch.nn as nn
import torch
import numpy as np
from typing import List, Tuple
import torch.nn.functional as F
from torch.nn import Conv2d, Sequential, ModuleList, BatchNorm2d
from torchvision import transforms
import math
from PIL import Image
from collections import namedtuple
from torch.utils.data import Dataset, DataLoader

# VISION
from vision.datasets.voc_dataset import VOCDataset
from vision.utils.misc import str2bool, Timer, freeze_net_layers, store_labels
from vision.ssd.ssd import MatchPrior
from vision.nn.multibox_loss import MultiboxLoss
from vision.ssd.config import mobilenetv1_ssd_config
from vision.ssd.data_preprocessing import TrainAugmentation, TestTransform

# dataset dependencies
from PIL import Image
from torchvision import transforms, utils
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

%tb

No traceback available to show.


In [2]:

def SeperableConv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0):
    return Sequential(
        Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
               groups=in_channels, stride=stride, padding=padding),
        BatchNorm2d(in_channels),
        nn.ReLU6(),
        Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1),
    )

def conv_bn(inp, oup, stride, use_batch_norm=True):
    if use_batch_norm:
        return nn.Sequential(
            nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
            nn.BatchNorm2d(oup),
            nn.ReLU6(inplace=True)
        )
    else:
        return nn.Sequential(
            nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
            nn.ReLU6(inplace=True)
        )


def conv_1x1_bn(inp, oup, use_batch_norm=True):
    if use_batch_norm:
        return nn.Sequential(
            nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
            nn.ReLU6(inplace=True)
        )
    else:
        return nn.Sequential(
            nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
            nn.ReLU6(inplace=True)
        )

In [3]:
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, use_batch_norm=True):
        super(InvertedResidual, self).__init__()

        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            if use_batch_norm:
                self.conv = nn.Sequential(
                    # dw
                    nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                    nn.BatchNorm2d(hidden_dim),
                    nn.ReLU6(inplace=True),
                    # pw-linear
                    nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                    nn.BatchNorm2d(oup),
                )
            else:
                self.conv = nn.Sequential(
                    # dw
                    nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                    nn.ReLU6(inplace=True),
                    # pw-linear
                    nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                )
        else:
            if use_batch_norm:
                self.conv = nn.Sequential(
                    # pw
                    nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                    nn.BatchNorm2d(hidden_dim),
                    nn.ReLU6(inplace=True),
                    # dw
                    nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                    nn.BatchNorm2d(hidden_dim),
                    nn.ReLU6(inplace=True),
                    # pw-linear
                    nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                    nn.BatchNorm2d(oup),
                )
            else:
                self.conv = nn.Sequential(
                    # pw
                    nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                    nn.ReLU6(inplace=True),
                    # dw
                    nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                    nn.ReLU6(inplace=True),
                    # pw-linear
                    nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


In [4]:
class MobileNetV2(nn.Module):
    def __init__(self, n_class=1000, input_size=224, width_mult=1., dropout_ratio=0.2,
                 use_batch_norm=True):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        interverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # building first layer
        assert input_size % 32 == 0
        input_channel = int(input_channel * width_mult)
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        self.features = [conv_bn(3, input_channel, 2)]
        # building inverted residual blocks
        for t, c, n, s in interverted_residual_setting:
            output_channel = int(c * width_mult)
            for i in range(n):
                if i == 0:
                    self.features.append(block(input_channel, output_channel, s,
                                               expand_ratio=t, use_batch_norm=use_batch_norm))
                else:
                    self.features.append(block(input_channel, output_channel, 1,
                                               expand_ratio=t, use_batch_norm=use_batch_norm))
                input_channel = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(input_channel, self.last_channel,
                                         use_batch_norm=use_batch_norm))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_ratio),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = x.mean(3).mean(2)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()


In [5]:
class JointLoss(nn.Module):
    def __init__(self, device, image_size = 300, anchors_dims=[
                                                             [1/19, 1/19],
                                                             [0.1, 0.1],
                                                             [0.2, 0.2],
                                                             [1/3, 1/3],
                                                             [0.5, 0.5],
                                                             [1.0, 1.0]
                                                                ],
                 lambda_noobj=0.5, lambda_coor=5, lambda_landmarks=2):
        """

        :param anchors_dims: (list) of size num_of_anchor_box
                                     containing [width, height]  anchors
        :param lambda_noobj: from YOLO
        :param lambda_coor: from YOLO
        """
        super(JointLoss, self).__init__()
        self.device = device
        self.anchors_dims = anchors_dims
        self.mse_loss = nn.MSELoss(reduction='mean')
        self.smooth_l1_loss = nn.SmoothL1Loss(reduction='mean')

        self.lambda_noobj = lambda_noobj
        self.lambda_coor = lambda_coor
        self.image_size = image_size
        self.lambda_landmarks = lambda_landmarks

    def forward(self, prediction, target):
        """
        Joint loss for landmarks(NME) and bounding boxes(localization loss, confidence loss)

        :param prediction: (tuple) containing predictions for bounding boxes and landmarks
        bounding boxes prediction:  torch.size(batch_size, grid_size, grid_size, num_anchor_box, 4+1)
        landmarks prediction: torch.size(batch_size, num_anchor_box, 68, 2 or 3)

        :param target: (tuple) containing target values  for bounding boxes and landmarks
        bounding boxes targets: torch.size(batch_size, max_num_of_faces, 4+1)
        landmarks targets: torch.size(batch_size, max_num_of_faces, 68, 2 or 3)

        :return: normalized mean error(float), localization loss(float), confidence loss(float),
        best predicted bbox matched with gt (torch.tensor)
        
        torch.Size([1, 3000, 136])  landmarks prediction: torch.size(batch_size, grid_size, grid_size, num_anchor_box, 68, 2 or 3)
        torch.Size([1, 68, 2])  landmarks target SHAPE must be torch.size(batch_size, max_num_of_faces, 68, 2 or 3)
        """
        bbox_prediction, landmarks_prediction = prediction
        bbox_prediction, landmarks_prediction = bbox_prediction.to(self.device), landmarks_prediction.to(self.device)
        gt_boxes, gt_conf, obj_mask, noobj_mask, gt_landmarks = self.get_mask(prediction, target)
        # choose only landmarks which corresponds to cells with face in it
        landmarks_pred = landmarks_prediction[obj_mask]
        gt_landmarks = gt_landmarks[obj_mask]
        # calculate Normalized Mean Error
        nme = self.lambda_landmarks * self.nme(gt_landmarks, landmarks_pred, gt_boxes[:, :, :, :, 2:4][obj_mask])
        # calculate localization error
        bbox_pred = bbox_prediction[:, :, :, :, :4][obj_mask].to(self.device)
        gt_boxes = gt_boxes[obj_mask]
        loc_loss = self.lambda_coor * self.smooth_l1_loss(bbox_pred, gt_boxes)
        # calculate confidence loss
        # get conf mask where gt and where there is no gt
        conf_pred = bbox_prediction[:, :, :, :, 4]  # torch.Size([batch_size, grid_size, grid_size, num_of_anchors])
        conf_loss = self.lambda_noobj * self.mse_loss(conf_pred[noobj_mask],
                                                      gt_conf[noobj_mask]) + self.mse_loss(
            conf_pred[obj_mask], gt_conf[obj_mask])

        return nme, loc_loss, conf_loss, self.non_maximum_suppression(bbox_prediction, landmarks_prediction)

    def get_mask(self, prediction, target):
        """
        Tool for calculating the loss
        Calculates masks (filter then used as indexes) for prediction,
         build target values with the same shape as prediction
         and best predicted bboxes
        :param prediction: (tuple) the same as in forward method
        :param target: (tuple) the same as in forward method
        :return: gt_boxes: torch.size(batch_size, grid_size, grid_size, num_anchors, 4),
                 gt_conf: torch.size(batch_size, grid_size, grid_size, num_anchors),
                 mask: torch.size(batch_size, grid_size, grid_size, num_anchors),
                 conf_mask: torch.size(batch_size, grid_size, grid_size, num_anchors),
                 gt_landmarks: torch.size(batch_size, grid_size, grid_size, num_anchors, 68, 2 or 3)
        """
        """
        ground truth bbox - [c_x, c_y, w, h] all values in [0,1] w.r.t the whole image 
        anchor boxes - [0.5, 0.5, w, h] 0.5 w.r.t cell ; w,h in [0,1] w.r.t the whole image
                        store all values w.r.t the whole image 
                        (transform 0.5 to value in [0,1] w.r.t. the whole image in the 'self._get_anchor_boxes)
        predicted bbox - [ln(c_x), ln(c_y), ln(w), ln(h)] c_x, c_y w.r.t. cell w, h w.r.t. image

        1) IoU between gt and anchor, gt -> anchor form: 
            [c_x, c_y, w, h] -> [c_x/9 - int(c_x/9), c_y - int(c_y/9), w, h]  
      
        """
        bbox_target, landmarks_target = target
        bbox_prediction, landmarks_prediction = prediction

        batch_size, a, num_anchors = bbox_prediction.size(0), bbox_prediction.size(1), bbox_prediction.size(2)

        # to mark bboxes with high IoU between predicted and target
        # in other words, mark (with 1) bbox with the face in it w.r.t. ground truth(gt)
        obj_mask = torch.zeros(batch_size,a, num_anchors)
        noobj_mask = torch.ones(batch_size,a, num_anchors)
        # to store ground truth confidence scores and box coordinates
        gt_conf = torch.zeros(batch_size,a, num_anchors)
        gt_boxes = torch.zeros(batch_size, a, num_anchors, 4)
        gt_landmarks = torch.zeros(batch_size, a, num_anchors, 68 * 2)

        for batch_idx in range(batch_size):
            for target_idx in range(bbox_target.shape[1]):
                # there is no target, continue
                if bbox_target[batch_idx, target_idx].sum() == 0:
                    continue

                # get ground truth box coordinates
                gt_x = bbox_target[batch_idx, target_idx, 0]
                gt_y = bbox_target[batch_idx, target_idx, 1]
                gt_w = bbox_target[batch_idx, target_idx, 2]
                gt_h = bbox_target[batch_idx, target_idx, 3]

                # get grid box indices of ground truth box
                # coordinates gt_x*grid_size and gt_y*grid_size w.r.t. cell size (one cell 1x1)
                gt_i = int(gt_x * grid_size)
                gt_j = int(gt_y * grid_size)
                gt_box = torch.tensor([gt_x, gt_y, gt_w, gt_h]).unsqueeze(0).to(
                    self.device)  # torch.size(0,4)
                # get anchor box that has the highest iou with ground truth
                anchor_boxes = self._get_anchor_boxes(gt_i, gt_j, grid_size)
                anchor_iou = self._get_iou(gt_box, anchor_boxes)
                # best matching anchor box
                best_anchor_idx = torch.argmax(anchor_iou)

                # mark best predicted box
                obj_mask[batch_idx, gt_j, gt_i, best_anchor_idx] = 1
                noobj_mask[batch_idx, gt_j, gt_i, best_anchor_idx] = 0

                gt_conf[batch_idx, gt_j, gt_i, best_anchor_idx] = 1
                gt_boxes[batch_idx, gt_j, gt_i, best_anchor_idx] = torch.log1p(gt_box)
                gt_landmarks[batch_idx, gt_j, gt_i, best_anchor_idx] = landmarks_target[batch_idx, target_idx].view(
                    68 * 2)

        obj_mask = obj_mask.byte()  # to use then as indexes of tensor
        noobj_mask = noobj_mask.byte()  # to use then as indexes of tensor

        return gt_boxes.to(self.device), gt_conf.to(self.device), obj_mask, noobj_mask, gt_landmarks.to(
            self.device)

    def nme(self, gt_landmarks, pred_landmarks, boxes_shapes):
        """
        Normalized mean error (NME) defined as the Euclidean distance
        between the predicted and ground truth 2D landmarks averaged over
        68 landmarks and normalized by the bounding box dimensions

        :param gt_landmarks: torch.size(batch_size, 68*2)
        :param pred_landmarks: torch.size(batch_size, 68*2)
        :param boxes_shapes: [[width, height], ...]
        :return: (float)
        """
        nme = 0.0
        batch_size = gt_landmarks.shape[0]
        gt_landmarks = gt_landmarks.view(batch_size, 68, 2)
        pred_landmarks = pred_landmarks.view(batch_size, 68, 2)
        for batch_idx in range(batch_size):
            sum = 0
            for i in range(68):
                euclidean_dist = torch.dist(gt_landmarks[batch_idx, i], pred_landmarks[batch_idx, i], 2)
                sum += euclidean_dist
            normalization_factor = math.sqrt(
                boxes_shapes[batch_idx][0] * boxes_shapes[batch_idx][1])
            nme += sum / (normalization_factor * 68 * batch_size)
        return nme

    def _get_iou(self, box1, box2):
        """
        Calculates IoU for two tensors of bboxes
        :param box1: torch.size(num_of_boxes_1, 4)
        :param box2: torch.size(num_of_boxes_2, 4)
        :return: torch.size(max(num_of_boxes_1, num_of_boxes_2), 4)
        """

        b1 = self._format_bbox(box1)
        b2 = self._format_bbox(box2)
        b1_x1, b1_x2, b1_y1, b1_y2 = b1[:, 0], b1[:, 1], b1[:, 2], b1[:, 3]
        b2_x1, b2_x2, b2_y1, b2_y2 = b2[:, 0], b2[:, 1], b2[:, 2], b2[:, 3]

        intersect_x1 = torch.max(b1_x1, b2_x1)
        intersect_y1 = torch.max(b1_y1, b2_y1)
        intersect_x2 = torch.min(b1_x2, b2_x2)
        intersect_y2 = torch.min(b1_y2, b2_y2)

        intersect_area = (intersect_x2 - intersect_x1 + 1) * (intersect_y2 - intersect_y1 + 1)

        # union area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = intersect_area / (b1_area + b2_area - intersect_area + 1e-16)
        return iou

    def _get_anchor_boxes(self, g_i, g_j, grid_size, center_x=0.5, center_y=0.5):
        """
        Creates list of anchor boxes with given dimensions (height, width) w.r.t image size
        anchor box =  [center_x, center_y, width, height] w.r.t. the whole image

        :param center_x: x coordinate of the center of the anchor box w.r.t. cell
        :param center_y: y coordinate of the center of the anchor box w.r.t. cell
        :return: (tensor) torch.size(len(of anchor_aspect_ratios), 4)
        """

        anchors = []
        center_x = (center_x + g_i) / grid_size
        center_y = (center_y + g_j) / grid_size
        for dims in self.anchors_dims:
            anchors.append([center_x, center_y, dims[0], dims[1]])
        return torch.tensor(anchors).to(self.device)

    def _format_bbox(self, box):
        """
        Convert [[c_x, c_y, w, h], ...] to [[x_top_left_0, y_top_left_0, x_bottom_right_0, y_bottom_right_0], ...]
        :param box: (torch.tensor) [[c_x, c_y, w, h], ...]
        :return: (torch.tensor) [[x_top_left_0, y_top_left_0, x_bottom_right_0, y_bottom_right_0], ...]
        """

        x1, x2 = (box[:, 0] - box[:, 2] / 2).unsqueeze(0), (
                box[:, 0] + box[:, 2] / 2).unsqueeze(0)
        y1, y2 = (box[:, 1] - box[:, 3] / 2).unsqueeze(0), (
                box[:, 1] + box[:, 3] / 2).unsqueeze(0)
        return torch.cat((torch.t(x1 * self.image_size), torch.t(x2 * self.image_size), torch.t(y1 * self.image_size),
                          torch.t(y2 * self.image_size)), 1)

    def non_maximum_suppression(self, bbox_prediction, landmarks_prediction, conf_thresh=0.5, iou_thresh=0.5):
        batch_size, grid_size, num_anchors = bbox_prediction.size(0), bbox_prediction.size(1), bbox_prediction.size(3)

        bbox_prediction = bbox_prediction.view(batch_size, grid_size * grid_size * num_anchors, 4 + 1)
        bbox_prediction = torch.cat((torch.expm1(bbox_prediction[:, :, :4]), bbox_prediction[:, :, 4:]), dim=2)

        landmarks_prediction = landmarks_prediction.view(batch_size, grid_size * grid_size * num_anchors, 68 * 2)

        conf_mask = (bbox_prediction[:, :, 4] > conf_thresh).float().unsqueeze(2)
        bbox_pred = bbox_prediction * conf_mask
        landmarks_pred = landmarks_prediction * conf_mask

        bbox = []
        landmarks = []
        for i in range(batch_size):
            image_bbox = bbox_pred[i]
            image_landmarks = landmarks_pred[i]
            max_conf_idx = torch.argmax(image_bbox[:, 4])
            ious = self._get_iou(image_bbox[max_conf_idx].unsqueeze(0), image_bbox)
            mask = ious < iou_thresh
            bbox.append(image_bbox[mask.byte()])
            bbox.append(image_bbox[max_conf_idx].unsqueeze(0))
            landmarks.append(image_landmarks[mask.byte()])
            landmarks.append(image_landmarks[max_conf_idx].unsqueeze(0))
        bbox = torch.cat(bbox, dim=0)
        bbox = torch.cat((self._format_bbox(bbox), bbox[:, 4:]), dim=1)
        landmarks = torch.cat(landmarks, dim=0)
        return bbox, landmarks.view(-1, 68, 2)

In [133]:
class CustomSSD(nn.Module):
    def __init__(self, num_classes=1, num_points=68, width_mult=1.0, use_batch_norm=True):
        super(CustomSSD, self).__init__()
        self.base = MobileNetV2().features
        self.source_layer_indexes = [GraphPath(14, 'conv', 3),19,]
        
        #change convolutions for inverted residual to change number of operations 
        self.extras = ModuleList([
            InvertedResidual(1280, 512, stride=2, expand_ratio=0.2),
            InvertedResidual(512, 256, stride=2, expand_ratio=0.25),
            InvertedResidual(256, 256, stride=2, expand_ratio=0.5),
            Conv2d(256, 64, kernel_size=2, stride=1, padding=0)
        ])
        
        self.regression_headers = ModuleList([
            SeperableConv2d(in_channels=round(576 * width_mult), out_channels=6 * 4,
                            kernel_size=3, padding=1),
            SeperableConv2d(in_channels=1280, out_channels=6 * 4, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1),
            Conv2d(in_channels=64, out_channels=6 * 4, kernel_size=1),
        ])
        
        self.classification_headers = ModuleList([
            SeperableConv2d(in_channels=round(576 * width_mult), out_channels=6 * num_classes, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=1280, out_channels=6 * num_classes, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1),
            Conv2d(in_channels=64, out_channels=6 * num_classes, kernel_size=1),
        ])
        
        self.landmarks_headers = ModuleList([
            SeperableConv2d(in_channels=round(576 * width_mult), out_channels=6 * num_points * 2, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=1280, out_channels=6 * num_points * 2, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=512, out_channels=6 * num_points * 2, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=256, out_channels=6 * num_points * 2, kernel_size=3, padding=1),
            SeperableConv2d(in_channels=256, out_channels=6 * num_points * 2, kernel_size=3, padding=1),
            Conv2d(in_channels=64, out_channels=6 * num_points * 2, kernel_size=1),
        ])
        
        self.source_layer_add_ons = nn.ModuleList([t[1] for t in self.source_layer_indexes
                                                   if isinstance(t, tuple) and not isinstance(t, GraphPath)])
        
        self.num_classes = num_classes
        self.num_points = num_points
        
    
    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        confidences = []
        locations = []
        landmarks = []
        start_layer_index = 0
        header_index = 0
        for end_layer_index in self.source_layer_indexes:
            if isinstance(end_layer_index, GraphPath):
                path = end_layer_index
                end_layer_index = end_layer_index.s0
                added_layer = None
            elif isinstance(end_layer_index, tuple):
                added_layer = end_layer_index[1]
                end_layer_index = end_layer_index[0]
                path = None
            else:
                added_layer = None
                path = None
            for layer in self.base[start_layer_index: end_layer_index]:
                x = layer(x)
            if added_layer:
                y = added_layer(x)
            else:
                y = x
            if path:
                sub = getattr(self.base[end_layer_index], path.name)
                for layer in sub[:path.s1]:
                    x = layer(x)
                y = x
                for layer in sub[path.s1:]:
                    x = layer(x)
                end_layer_index += 1
            start_layer_index = end_layer_index
            confidence, location, landmark = self.compute_header(header_index, y)
            header_index += 1
            confidences.append(confidence)
            locations.append(location)
            landmarks.append(landmark)

        for layer in self.base[end_layer_index:]:
            x = layer(x)

        for layer in self.extras:
            x = layer(x)
            confidence, location, landmark = self.compute_header(header_index, x)
            header_index += 1
            confidences.append(confidence)
            locations.append(location)
            landmarks.append(landmark)

        confidences = torch.cat(confidences, 1)
        locations = torch.cat(locations, 1)
        landmarks = torch.cat(landmarks, 1)
        return confidences, locations, landmarks

    def compute_header(self, i, x):
        confidence = self.classification_headers[i](x)
        confidence = confidence.permute(0, 2, 3, 1).contiguous()
        confidence = confidence.view(confidence.size(0), -1, self.num_classes)

        location = self.regression_headers[i](x)
        location = location.permute(0, 2, 3, 1).contiguous()
        location = location.view(location.size(0), -1, 4)
        
        landmarks = self.landmarks_headers[i](x)
        landmarks = landmarks.permute(0, 2, 3, 1).contiguous()
        landmarks = landmarks.view(location.size(0), -1, self.num_points * 2)

        return confidence, location, landmarks

    def load(self, model):
        self.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage))

    def save(self, model_path):
        torch.save(self.state_dict(), model_path)

        

In [134]:
from PIL import Image
from torchvision import transforms, utils
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

def read_and_resize(filename):
    img = Image.open(filename)
    img = img.resize((300, 300))
    transform = transforms.Compose([            
     transforms.Resize(300),                    
     transforms.ToTensor(),   
     transforms.Normalize(mean=(0.5,), std=(0.5,))
    ])
    img_t = transform(img)
    return img_t

def get_bbox(landmarks):
    """
    Finds bounding box coordinates in format [xmin, ymin, width, height] from landmarks
    :param landmarks: torch.size(68,2)
    :return: torch.size(4)
    """

    x = landmarks[:, 0]
    y = landmarks[:, 1]

    xmin, xmax = min(x), max(x)
    ymin, ymax = min(y), max(y)

    width = xmax - xmin
    height = ymax - ymin
    return [xmin, ymin, width, height]

class FaceLandmarksDataset(Dataset):
    """Face dataset."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image_name = self.dataset['file_name'][idx]
        image = read_and_resize(image_name)
        coordinates_int = []
        for i in range(0, len(self.dataset.iloc[idx][1:])):
            if i % 2 == 0:
                (x, y) = ((int) (self.dataset.iloc[idx][1:][i]), (int) (self.dataset.iloc[idx][1:][i + 1]))
                coordinates_int.append((x,y))
        
        xmin, ymin, width, height = get_bbox(landmarks = torch.tensor(coordinates_int))
                                       
        normalized_points = []
        for (x,y) in coordinates_int:
            normalized_points.append(((x - xmin.double())/width.double(), (y - ymin.double())/height.double()))
       
        return (image, [(torch.tensor([xmin, ymin, width, height]), torch.tensor(normalized_points))])


In [232]:

# for i in range(1):
#     batch = next(iter(train_loader))
#     print(len(batch))
#     print(batch[0].shape)   #INPUT image
#     print(batch[1][0][0].shape) # Boxes 
#     print(batch[1][0][1].shape) # Coordintes of landmarks
#     condifence, locations,landmarks = net(batch[0])

# images, boxes, labels = data
# images = images.to(device)
# boxes = boxes.to(device)
# labels = labels.to(device)
def train(loader, net, criterion, optimizer, device, debug_steps=100, epoch=-1):
    net.train(True)
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
            
    for i, data in enumerate(loader):
        images, gt_boxes = data
        print(len(gt_boxes), type(gt_boxes[0]), " Boxes len")
        images = images.to(device)
        gt_locations = gt_boxes[0][0].to(device)
        gt_landmarks = gt_boxes[0][1].to(device)
        
        optimizer.zero_grad()
        confidence, locations, landmarks_prediction = net(images)
        """
            Joint loss for landmarks(NME) and bounding boxes(localization loss, confidence loss)
            :param prediction: (tuple) containing predictions for bounding boxes and landmarks
            bounding boxes prediction:  torch.size(batch_size, grid_size, grid_size, num_anchor_box, 4+1)
            landmarks prediction: torch.size(batch_size, grid_size, grid_size, num_anchor_box, 68, 2 or 3)

            :param target: (tuple) containing target values  for bounding boxes and landmarks
            bounding boxes targets: torch.size(batch_size, max_num_of_faces, 4+1)
            landmarks targets: torch.size(batch_size, max_num_of_faces, 68, 2 or 3)

            :return: normalized mean error(float), localization loss(float), confidence loss(float),
            best predicted bbox matched with gt (torch.tensor)
        """
        
        landmarks_prediction = landmarks_prediction.resize_((1,len(landmarks_prediction[0]),68,2))
        print(landmarks_prediction.shape, " landmarks prediction: torch.size(batch_size, grid_size, grid_size, num_anchor_box, 68, 2 or 3)")
        print(gt_landmarks.shape, " landmarks target SHAPE must be torch.size(batch_size, max_num_of_faces, 68, 2 or 3)")
        print()
        bounding_boxes_prediction = torch.cat( (locations, condifence), dim=2)
        
        prediction = (bounding_boxes_prediction, landmarks_prediction)
        target = (gt_locations, gt_landmarks)
        #TODO convert locations to bounding box predictions
        nme, loc_loss, conf_loss, bbox_confirmed, landmarks_confirmed = criterion( prediction,  )  # TODO CHANGE BOXES
        #Output from joint loss format
        #nme, loc_loss, conf_loss, self.non_maximum_suppression(bbox_prediction, landmarks_prediction)
        loss = nme + loc_loss + conf_loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_nme_loss += nme.item()
        running_loc_loss += loc_loss.item()
        running_conf_loc_loss += conf_loss.item()
        if i and i % debug_steps == 0:
            avg_loss = running_loss / debug_steps
            avg_nme_loss = running_nme_loss / debug_steps
            avg_loc_loss = running_loc_loss / debug_steps
            avg_conf_loc_loss = running_conf_loc_loss / debug_steps
            logging.info(
                f"Epoch: {epoch}, Step: {i}, " +
                f"Average Loss: {avg_loss:.4f}, " +
                f"Average nme Loss {avg_nme_loss:.4f}, " +
                f"Average loc Loss: {avg_loc_loss:.4f}" +
                f"Average conf Loss: {avg_conf_loc_loss:.4f}"
            )
            running_loss = 0
            running_nme_loss = 0
            running_loc_loss = 0
            running_conf_loc_loss = 0

In [233]:
def test(loader, net, criterion, device):
    net.eval()
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    num = 0
    for _, data in enumerate(loader):
        images, boxes, labels = data
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)
        num += 1

        with torch.no_grad():
            confidence, locations = net(images)
            regression_loss, classification_loss = criterion(confidence, locations, labels, boxes)
            loss = regression_loss + classification_loss

        running_loss += loss.item()
        running_regression_loss += regression_loss.item()
        running_classification_loss += classification_loss.item()
    return running_loss / num, running_regression_loss / num, running_classification_loss / num


In [234]:
scheduler = "cosine"
lr = 0.01
t_max = 200 # Params for Cosine Annealing

mb2_width_mult = 1.0 #Width Multiplifier for MobilenetV2
# Params for SGD
momentum = 0.9
weight_decay = 5e-4
gamma = 0.1

# Params for Multi-step Scheduler
milestones = "80,100" #milestones for MultiStepLR

batch_size = 32
num_epochs = 20
num_workers = 0
validation_epochs = 0 
debug_steps = 100 #Set the debug log output frequency

# Train params
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

DEVICE = torch.device("cpu")

if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True
    logging.info("Use Cuda.")


2019-11-17 21:45:48,419 - root - INFO - Use Cuda.


In [235]:
GraphPath = namedtuple("GraphPath", ['s0', 'name', 's1'])

In [236]:
config = mobilenetv1_ssd_config

logging.info("Prepare training datasets.")

# PREPARE DATASET
batch_size = 1
test_labels = pd.read_csv('./LS3D-W/test.csv')
train_dataset = FaceLandmarksDataset(test_labels)
train_loader = DataLoader (train_dataset, batch_size = batch_size, shuffle = True, pin_memory = True)
logging.info("Train dataset size: {}".format(len(train_dataset)))

2019-11-17 21:45:48,730 - root - INFO - Prepare training datasets.
2019-11-17 21:45:48,820 - root - INFO - Train dataset size: 3449


In [237]:
net = CustomSSD()
min_loss = -10000.0
last_epoch = -1
#net.init_from_base_net(base_net)
net.to(DEVICE)
print("Network created")

Network created


In [238]:
for i in range(1):
    batch = next(iter(train_loader))
    print(len(batch))
    print(batch[0].shape)   #INPUT image
    print(batch[1][0][0].shape) # Boxes 
    print(batch[1][0][1].shape) # Coordintes of landmarks
    condifence, locations,landmarks = net(batch[0])


2
torch.Size([1, 3, 300, 300])
torch.Size([1, 4])
torch.Size([1, 68, 2])


In [239]:
#Заменить на своё
criterion = JointLoss(device = DEVICE)

params = [
    {'params': net.base.parameters(), 'lr': lr},
    {'params': itertools.chain(
        net.source_layer_add_ons.parameters(),
        net.extras.parameters()
    ), 'lr': lr},
    {'params': itertools.chain(
        net.regression_headers.parameters(),
        net.classification_headers.parameters(),
        net.landmarks_headers.parameters()
    )}
]   
# Optimizer
optimizer = torch.optim.SGD(params, lr = lr, momentum = momentum,
                            weight_decay = weight_decay)

#args.scheduler == 'cosine':
logging.info("Uses CosineAnnealingLR scheduler.")
scheduler = CosineAnnealingLR(optimizer, t_max, last_epoch=last_epoch)

logging.info(f"Start training from epoch {last_epoch + 1}.")
for epoch in range(last_epoch + 1, num_epochs):
    scheduler.step()
    train(train_loader, net, criterion, optimizer,
          device=DEVICE, debug_steps=debug_steps, epoch=epoch)

    if epoch % validation_epochs == 0 or epoch == num_epochs - 1:
        val_loss, val_regression_loss, val_classification_loss = test(val_loader, net, criterion, DEVICE)
        logging.info(
            f"Epoch: {epoch}, " +
            f"Validation Loss: {val_loss:.4f}, " +
            f"Validation Regression Loss {val_regression_loss:.4f}, " +
            f"Validation Classification Loss: {val_classification_loss:.4f}"
        )
        model_path = os.path.join(checkpoint_folder, f"{net}-Epoch-{epoch}-Loss-{val_loss}.pth")
        net.save(model_path)
        logging.info(f"Saved model {model_path}")

2019-11-17 21:45:50,286 - root - INFO - Uses CosineAnnealingLR scheduler.
2019-11-17 21:45:50,288 - root - INFO - Start training from epoch 0.
1 <class 'list'>  Boxes len


RuntimeError: cannot resize variables that require grad