In [6]:
from __future__ import division

import torch 
import torch.nn as nn
import torch.nn.functional as F 
from torch.autograd import Variable
import numpy as np
from util import * 

The _cfg_ file describes the layout of the network, block by block.
There are 4 typrs of layers in _cfg_ file:
* Convolutional
* Shortcut
* Upsample
* Route
that need to be umplemented.

Also there are **yolo** block, that corresponds to the Detection layer (anchors, classes...) and **net** block, that only describes information about the network input and training parameters.

In [2]:
def parse_cfg(cfgfile):
    """
    Takes a configuration file
    
    Returns a list of blocks. Each blocks describes a block in the neural
    network to be built. Block is represented as a dictionary in the list
    
    """
    file = open(cfgfile, 'r')
    lines = file.read().split('\n')                        # store the lines in a list
    lines = [x for x in lines if len(x) > 0]               # get read of the empty lines 
    lines = [x for x in lines if x[0] != '#']              # get rid of comments
    lines = [x.rstrip().lstrip() for x in lines]           # get rid of fringe whitespaces
    
    block = {}
    blocks = []

    for line in lines:
        if line[0] == "[":               # This marks the start of a new block
            if len(block) != 0:          # If block is not empty, implies it is storing values of previous block.
                blocks.append(block)     # add it the blocks list
                block = {}               # re-init the block
            block["type"] = line[1:-1].rstrip()     
        else:
            key,value = line.split("=") 
            block[key.rstrip()] = value.lstrip()
    blocks.append(block)

    return blocks

In [3]:
# Define useful layers
class EmptyLayer(nn.Module):
    def __init__(self):
        super(EmptyLayer, self).__init__()
        
class DetectionLayer(nn.Module):
    def __init__(self, anchors):
        super(DetectionLayer, self).__init__()
        self.anchors = anchors

In [9]:
def create_modules(blocks):
    """
    Now we are going to use the list returned by the above parse_cfg
    to construct PyTorch modules for the blocks present in the config file
    """
    net_info = blocks[0]     #Captures the information about the input and pre-processing    
    module_list = nn.ModuleList()
    prev_filters = 3         # to store depth for conv layer
    output_filters = []      # same, but for all previous layers (beacuse Route)
    
    for index, x in enumerate(blocks[1:]):
        module = nn.Sequential()

        #check the type of block
        #create a new module for the block
        #append to module_list
        
        #If it's an convolutional layer
        if (x["type"] == "convolutional"):
            #Get the info about the layer
            activation = x["activation"]
            try:
                batch_normalize = int(x["batch_normalize"])
                bias = False
            except:
                batch_normalize = 0
                bias = True

            filters= int(x["filters"])
            padding = int(x["pad"])
            kernel_size = int(x["size"])
            stride = int(x["stride"])

            if padding:
                pad = (kernel_size - 1) // 2  #so that central pixel of kernel is on first pixel of image
            else:
                pad = 0

            #Add the convolutional layer
            conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias)
            module.add_module("conv_{0}".format(index), conv)

            #Add the Batch Norm Layer
            if batch_normalize:
                bn = nn.BatchNorm2d(filters)
                module.add_module("batch_norm_{0}".format(index), bn)

            #Check the activation. 
            #It is either Linear or a Leaky ReLU for YOLO
            if activation == "leaky":
                activn = nn.LeakyReLU(0.1, inplace = True)
                module.add_module("leaky_{0}".format(index), activn)

        #If it's an upsampling layer
        #We use Bilinear2dUpsampling
        elif (x["type"] == "upsample"):
            stride = int(x["stride"])   #??? we don't use stride value?
            upsample = nn.Upsample(scale_factor = 2, mode = "bilinear")
            module.add_module("upsample_{}".format(index), upsample)
            
        #If it is a route layer
        elif (x["type"] == "route"):
            x["layers"] = x["layers"].split(',')
            #Start  of a route
            start = int(x["layers"][0])
            #end, if there exists one.
            try:
                end = int(x["layers"][1])
            except:
                end = 0
            #Convert positive anotation to negative
            if start > 0: 
                start = start - index
            if end > 0:
                end = end - index
            route = EmptyLayer()   #Put a dummy layer in place of a route layer. Concatination will be performed
                                   #directly in forward function of nn.Module object representing darknet.
            module.add_module("route_{0}".format(index), route)
            #If we are concatenating maps
            if end < 0:
                filters = output_filters[index + start] + output_filters[index + end]
            else:
                filters= output_filters[index + start]
                
        #shortcut corresponds to skip connection
        elif x["type"] == "shortcut":
            shortcut = EmptyLayer()  #same as above, but addition does not require updating output_filters
            module.add_module("shortcut_{}".format(index), shortcut)
            
        #Yolo is the detection layer
        elif x["type"] == "yolo":
            mask = x["mask"].split(",")
            mask = [int(x) for x in mask]         # store mask as list

            anchors = x["anchors"].split(",")
            anchors = [int(a) for a in anchors]   # store anchors as list containing...
            anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors),2)]   #...a pair of height and width
            anchors = [anchors[i] for i in mask]  # use only anchors indexed by mask

            detection = DetectionLayer(anchors)
            module.add_module("Detection_{}".format(index), detection)
        
        module_list.append(module)
        prev_filters = filters
        output_filters.append(filters)
    return (net_info, module_list)

In [8]:
#TESTING
blocks = parse_cfg("cfg/yolov3.cfg")
print(create_modules(blocks))

({'hue': '.1', 'saturation': '1.5', 'channels': '3', 'momentum': '0.9', 'angle': '0', 'policy': 'steps', 'scales': '.1,.1', 'burn_in': '1000', 'batch': '64', 'height': '608', 'learning_rate': '0.001', 'decay': '0.0005', 'type': 'net', 'subdivisions': '16', 'steps': '400000,450000', 'exposure': '1.5', 'width': '608', 'max_batches': '500200'}, ModuleList(
  (0): Sequential(
    (conv_0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (batch_norm_0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_0): LeakyReLU(negative_slope=0.1, inplace)
  )
  (1): Sequential(
    (conv_1): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (batch_norm_1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_1): LeakyReLU(negative_slope=0.1, inplace)
  )
  (2): Sequential(
    (conv_2): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (batch_n

In [7]:
class Darknet(nn.Module):
    def __init__(self, cfgfile):
        super(Darknet, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list = create_modules(self.blocks)
    
    def forward(self, x, CUDA):  #TODO remove CUDA bool
        modules = self.blocks[1:]
        outputs = {}   #We cache the outputs for the route layer
        
        write = 0     #Flag, indicating that we already have encountered the first detection
                      #and can concatinate further detections (on differrent scales) to it
        for i, module in enumerate(modules):        
            module_type = (module["type"])
            
            if module_type == "convolutional" or module_type == "upsample":
                x = self.module_list[i](x)
            
            elif module_type == "route":
                layers = module["layers"]
                layers = [int(a) for a in layers]

                if (layers[0]) > 0:      #Convert to negative annotation
                    layers[0] = layers[0] - i

                if len(layers) == 1:
                    x = outputs[i + (layers[0])]   #Forward the output of (i - layers_0) layer

                else:
                    if (layers[1]) > 0:  #Convert to negative annotation
                        layers[1] = layers[1] - i

                    map1 = outputs[i + layers[0]]
                    map2 = outputs[i + layers[1]]

                    x = torch.cat((map1, map2), 1) #Concatenate outputs of corresponding layers

            elif module_type == "shortcut":
                from_ind = int(module["from"])
                x = outputs[i - 1] + outputs[i + from_ind]   #FIXME should check dimentions

            elif module_type == 'yolo':        

                anchors = self.module_list[i][0].anchors
                #Get the input dimensions
                inp_dim = int (self.net_info["height"])

                #Get the number of classes
                num_classes = int (module["classes"])

                #Transform 
                x = x.data
                x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)
                if not write:              #if no collector has been intialised. 
                    detections = x
                    write = 1

                else:       
                    detections = torch.cat((detections, x), 1)

            outputs[i] = x
            
        return detections

In [14]:
#TESTING
def get_test_input():
    img = cv2.imread("dog-cycle-car.png")
    img = cv2.resize(img, (416,416))          # Resize to the input dimension
    img_ =  img[:,:,::-1].transpose((2,0,1))  # BGR -> RGB | H X W C -> C X H X W 
    img_ = img_[np.newaxis,:,:,:]/255.0       # Add a channel at 0 (for batch) | Normalise
    img_ = torch.from_numpy(img_).float()     # Convert to float
    img_ = Variable(img_)                     # Convert to Variable
    return img_

model = Darknet("cfg/yolov3.cfg")
inp = get_test_input()
if torch.cuda.is_available():
    model.cuda()
    inp = inp.cuda()
pred = model(inp, torch.cuda.is_available())
print (pred)

  "See the documentation of nn.Upsample for details.".format(mode))


RuntimeError: invalid argument 2: size '[1 x 255 x 3025]' is invalid for input with 689520 elements at /opt/conda/conda-bld/pytorch_1533672544752/work/aten/src/TH/THStorage.cpp:84

Continue:
https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-3/