In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import pandas as pd

In [None]:
data_path = "data/cars"
data_path = Path(data_path)
images_path = data_path / "training_images"
annotations_file_path = data_path / "annotations.csv"

In [None]:
from example.train_cars import load_cars_df

INFO:pytorch_accelerated:Setting random seeds


In [None]:
train_df, valid_df, lookups = load_cars_df(annotations_file_path, images_path)

# Anchor section

TODO: Now that we understand anchor boxes, lets look at how we can evaluate whether our chosen anchors are suitable for our problem and, if not, find some sensible choices for our dataset.

The approach here is largely adapted from the autoanchor approach used in Yolov5, which was also used with Yolov7.

## Evaluating current anchor boxes

The simplest approach would be to simply use the same anchors as used for COCO, which are already bundled with the defined architectures.

In [125]:
from yolov7 import create_yolov7_model

In [127]:
model = create_yolov7_model('yolov7', pretrained=False)

In [129]:
model.detection_head.anchor_grid

tensor([[[[[[ 12.,  16.]]],


          [[[ 19.,  36.]]],


          [[[ 40.,  28.]]]]],




        [[[[[ 36.,  75.]]],


          [[[ 76.,  55.]]],


          [[[ 72., 146.]]]]],




        [[[[[142., 110.]]],


          [[[192., 243.]]],


          [[[459., 401.]]]]]])

By default these are the ones from coco. Here we can see that we have 3 groups, one for each layer of the feature pyramid network. The numbers correspond to the width and height of the anchors that will be generated.

The FPN (Feature Pyramid Network) has three outputs and each output's role is to detect objects according to their scale. For example:

- P3/8 is for detecting smaller objects.
- P4/16 is for detecting medium objects.
- P5/32 is for detecting bigger objects.
So when you're going to detect smaller objects you need to use smaller anchor boxes and for medium objects you should use medium scale anchor boxes, so on

TODO: Should have already explained what anchors are in a different section.

In [128]:
current_anchors = model.detection_head.anchor_grid.clone().cpu().view(-1, 2); current_anchors

tensor([[ 12.,  16.],
        [ 19.,  36.],
        [ 40.,  28.],
        [ 36.,  75.],
        [ 76.,  55.],
        [ 72., 146.],
        [142., 110.],
        [192., 243.],
        [459., 401.]])

To evaluate our current anchor boxes, we can calculate the best possible recall, which would occur if the model was able to successfully match an appropriate anchor box with a ground truth. 

### Find normalized bounding boxes

To evaluate our anchor boxes, we first need some knowedge of the shapes and sizes of the objects in our dataset. We can do this by finding the width and height of all ground truth boxes in the training set. We can calculate these as demonstrated below:

In [90]:
train_annotations_df = train_df.query('has_annotation == True').copy()

In [91]:
train_annotations_df['h'] = train_annotations_df['ymax'] -  train_annotations_df['ymin']
train_annotations_df['w'] = train_annotations_df['xmax'] -  train_annotations_df['xmin']

In [92]:
train_annotations_df

Unnamed: 0,image,xmin,ymin,xmax,ymax,class_name,has_annotation,image_id,class_id,h,w
0,vid_4_1000.jpg,281.259045,187.035071,327.727931,223.225547,car,True,0,0.0,36.190476,46.468886
1,vid_4_10000.jpg,15.163531,187.035071,120.329957,236.430180,car,True,1,0.0,49.395109,105.166425
2,vid_4_10040.jpg,239.192475,176.764801,361.968162,236.430180,car,True,3,0.0,59.665380,122.775687
4,vid_4_10060.jpg,16.630970,186.546010,132.558611,238.386422,car,True,4,0.0,51.840412,115.927641
5,vid_4_10100.jpg,447.568741,160.625804,582.083936,232.517696,car,True,6,0.0,71.891892,134.515195
...,...,...,...,...,...,...,...,...,...,...,...
554,vid_4_9860.jpg,0.000000,198.321729,49.235251,236.223284,car,True,994,0.0,37.901554,49.235251
555,vid_4_9880.jpg,329.876184,156.482351,536.664239,250.497895,car,True,995,0.0,94.015544,206.788055
556,vid_4_9900.jpg,0.000000,168.295823,141.797524,239.176652,car,True,996,0.0,70.880829,141.797524
557,vid_4_9960.jpg,487.428988,172.233646,616.917699,228.839864,car,True,999,0.0,56.606218,129.488711


As we will need to resize our images during training, these bounding boxes will also need to be resized. The easiest way for us to do this is to normalize these values, so then we can just multiply by our desired image size.

However, to do this, we will need the height and width of our images. Sometimes, we have this information ahead of time, in which case we can use this knowledge directly. Otherwise, We can do this as follows:

In [95]:
from PIL import Image
from tqdm.contrib.concurrent import process_map

In [96]:

def find_image_size(image_path):
    image = Image.open(image_path)
    w, h = image.size
    return (image_path.parts[-1], (w, h))

image_sizes = process_map(find_image_size, [images_path/p for p in train_df.image.unique()])


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 324/324 [00:00<00:00, 4657.43it/s]


In [103]:
image_sizes_lookup = dict(image_sizes)

In [118]:
image_sizes_df = pd.DataFrame(image_sizes_lookup).T.reset_index().rename(columns={'index': 'image', 0: 'image_w', 1:'image_h'})

In [119]:
image_sizes_df

Unnamed: 0,image,image_w,image_h
0,vid_4_1000.jpg,676,380
1,vid_4_10000.jpg,676,380
2,vid_4_10040.jpg,676,380
3,vid_4_10060.jpg,676,380
4,vid_4_10100.jpg,676,380
...,...,...,...
319,vid_4_13060.jpg,676,380
320,vid_4_13100.jpg,676,380
321,vid_4_13240.jpg,676,380
322,vid_4_13280.jpg,676,380


We can now merge this with our existing dataframe

In [121]:
train_annotations_df = pd.merge(train_annotations_df, image_sizes_df, on='image')

We can now easily calulate our normalized values

In [123]:
train_annotations_df['normalized_h'] = train_annotations_df['h']/train_annotations_df['image_h']
train_annotations_df['normalized_w'] = train_annotations_df['w']/train_annotations_df['image_w']

In [124]:
train_annotations_df

Unnamed: 0,image,xmin,ymin,xmax,ymax,class_name,has_annotation,image_id,class_id,h,w,image_w,image_h,normalized_h,normalized_w
0,vid_4_1000.jpg,281.259045,187.035071,327.727931,223.225547,car,True,0,0.0,36.190476,46.468886,676,380,0.095238,0.068741
1,vid_4_10000.jpg,15.163531,187.035071,120.329957,236.430180,car,True,1,0.0,49.395109,105.166425,676,380,0.129987,0.155572
2,vid_4_10040.jpg,239.192475,176.764801,361.968162,236.430180,car,True,3,0.0,59.665380,122.775687,676,380,0.157014,0.181621
3,vid_4_10060.jpg,16.630970,186.546010,132.558611,238.386422,car,True,4,0.0,51.840412,115.927641,676,380,0.136422,0.171491
4,vid_4_10100.jpg,447.568741,160.625804,582.083936,232.517696,car,True,6,0.0,71.891892,134.515195,676,380,0.189189,0.198987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,vid_4_9860.jpg,0.000000,198.321729,49.235251,236.223284,car,True,994,0.0,37.901554,49.235251,676,380,0.099741,0.072833
393,vid_4_9880.jpg,329.876184,156.482351,536.664239,250.497895,car,True,995,0.0,94.015544,206.788055,676,380,0.247409,0.305899
394,vid_4_9900.jpg,0.000000,168.295823,141.797524,239.176652,car,True,996,0.0,70.880829,141.797524,676,380,0.186528,0.209760
395,vid_4_9960.jpg,487.428988,172.233646,616.917699,228.839864,car,True,999,0.0,56.606218,129.488711,676,380,0.148964,0.191551


### Calculating BPR

Now that we have the normalized width and height of all ground truth boxes in our training set, we can evaluate our current anchor boxes as follows:

In [173]:
# anchor width and height multiple threshold used to select label-anchor matches when computing loss
LOSS_ANCHOR_MULTIPLE_THRESHOLD = 4


def calculate_best_possible_recall(anchors, normalized_gt_wh, image_sizes, target_image_size=640):
    # image sizes array of [w, h] , either np.array([[w, h]]) or per image
    
     # find target image sizes, assuming resizing so that the longest side is the target size
    target_image_sizes = (
        target_image_size * image_sizes / image_sizes.max(1, keepdims=True)
    )
    
    # find wh of boxes for target size
    wh = target_image_sizes * normalized_gt_wh
    
    tiny_boxes_exist = (wh < 3).any(1).sum()
    if tiny_boxes_exist:
        print(
            f"WARNING: Extremely small objects found. {tiny_boxes_exist} of {len(wh)} labels are < 3 pixels in size."
        )
    
    wh = wh[(wh >= 2.0).any(1)]  # filter > 2 pixels
    
    symmetric_size_ratios = torch.min(wh[:, None]/anchors[None], anchors[None]/wh[:, None]) # ensure 0-1 range
    worst_side_size_ratio = symmetric_size_ratios.min(-1).values
    best_anchor_ratio = worst_side_size_ratio.max(-1).values
    best_possible_recall = (best_anchor_ratio > 1. / LOSS_ANCHOR_MULTIPLE_THRESHOLD).float().mean()
    
    return best_possible_recall
    
    
    

In [174]:
normalized_gt_wh = train_annotations_df[['normalized_w', 'normalized_h']].values

In [175]:
image_sizes = train_annotations_df[['image_w', 'image_h']].values

In [176]:
calculate_best_possible_recall(current_anchors, 
                                normalized_gt_wh,
                                image_sizes
                                )

tensor(1.)

In [140]:
image_sizes

array([[676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676, 380],
       [676,

Alternatively, as all of our images are the same size in this case, we could simply specify a single image size.

In [142]:
import numpy as np

In [169]:
calculate_best_possible_recall(current_anchors, 
                                normalized_gt_wh,
                                np.array([[676, 380]])
                                )

tensor(1.)

From this, we can see that the current anchor boxes are a good fit for this dataset; which makes sense, as the images are quite similar to those in COCO.

### How does this work?

At this point, you may be wondering, how exactly do we calculate the best possible recall. To answer this, let's go through the process manually.

First, we need to resize the width and height of our ground truth boxes based on the images that we are training on - for this architecture, this is recommended to be 640. To preserve the aspect ratios of the objects in our images, the recommended approach to resizing is to scale the image so that the longest size is equal to our target size. Following this approach, we can calculate the target size for each image as demonstrated below:

#### Resize images and labels

In [147]:
target_image_size = 640

In [150]:
target_image_sizes = (
    target_image_size * image_sizes / image_sizes.max(1, keepdims=True)
); target_image_sizes[:5]

array([[640.        , 359.76331361],
       [640.        , 359.76331361],
       [640.        , 359.76331361],
       [640.        , 359.76331361],
       [640.        , 359.76331361]])

Now that we have scaled our images, we also need to apply the same scaling to our ground truth labels. As these are normalized already, we can simply multiply these by our new image sizes.

In [151]:
gt_wh = target_image_sizes * normalized_gt_wh

Now comes the tricky bit, we would like to ensure that at least one anchor can be matches to each ground truth box. Whilst we could do this by framing it as an optimization problem - how do we match each ground truth box with its optimal anchor - this would introduce a lot of complexity for what we are trying to do.

Given an anchor box, we need a simpler way of measuring how well it can be made to fit a ground truth box. Let's examine one approach that can be taken to do this, starting with the width and height of a single ground truth box.


In [152]:
gt_box_wh = gt_wh[0]; gt_box_wh

array([43.99421122, 34.26317273])

For each anchor box, we can inspect the ratios of its height and width when compared to the height and width of our ground truth target, and use this to understand where the biggest differences are.

In [155]:
current_anchors/gt_box_wh

tensor([[ 0.2728,  0.4670],
        [ 0.4319,  1.0507],
        [ 0.9092,  0.8172],
        [ 0.8183,  2.1889],
        [ 1.7275,  1.6052],
        [ 1.6366,  4.2611],
        [ 3.2277,  3.2104],
        [ 4.3642,  7.0922],
        [10.4332, 11.7035]], dtype=torch.float64)

As the scale of these ratios will depend on whether the anchor box sides are greater or smaller than the sides of our ground truth box, we can ensure that our magnitudes are in the range [0, 1] by also calculating the reciprocal and taking the minimum ratios for each anchor. 


In [158]:
symmetric_size_ratios = torch.min(current_anchors/gt_box_wh, gt_box_wh/current_anchors); symmetric_size_ratios

tensor([[0.2728, 0.4670],
        [0.4319, 0.9518],
        [0.9092, 0.8172],
        [0.8183, 0.4568],
        [0.5789, 0.6230],
        [0.6110, 0.2347],
        [0.3098, 0.3115],
        [0.2291, 0.1410],
        [0.0958, 0.0854]], dtype=torch.float64)

From this, we now have an indication of how well, independently, the width and height of each anchor box 'fits' to our ground truth box. 

Now, our challenge is how to consider the match of the the width and height together!

One way we can approach this is, to take the minimum ratio for each anchor; representing the side that worst matches our ground truth.

In [159]:
worst_side_size_ratio = symmetric_size_ratios.min(-1).values; worst_side_size_ratio

tensor([0.2728, 0.4319, 0.8172, 0.4568, 0.5789, 0.2347, 0.3098, 0.1410, 0.0854],
       dtype=torch.float64)

The reason why we have selected the worst fitting side here, is because we know that the other side matches our target *at least* as well as the one selected; we can think of this as the worst case scenario!

Now, let's select the anchor box which matches the best out of these options, this is simply the largest value.

In [160]:
best_anchor_ratio = worst_side_size_ratio.max(-1).values; best_anchor_ratio

tensor(0.8172, dtype=torch.float64)

Out of the worst fitting options, this is our selected match!


TODO: Ensure this is after the loss function section

Recalling that the loss function only looks to match anchor boxes that are within 4 times greater or smaller than the size of the ground truth target, we can now verify whether this anchor is within this range and would be considered a successful match.

We can do that as demonstrated below, taking the reciprical of our loss multiple, to ensure that it is in the same range as our value:

In [161]:
best_anchor_ratio > 1. / ANCHOR_THRESHOLD

tensor(True)

From this, we can see that at least one of our anchors could be successfully matched to our selected ground truth target!

Now that we understand the sequence of steps, we can now apply the same logic to all of our ground truth boxes to see how many matches we can obtain with our current set of anchors:

In [179]:
symmetric_size_ratios = torch.min(current_anchors[None]/gt_wh[:, None], gt_wh[:, None]/current_anchors[None])
worst_side_size_ratio = symmetric_size_ratios.min(-1).values
best_anchor_ratio = worst_side_size_ratio.max(-1).values

In [181]:
best_anchor_ratio > 1. / ANCHOR_THRESHOLD

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

Now that we have calculated, for each ground truth box, whether it has a match. We can take the mean number of matches to find out best possible recall; in our case, this is 1, as we saw earlier!

In [182]:
best_possible_recall = (best_anchor_ratio > 1. / ANCHOR_THRESHOLD).float().mean(); best_possible_recall

tensor(1.)

Whilst we have tried to match an anchor to every ground truth box in the training set for this example, in practice, we often filter out incredibly small boxes (less than 3 pixels in either height or width), as these boxes are usually too small to be considered useful!

## Selecting new anchor boxes

Whilst using the pre-defined anchors may be a good choice for similar datasets, this may not be appropriate for all datasets, for example, those that contain lots of small objects.