# Minimal code example to run

## Warning: the imports take a long time

## Requirements

Uncomment lines below to install everything needed

In [1]:
#!pip install numpy

#!pip install requests

#!pip install Pillow

#!pip install matplotlib

#!pip install ipywidgets

#!pip install torch==1.9.0
#!pip install torchvision

#!pip install pycocotools

#!pip install scikit-image

#!pip install transformers

#!pip install pytorch_lightning

#!pip install timm==0.4.12

In [2]:
#!git clone https://github.com/facebookresearch/detr.git

In [3]:
#!conda install -c pytorch pytorch torchvision

In [4]:
#!conda install cython scipy
#!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'

In [5]:
#!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
#!pip install torchtext==0.9.1

## Imports

In [6]:
import math
import numpy as np
import sys

from PIL import Image
import matplotlib.pyplot as plt

import torch
from torch import nn
from torchvision.models import resnet50
import torchvision.transforms as T
torch.set_grad_enabled(False);

import os
import torch.utils.data
from torch.utils.data import DataLoader
import torchvision
from PIL import Image, ImageDraw, ImageFont
from pycocotools.coco import COCO

import skimage.io as io
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
import torchvision

import json
from collections import OrderedDict

from transformers import DetrFeatureExtractor, DetrConfig, DetrForObjectDetection, DetrModel, DetrPreTrainedModel

import pytorch_lightning as pl
from pytorch_lightning import Trainer

In [7]:
sys.path.insert(0,"/Users/apolline1/Documents/DTU/Courses/Semester1/minimal_example/detr/")
#print(sys.path)
from datasets import get_coco_api_from_dataset
from datasets.coco_eval import CocoEvaluator

## Mini training

### Parameters

In [8]:
num_classes=6
batch_size = 1
num_workers=0
gpu=0
max_epochs = 1
fdr = False
logit_threshold = 1.5
p_threshold = 0.7
lr = 1e-4 ; lr_backbone = 1e-5 ; weight_decay=1e-4
gradient_clip_val = 0.1

In [9]:
coco=COCO("content/content_train/trainJson.json")
print("Category Ids:",coco.getCatIds())
cats = coco.loadCats(coco.getCatIds())
catnames=[cat['name'] for cat in cats]
print('Categories: {}'.format(catnames))

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Category Ids: [1, 2, 3, 4, 5]
Categories: ['Person', 'Bike', 'Helmet', 'Phone', 'Airbag']


### CocoDetection class

In [10]:
class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(self, img_folder, feature_extractor, train=True):
        ann_file = os.path.join(img_folder, "content_train/trainJson.json" if train else "content_test/testJson.json")
        img_folder = os.path.join(img_folder, "content_train/trainData" if train else "content_test/testData")
        if os.path.lexists(os.path.join(img_folder, ".DS_Store")): 
          os.remove(os.path.join(img_folder, ".DS_Store"))
        super(CocoDetection, self).__init__(img_folder, ann_file)
        self.feature_extractor = feature_extractor

    def __getitem__(self, idx):
        # read in PIL image and target in COCO format
        img, target = super(CocoDetection, self).__getitem__(idx)
        
        # preprocess image and target (converting target to DETR format, resizing + normalization of both image and target)
        image_id = self.ids[idx]
        target = {'image_id': image_id, 'annotations': target}
        encoding = self.feature_extractor(images=img, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze() # remove batch dimension
        target = encoding["labels"][0] # remove batch dimension

        return pixel_values, target

### Process the data in COCO format

In [11]:
feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")

train_dataset = CocoDetection(img_folder= "content/", feature_extractor=feature_extractor)
val_dataset = CocoDetection(img_folder= "content/", feature_extractor=feature_extractor, train=False)

print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(val_dataset))

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Number of training examples: 10
Number of validation examples: 3


### DataLoader objects for training and validation

In [12]:
def collate_fn(batch):
  pixel_values = [item[0] for item in batch]
  encoding = feature_extractor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt")
  labels = [item[1] for item in batch]
  batch = {}
  batch['pixel_values'] = encoding['pixel_values']
  batch['pixel_mask'] = encoding['pixel_mask']
  batch['labels'] = labels
  return batch

train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=batch_size, num_workers=num_workers, shuffle=False)
val_dataloader = DataLoader(val_dataset, collate_fn=collate_fn, batch_size=batch_size, num_workers=num_workers)

### Create DETR object class

In [13]:
class Detr(pl.LightningModule):

     def __init__(self, lr, lr_backbone, weight_decay):
         super().__init__()
         # replace COCO classification head with custom head
         self.model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", 
                                                             num_labels=num_classes,
                                                             ignore_mismatched_sizes=True)
         # see https://github.com/PyTorchLightning/pytorch-lightning/pull/1896
         # see https://github.com/huggingface/transformers/issues/12643 : ignore warnings
         self.lr = lr
         self.lr_backbone = lr_backbone
         self.weight_decay = weight_decay

     def forward(self, pixel_values, pixel_mask):
       outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

       return outputs
     
     def common_step(self, batch, batch_idx):
       pixel_values = batch["pixel_values"]
       pixel_mask = batch["pixel_mask"]
       #labels = [{k: v.to(self.device) for k, v in t.items()} for t in batch["labels"]]
       labels = [{k: v for k, v in t.items()} for t in batch["labels"]]

       outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)

       loss = outputs.loss
       loss_dict = outputs.loss_dict

       return loss, loss_dict

     def training_step(self, batch, batch_idx):
        loss, loss_dict = self.common_step(batch, batch_idx)     
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss)
        for k,v in loss_dict.items():
          self.log("train_" + k, v.item())

        return loss

     def validation_step(self, batch, batch_idx):
        loss, loss_dict = self.common_step(batch, batch_idx)     
        self.log("validation_loss", loss)
        for k,v in loss_dict.items():
          self.log("validation_" + k, v.item())

        return loss

     def configure_optimizers(self):
        param_dicts = [
              {"params": [p for n, p in self.named_parameters() if "backbone" not in n and p.requires_grad]},
              {
                  "params": [p for n, p in self.named_parameters() if "backbone" in n and p.requires_grad],
                  "lr": self.lr_backbone,
              },
        ]
        optimizer = torch.optim.AdamW(param_dicts, lr=self.lr,
                                  weight_decay=self.weight_decay)
        
        return optimizer

     def train_dataloader(self):
        return train_dataloader

     def val_dataloader(self):
        return val_dataloader

In [14]:
model = Detr(lr=lr, lr_backbone=lr_backbone, weight_decay=weight_decay)
# ignore warning messages

Some weights of DetrForObjectDetection were not initialized from the model checkpoint at facebook/detr-resnet-50 and are newly initialized because the shapes did not match:
- class_labels_classifier.weight: found shape torch.Size([92, 256]) in the checkpoint and torch.Size([7, 256]) in the model instantiated
- class_labels_classifier.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Forward pass on an item

In [15]:
# Get an item to show before, during, after training
batch = next(iter(val_dataloader))
print(batch.keys())

dict_keys(['pixel_values', 'pixel_mask', 'labels'])


In [16]:
outputs = model(pixel_values=batch['pixel_values'], pixel_mask=batch['pixel_mask'])
print("outputs logits shape", outputs.logits.shape)

outputs logits shape torch.Size([1, 100, 7])


### Evaluation

In [17]:
base_ds = get_coco_api_from_dataset(val_dataset)
iou_types = ['bbox']
coco_evaluator = CocoEvaluator(base_ds, iou_types) # initialize evaluator with ground truths

print("Running evaluation 0...")
for i_b, b in enumerate(val_dataloader):
    # get the inputs
    pixel_values = b["pixel_values"]
    pixel_mask = b["pixel_mask"]
    labels = [{k: v for k, v in t.items()} for t in b["labels"]] # these are in DETR format, resized + normalized

    # forward pass
    outputs = model.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
    results = feature_extractor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api
    res = {target['image_id'].item(): output for target, output in zip(labels, results)}
    coco_evaluator.update(res)

coco_evaluator.synchronize_between_processes()
coco_evaluator.accumulate()
coco_evaluator.summarize()

Running evaluation 0...
Accumulating evaluation results...
DONE (t=0.01s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.009
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.005
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.005
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.200
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
 Average Recall     (AR)

"-1.000" is returned when no bounding box fits the criteria for the evaluation metric

### Drawing annotations

In [18]:
CLASSES = ['N/A', 'person', 'bike', 'helmet', 'phone', 'airbag', 'N/A']
COLORS = ['red', 'green', 'blue', 'black', 'white', 'yellow']
fnt = ImageFont.truetype("Pillow/Tests/fonts/Arial.ttf", 40)

In [19]:
bl = batch['labels'][0]
img_id = bl['image_id'].item()
orig_size = bl['orig_size']
size = bl['size']
print("\n\nimg_id", img_id, "orig_size", orig_size, "size", size)



img_id 1 orig_size tensor([1440, 1920]) size tensor([ 800, 1066])


In [20]:
image = train_dataset.coco.loadImgs(img_id)[0]
annotations = train_dataset.coco.imgToAnns[img_id]

def draw_annotations(image=image, img_id=img_id, annotations=annotations):
  with Image.open(os.path.join("content/content_train/trainData/", image['file_name'])) as im:
    draw = ImageDraw.Draw(im, "RGBA")
    for annotation in annotations:
      box = annotation['bbox']
      class_idx = annotation['category_id']
      x,y,w,h = tuple(box)
      draw.rectangle((x,y,x+w,y+h), outline=COLORS[class_idx-1], width=2)
      draw.text((x, y), str(catnames[class_idx-1]), fill='white', font=fnt)
    # save
    im.save("annotated_img"+str(img_id)+".png", "PNG")
  return "annotated_img"+str(img_id)+".png file created"

draw_annotations(image, img_id, annotations)

'annotated_img1.png file created'

### Drawing output annotations

In [21]:
# utils functions
def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
        (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return b

def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = np.array(box_cxcywh_to_xyxy(out_bbox))
    b = np.multiply(b,np.array([img_w, img_h, img_w, img_h]))
    return b

In [22]:
def plot_results(img_id, pil_img, probs, boxes, train_nb):
    draw = ImageDraw.Draw(pil_img, "RGBA")
    colors = COLORS*20
    for p, (xmin,ymin,xmax,ymax), c in zip(probs, boxes.tolist(), colors):
      if max(p)>p_threshold-0.35: # had to lower threshold to have boxes to display 
        cl = p.argmax()
        if (cl!=0 and cl!=6):
          draw.rectangle((xmin,ymin,xmax,ymax), outline=colors[cl-1], width=2)
          text_c = f'{CLASSES[cl]}: {p[cl]:0.2f}'
          draw.text((xmin,ymin), text_c, fill='white', font=fnt)
    pil_img.save('output'+str(train_nb)+'_img'+str(img_id)+'.png', "PNG")

In [23]:
# Probabilities
logits = outputs.logits[0]
softmax = nn.Softmax(dim=-1)
probs = softmax(logits)

# Boxes
pred_boxes = outputs.pred_boxes[0]
bsize=(orig_size[1],orig_size[0])
boxes = np.array([rescale_bboxes(b,bsize) for b in pred_boxes])

# Image
pil_img = train_dataset.coco.loadImgs(img_id)[0]
pil_img = Image.open(os.path.join("content/content_train/trainData/", pil_img['file_name']))

In [24]:
plot_results(img_id, pil_img, probs, boxes, 0)

## Training

In [25]:
trainer = Trainer(gpus=gpu, max_epochs=max_epochs, gradient_clip_val=gradient_clip_val, fast_dev_run=fdr)
trainer.fit(model, train_dataloader, val_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: /Users/apolline1/Documents/DTU/Courses/Semester1/minimal_example/lightning_logs

  | Name  | Type                   | Params
-------------------------------------------------
0 | model | DetrForObjectDetection | 41.5 M
-------------------------------------------------
41.3 M    Trainable params
222 K     Non-trainable params
41.5 M    Total params
166.042   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

### Evaluation after 1 training

In [26]:
base_ds = get_coco_api_from_dataset(val_dataset)
iou_types = ['bbox']
coco_evaluator = CocoEvaluator(base_ds, iou_types) # initialize evaluator with ground truths

print("Running evaluation 1...")
for i_b, b in enumerate(val_dataloader):
    # get the inputs
    pixel_values = b["pixel_values"]
    pixel_mask = b["pixel_mask"]
    labels = [{k: v for k, v in t.items()} for t in b["labels"]] # these are in DETR format, resized + normalized

    # forward pass
    outputs = model.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
    results = feature_extractor.post_process(outputs, orig_target_sizes) # convert outputs of model to COCO api
    res = {target['image_id'].item(): output for target, output in zip(labels, results)}
    coco_evaluator.update(res)

coco_evaluator.synchronize_between_processes()
coco_evaluator.accumulate()
coco_evaluator.summarize()

Running evaluation 1...
Accumulating evaluation results...
DONE (t=0.01s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.089
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.279
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.089
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.056
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.122
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.122
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
 Average Recall     (AR)

### Draw again

In [27]:
batch = next(iter(val_dataloader))
outputs = model(pixel_values=batch['pixel_values'], pixel_mask=batch['pixel_mask'])

In [28]:
# Probabilities
logits = outputs.logits[0]
softmax = nn.Softmax(dim=-1)
probs = softmax(logits)

# Boxes
pred_boxes = outputs.pred_boxes[0]
bsize=(orig_size[1],orig_size[0])
boxes = np.array([rescale_bboxes(b.detach().numpy(),bsize) for b in pred_boxes])

# Image
pil_img = train_dataset.coco.loadImgs(img_id)[0]
pil_img = Image.open(os.path.join("content/content_train/trainData/", pil_img['file_name']))

In [29]:
plot_results(img_id, pil_img, probs, boxes, 1)