# Implementing Faster R-CNN

In this example, we will demostrate `Faster R-CNN detection` using the pretrained PyTorch Faster R-CNN model. The model is trained using the COCO dataset.

In [7]:
import os
import cv2
import numpy as np
import requests
import torchvision
import torchvision.transforms as transforms

Next, we'll declare a list of COCO class names (necessary for visualizing the detected objects later):

In [8]:
classes = [
    'background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'street sign',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack',
    'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
    'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk',
    'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'hair brush']

Then, we will download the input image

In [6]:
# Download object detection image
image_file = 'source_2.png'
if not os.path.isfile(image_file):
    url = "https://raw.githubusercontent.com/ivan-vasilev/advanced-deep-learning-with-python/refs/heads/master/chapter04-detection-segmentation/source_2.png"
    r = requests.get(url)
    with open(image_file, 'wb') as f:
        f.write(r.content)

Next, we'll load the `torchvision.models.detection.fasterrcnn_resnet50_fpn` pretrained `model`:

In [9]:
# load the pytorch pretrained model
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# set the model in evaluation mode
model.eval()

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /home/victord/.cache/torch/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [03:09<00:00, 882kB/s]  


FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(original_name=FrozenBatchNorm2d)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(original_name=FrozenBatchNorm2d)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(original_name=FrozenBatchNorm2d)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(original_name=FrozenBatchNorm2d)
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
       

Then, we will feed the input `img` into the `model` to produce the detected objects:

In [10]:
# read the image file
img = cv2.imread(image_file)

# transform the img input to a tensor
transform = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
nn_input = transform(img)
output = model([nn_input])

In [12]:
output

[{'boxes': tensor([[ 636.3980,  526.3130, 1224.0668,  774.1372],
          [ 406.3749,  301.5048, 1217.5696,  723.2188],
          [ 720.1866,  472.6105,  773.8168,  538.6634],
          [ 228.4937,  367.2188,  260.3882,  466.9784],
          [ 801.7681,  487.6225,  836.8633,  532.1195],
          [  55.8751,  559.5872,   75.2253,  584.2229],
          [1165.8805,  168.5358, 1278.8893,  718.4623],
          [ 257.7322,  538.1931,  276.0692,  585.7459],
          [ 218.3346,  491.3564,  235.4633,  520.0047],
          [ 247.2475,  372.5798,  263.3246,  461.0223],
          [ 724.8757,  576.4719,  769.4236,  602.6824],
          [ 578.4518,  495.9499, 1237.0792,  764.7579],
          [ 481.7475,  462.7123,  503.3111,  500.9771],
          [1163.2694,  147.4709, 1278.8864,  728.4131],
          [ 294.9775,  488.2884,  305.6277,  511.2444],
          [ 385.0872,  337.9223,  862.5121,  691.5908],
          [ 220.8729,  496.0852,  232.1509,  516.4416],
          [ 309.4469,  494.8350,  326.1

Next, we will filter the detected bounding boxes and we will plot them along with the class labels over the input image:

In [13]:
colors = np.random.uniform(0, 255, size=(len(classes), 3))
# iterate over the network output for all boxes
for box, box_class, score in zip(output[0]["boxes"], output[0]["labels"], output["scores"]):
    # filter the boxes by score
    if score > 0.5:
        # transform bounding box format
        box = [(box[0], box[1]), (box[2], box[3])]
        # select class color
        color = colors[box_class]
        # extract class name
        class_name = classes[box_class]
        # draw the bounding box
        cv2.rectangle(img=image, text=class_name, pt1=box[0], pt2=box[1], color=color, thickness=2)
        # idsplay the box class label 
        cv2.putText(img=image, text=class_name, org=box[0], fontFace=cv2.FONT_HERSHEY_SIMPLEX, 
                    fontScale=1, color=color, thickness=2)

TypeError: list indices must be integers or slices, not str