In [3]:
import os
import torch
import torchvision
import numpy as np
from PIL import Image
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

## Customize PennFudan dataset

In [5]:
class PennFudanDataset(torch.utils.data.Dataset):
  def __init__(self, root, transform):
    self.root = root
    self.transform = transform
    self.img = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
    self.mask = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

  def __getitem__(self, index):
    # load images and thier masks
    image_path = os.path.join(self.root, "PHGImages", self.img[index])
    mask_path = os.path.join(self.root, "PedMasks", self.mask[index])
    # image is converted to RGB but mask is not
    #   because each color corresponds to a different instance
    image = Image.open(image_path).convert('RGB')
    mask = Image.open(mask_path)
    mask = np.array(mask)

    # instances are encoded as different colors
    obj_index = np.unique(mask)
    # the first index is background
    obj_index = obj_index[1:]

    # split the color-encoded mask into a set of binary masks
    masks = (mask == obj_index[:, None, None])

    # get the boundary of each mask (bounding box coordinates)
    num_objs = len(obj_index)
    bounding_boxs = []
    for i in range(num_objs):
      pos = np.where(masks[i])
      xmin = np.min(pos[1])
      xmax = np.max(pos[1])
      ymin = np.min(pos[0])
      ymax = np.max(pos[0])
      bounding_boxs.append([xmin, xmax, ymin, ymax])
    
    # convert to tensor
    bounding_boxs = torch.as_tensor(bounding_boxs, dtype=torch.float32)
    labels = torch.ones((num_objs, ), dtype=torch.int64)
    masks = torch.as_tensor(masks, dtype=torch.uint8)

    image_id = torch.tensor([index])
    area = (bounding_boxs[:, 1] - bounding_boxs[:, 0]) * (bounding_boxs[:, 3] - bounding_boxs[:,2])

    # suppose all instances are not crowded
    isCrowded = torch.zeros((num_objs, ), dtype=torch.int64)

    target = {}
    target["boxes"] = bounding_boxs
    target["labels"] = labels
    target["masks"] = masks
    target["image_id"] = image_id
    target["area"] = area
    target["isCrowded"] = isCrowded

    if self.transform is not None:
      image, target = self.transform(image, target)
    
    return image, target

  def __len__(self):
    return len(self.img)

## Define the model: Two routes
The model uses Mask R_CNN which is based on top of Faster R_CNN.

Faster R_CNN is a model that predicts both bounding boxes and class scores for potential objects in the image.

### Route 1: Finetuning from a pretrained model

In [2]:
# load a pre-trained model trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained = True)

# replace the number of classes to 2: 1 for person, 0 for background
num_classes = 2
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

### Route 2: Modifying the model to add a different backbone

In [4]:
# load a pre-trained model only with features
backbone = torchvision.models.mobilenet_v2(pretrained = True).features

# need to know the number of the output of the backbone
backbone.out_channels = 1280 # for mobilenet_v2 is 1280

# make the RPN generate 5*3 anchors per spatial location, with 5 different size 
#   and 3 different aspect ratios
anchor_generator = AnchorGenerator(size=((32, 64, 128, 256, 512),), aspect_ratios=((0.5, 1.0, 2.0)))

# define the feature map using to perform the region of interest cropping 
#   and the size of the crop after cropping
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], output_size=7, sampling_ratio=2)

# put together inside a FasterRCNN model
model = FasterRCNN(backbone, num_classes=2, rpn_anchor_generator=anchor_generator, box_roi_pool=roi_pooler)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth


  0%|          | 0.00/13.6M [00:00<?, ?B/s]

TypeError: ignored