In [1]:
# Dataset download URL: https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip

In [2]:
import os
import numpy as np
import torch
from PIL import Image

In [3]:
class PennFudanDataset(object):
    """Penn-Fudan pedestrian samples for detection / instance segmentation.

    ``root`` must contain the sub-directories ``PNGImages`` and ``PedMasks``.
    The sorted listings of both directories are paired by position.

    Args:
        root: Directory holding ``PNGImages`` and ``PedMasks``.
        transforms: Callable invoked as ``transforms(img, target)`` that
            returns the transformed ``(img, target)`` pair, or ``None`` to
            return the sample untouched.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # Sort both listings so that image i and mask i line up.
        self.imgs = sorted(os.listdir(os.path.join(root, 'PNGImages')))
        self.masks = sorted(os.listdir(os.path.join(root, 'PedMasks')))

    def __getitem__(self, idx):
        """Load sample ``idx`` and build its target dictionary.

        Returns:
            ``(img, target)`` where ``target`` has the keys ``boxes``
            (float32, ``[N, 4]`` as ``xmin, ymin, xmax, ymax``), ``labels``
            (int64, all ones), ``masks`` (uint8, one binary mask per
            instance), ``image_id``, ``area`` and ``iscrowd`` (int64, all
            zeros). ``N`` may be 0 for a mask without any instance.
        """
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])

        img = Image.open(img_path).convert("RGB")
        # The mask is NOT converted to RGB: every distinct pixel value is
        # one instance id, with 0 used for the background.
        mask = np.array(Image.open(mask_path))

        obj_ids = np.unique(mask)
        # Remove the background id by value rather than by position, so a
        # mask with no background pixel does not lose its first instance.
        obj_ids = obj_ids[obj_ids != 0]

        # Broadcast compare: one boolean (H, W) mask per instance id.
        masks = mask == obj_ids[:, None, None]

        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            rows, cols = np.where(masks[i])
            boxes.append([int(cols.min()), int(rows.min()),
                          int(cols.max()), int(rows.max())])

        # reshape keeps the (N, 4) layout when there are no instances;
        # without it an empty list becomes a 1-D tensor and the column
        # indexing used for the area below raises IndexError.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of samples (length of the image listing)."""
        return len(self.imgs)

We will be using Mask R-CNN, which is built on top of Faster R-CNN.
## 1、Finetuning from a pretrained model

In [4]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Two output classes: background and person.
num_classes = 2

# Start from the pretrained Faster R-CNN (ResNet-50 + FPN) detector.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    pretrained=True,
)

# Swap the box predictor for a fresh one sized for `num_classes`; the new
# head takes the same number of input features as the one it replaces.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(
    in_channels=in_features,
    num_classes=num_classes,
)
'''
Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
'''

'\nDownloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth\n'

## 2、修改模型添加其他backbone

In [5]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# Use only the convolutional feature extractor of a pretrained MobileNetV2
# as the detection backbone.
backbone = torchvision.models.mobilenet_v2(pretrained=True).features

# FasterRCNN reads `out_channels` from the backbone to size its heads;
# it is set by hand here because the `.features` module does not carry it.
backbone.out_channels = 1280

# One feature map -> one tuple of sizes and one tuple of aspect ratios:
# 5 sizes x 3 aspect ratios anchors per spatial location.
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

# A backbone that returns a single tensor has its output stored under the
# string key '0', so the feature-map name must be the string '0'.
# The integer 0 never matches that key, leaving RoIAlign with no feature map.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=7,
                                                sampling_ratio=2)
# Assemble the pieces into a FasterRCNN model
model = FasterRCNN(backbone,
                   num_classes=2,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)

## PennFudan Dataset的实例分割模型（Instance segmentation）
由于我们的数据集非常小，我们希望从预训练的模型中进行微调，因此我们将遵循方法1。

在这里，我们还要计算实例分割掩码，因此我们将使用Mask R-CNN

In [6]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

def get_model_instance_segmentation(num_classes):
    """Return a pretrained Mask R-CNN with heads resized to ``num_classes``.

    Both the box predictor and the mask predictor of the COCO-pretrained
    model are replaced by newly created predictors; every other part of
    the model is left as loaded.

    Args:
        num_classes: Number of output classes for the new heads.

    Returns:
        The modified model.
    """
    # Instance segmentation model pre-trained on COCO.
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    heads = model.roi_heads

    # Box head: a head is the sub-network that turns extracted features
    # into predictions. The new one keeps the input width of the old one.
    box_in = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_in, num_classes)

    # Mask head: same idea, sized from the old predictor's input channels.
    mask_in = heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    heads.mask_predictor = MaskRCNNPredictor(mask_in, hidden_layer, num_classes)

    return model

In [7]:
# data aug
import transforms as T

def get_transform(train):
    """Build the (image, target) transform pipeline.

    Args:
        train: When truthy, a random horizontal flip with probability 0.5
            is appended after the tensor conversion.

    Returns:
        ``T.Compose`` over the selected transforms.
    """
    steps = [T.ToTensor()]
    if train:
        # Augmentation is applied to training samples only.
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)

In [8]:
# https://github.com/pytorch/vision/tree/master/references/detection

In [11]:
# Testing forward() method
import utils

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)

# Training: the model is still in its default training mode here, so a
# call with images AND targets returns the dictionary of losses only
# (no detections).
images, targets = next(iter(data_loader))
images = list(images)
targets = [dict(t) for t in targets]
output = model(images, targets)   # dict of losses

# Inference: in eval mode the model takes images only and returns the
# predictions. Gradients are not needed, so skip building the autograd
# graph.
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
with torch.no_grad():
    predictions = model(x)        # one prediction dict per input image

  "The default behavior for interpolate/upsample with float scale_factor changed "


In [15]:
from engine import train_one_epoch, evaluate
import utils


def main():
    """Fine-tune Mask R-CNN on Penn-Fudan, evaluating after every epoch.

    The dataset is shuffled once; the last 50 shuffled samples form the
    test split and the rest the training split. Training runs for 10
    epochs with SGD and a step learning-rate schedule.
    """
    # Two classes only: background and person.
    num_classes = 2
    # Prefer the GPU, fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Same files twice: only the training view gets augmentation.
    data_root = 'data/PennFudanPed'
    train_set = PennFudanDataset(data_root, get_transform(train=True))
    test_set = PennFudanDataset(data_root, get_transform(train=False))

    # One shared permutation keeps the two splits disjoint.
    shuffled = torch.randperm(len(train_set)).tolist()
    train_set = torch.utils.data.Subset(train_set, shuffled[:-50])
    test_set = torch.utils.data.Subset(test_set, shuffled[-50:])

    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=2,
        shuffle=True,
        num_workers=0,
        collate_fn=utils.collate_fn,
    )
    test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        collate_fn=utils.collate_fn,
    )

    # Build the model with the helper and place it on the chosen device.
    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # Optimise only the parameters that require gradients.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        trainable, lr=0.005, momentum=0.9, weight_decay=0.0005
    )
    # Multiply the learning rate by 0.1 every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=3, gamma=0.1
    )

    num_epochs = 10
    for epoch in range(num_epochs):
        # One training pass, logging every 10 iterations.
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq=10)
        lr_scheduler.step()
        # Evaluate on the held-out split.
        evaluate(model, test_loader, device=device)

    print("That's it!")

In [17]:
# Start training only when this code runs as the top-level program.
if __name__ == "__main__":
    main()

## all

In [21]:
# Sample code from the TorchVision 0.3 Object Detection Finetuning Tutorial
# http://pytorch.org/tutorials/intermediate/torchvision_tutorial.html

import os
import numpy as np
import torch
from PIL import Image

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

from engine import train_one_epoch, evaluate
import utils
import transforms as T


class PennFudanDataset(object):
    """Dataset over the Penn-Fudan pedestrian images and instance masks.

    Files are read from ``<root>/PNGImages`` and ``<root>/PedMasks``.

    Args:
        root: Directory containing the two sub-directories above.
        transforms: ``None`` or a callable taking ``(img, target)`` and
            returning the transformed ``(img, target)``.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    def __getitem__(self, idx):
        """Return ``(img, target)`` for index ``idx``.

        ``target`` is a dict with ``boxes`` (float32 ``[N, 4]``,
        ``xmin, ymin, xmax, ymax``), ``labels`` (int64 ones), ``masks``
        (uint8 binary masks, one per instance), ``image_id``, ``area`` and
        ``iscrowd`` (int64 zeros). A mask with no instance gives ``N == 0``.
        """
        # load images and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        mask = np.array(Image.open(mask_path))

        # instances are encoded as different colors
        obj_ids = np.unique(mask)
        # drop the background id (0) by value; slicing off the first entry
        # would discard a real instance when the mask has no background
        obj_ids = obj_ids[obj_ids != 0]

        # split the color-encoded mask into a set
        # of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for instance_mask in masks:
            ys, xs = np.where(instance_mask)
            boxes.append([int(xs.min()), int(ys.min()),
                          int(xs.max()), int(ys.max())])

        # keep an [N, 4] layout even when N == 0, otherwise the column
        # indexing in the area computation fails on a 1-D empty tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of samples (length of the image listing)."""
        return len(self.imgs)

def get_model_instance_segmentation(num_classes):
    """Create a Mask R-CNN whose prediction heads output ``num_classes``.

    Starts from the model pre-trained on COCO and replaces its box
    predictor and its mask predictor with freshly constructed ones.

    Args:
        num_classes: Number of classes the new heads should predict.

    Returns:
        The model with both predictors replaced.
    """
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # input sizes of the two pre-trained predictors
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256

    # replace the box head first, then the mask head
    model.roi_heads.box_predictor = FastRCNNPredictor(
        in_channels=in_features,
        num_classes=num_classes,
    )
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_channels=in_features_mask,
        dim_reduced=hidden_layer,
        num_classes=num_classes,
    )

    return model


def get_transform(train):
    """Compose the transforms applied to each ``(image, target)`` pair.

    The tensor conversion is always present; a horizontal flip with
    probability 0.5 follows it when ``train`` is truthy.
    """
    augmentations = [T.RandomHorizontalFlip(0.5)] if train else []
    return T.Compose([T.ToTensor()] + augmentations)


def main():
    """Train and evaluate the instance segmentation model on Penn-Fudan.

    Holds out the last 50 samples of a random permutation for testing,
    trains on the remainder for 10 epochs, and runs the evaluation after
    each epoch.
    """
    # train on the GPU or on the CPU, if a GPU is not available
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda') if use_cuda else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2

    # the training view is augmented, the test view is not
    dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('data/PennFudanPed', get_transform(train=False))

    # split the dataset in train and test set
    indices = torch.randperm(len(dataset)).tolist()
    train_indices, test_indices = indices[:-50], indices[-50:]
    dataset = torch.utils.data.Subset(dataset, train_indices)
    dataset_test = torch.utils.data.Subset(dataset_test, test_indices)

    # settings shared by the training and the validation loader
    loader_kwargs = dict(num_workers=0, collate_fn=utils.collate_fn)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, **loader_kwargs)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, **loader_kwargs)

    # get the model using our helper function and move it to the device
    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # optimizer over the parameters that require gradients
    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate is scaled by gamma every `step_size` epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # let's train it for 10 epochs
    for epoch in range(10):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    print("That's it!")
    
# Start training only when this code runs as the top-level program.
if __name__ == "__main__":
    main()


Epoch: [0]  [ 0/60]  eta: 0:00:17  lr: 0.000090  loss: 2.9089 (2.9089)  loss_classifier: 0.5286 (0.5286)  loss_box_reg: 0.3546 (0.3546)  loss_mask: 2.0232 (2.0232)  loss_objectness: 0.0009 (0.0009)  loss_rpn_box_reg: 0.0015 (0.0015)  time: 0.2936  data: 0.0482  max mem: 3628
Epoch: [0]  [10/60]  eta: 0:00:12  lr: 0.000936  loss: 1.6176 (1.7423)  loss_classifier: 0.4559 (0.3923)  loss_box_reg: 0.3457 (0.3088)  loss_mask: 0.7256 (1.0134)  loss_objectness: 0.0216 (0.0222)  loss_rpn_box_reg: 0.0025 (0.0055)  time: 0.2536  data: 0.0633  max mem: 3997
Epoch: [0]  [20/60]  eta: 0:00:09  lr: 0.001783  loss: 1.0891 (1.2462)  loss_classifier: 0.2159 (0.2844)  loss_box_reg: 0.2708 (0.2893)  loss_mask: 0.3175 (0.6467)  loss_objectness: 0.0219 (0.0208)  loss_rpn_box_reg: 0.0031 (0.0051)  time: 0.2394  data: 0.0588  max mem: 3997
Epoch: [0]  [30/60]  eta: 0:00:07  lr: 0.002629  loss: 0.6634 (1.0563)  loss_classifier: 0.1118 (0.2278)  loss_box_reg: 0.2791 (0.2998)  loss_mask: 0.2180 (0.5066)  loss_ob

Epoch: [2]  [ 0/60]  eta: 0:00:13  lr: 0.005000  loss: 0.1530 (0.1530)  loss_classifier: 0.0111 (0.0111)  loss_box_reg: 0.0194 (0.0194)  loss_mask: 0.1217 (0.1217)  loss_objectness: 0.0000 (0.0000)  loss_rpn_box_reg: 0.0008 (0.0008)  time: 0.2248  data: 0.0546  max mem: 3997
Epoch: [2]  [10/60]  eta: 0:00:10  lr: 0.005000  loss: 0.2091 (0.2237)  loss_classifier: 0.0313 (0.0314)  loss_box_reg: 0.0526 (0.0523)  loss_mask: 0.1331 (0.1358)  loss_objectness: 0.0003 (0.0006)  loss_rpn_box_reg: 0.0036 (0.0036)  time: 0.2142  data: 0.0424  max mem: 3997
Epoch: [2]  [20/60]  eta: 0:00:08  lr: 0.005000  loss: 0.2143 (0.2341)  loss_classifier: 0.0282 (0.0320)  loss_box_reg: 0.0536 (0.0599)  loss_mask: 0.1331 (0.1378)  loss_objectness: 0.0003 (0.0006)  loss_rpn_box_reg: 0.0024 (0.0038)  time: 0.2118  data: 0.0397  max mem: 3997
Epoch: [2]  [30/60]  eta: 0:00:06  lr: 0.005000  loss: 0.2324 (0.2379)  loss_classifier: 0.0282 (0.0329)  loss_box_reg: 0.0547 (0.0626)  loss_mask: 0.1274 (0.1381)  loss_ob

Epoch: [4]  [ 0/60]  eta: 0:00:14  lr: 0.000500  loss: 0.2139 (0.2139)  loss_classifier: 0.0373 (0.0373)  loss_box_reg: 0.0566 (0.0566)  loss_mask: 0.1153 (0.1153)  loss_objectness: 0.0009 (0.0009)  loss_rpn_box_reg: 0.0037 (0.0037)  time: 0.2438  data: 0.0663  max mem: 3997
Epoch: [4]  [10/60]  eta: 0:00:12  lr: 0.000500  loss: 0.2015 (0.2039)  loss_classifier: 0.0340 (0.0279)  loss_box_reg: 0.0405 (0.0458)  loss_mask: 0.1154 (0.1261)  loss_objectness: 0.0002 (0.0011)  loss_rpn_box_reg: 0.0035 (0.0030)  time: 0.2464  data: 0.0577  max mem: 3997
Epoch: [4]  [20/60]  eta: 0:00:09  lr: 0.000500  loss: 0.1938 (0.2037)  loss_classifier: 0.0283 (0.0283)  loss_box_reg: 0.0444 (0.0476)  loss_mask: 0.1154 (0.1244)  loss_objectness: 0.0002 (0.0008)  loss_rpn_box_reg: 0.0023 (0.0026)  time: 0.2360  data: 0.0499  max mem: 3997
Epoch: [4]  [30/60]  eta: 0:00:06  lr: 0.000500  loss: 0.1925 (0.2011)  loss_classifier: 0.0285 (0.0283)  loss_box_reg: 0.0460 (0.0458)  loss_mask: 0.1146 (0.1236)  loss_ob

Epoch: [6]  [ 0/60]  eta: 0:00:16  lr: 0.000050  loss: 0.1734 (0.1734)  loss_classifier: 0.0267 (0.0267)  loss_box_reg: 0.0381 (0.0381)  loss_mask: 0.1062 (0.1062)  loss_objectness: 0.0006 (0.0006)  loss_rpn_box_reg: 0.0018 (0.0018)  time: 0.2730  data: 0.0669  max mem: 3997
Epoch: [6]  [10/60]  eta: 0:00:12  lr: 0.000050  loss: 0.2064 (0.2027)  loss_classifier: 0.0306 (0.0328)  loss_box_reg: 0.0501 (0.0481)  loss_mask: 0.1116 (0.1183)  loss_objectness: 0.0006 (0.0009)  loss_rpn_box_reg: 0.0021 (0.0027)  time: 0.2537  data: 0.0586  max mem: 3997
Epoch: [6]  [20/60]  eta: 0:00:10  lr: 0.000050  loss: 0.1847 (0.1943)  loss_classifier: 0.0264 (0.0287)  loss_box_reg: 0.0288 (0.0415)  loss_mask: 0.1147 (0.1207)  loss_objectness: 0.0002 (0.0007)  loss_rpn_box_reg: 0.0021 (0.0026)  time: 0.2494  data: 0.0599  max mem: 3997
Epoch: [6]  [30/60]  eta: 0:00:07  lr: 0.000050  loss: 0.1656 (0.1874)  loss_classifier: 0.0201 (0.0277)  loss_box_reg: 0.0253 (0.0379)  loss_mask: 0.1109 (0.1184)  loss_ob

Epoch: [8]  [ 0/60]  eta: 0:00:14  lr: 0.000050  loss: 0.1583 (0.1583)  loss_classifier: 0.0210 (0.0210)  loss_box_reg: 0.0259 (0.0259)  loss_mask: 0.1074 (0.1074)  loss_objectness: 0.0011 (0.0011)  loss_rpn_box_reg: 0.0029 (0.0029)  time: 0.2433  data: 0.0594  max mem: 4041
Epoch: [8]  [10/60]  eta: 0:00:10  lr: 0.000050  loss: 0.1583 (0.1579)  loss_classifier: 0.0210 (0.0213)  loss_box_reg: 0.0251 (0.0254)  loss_mask: 0.1074 (0.1089)  loss_objectness: 0.0001 (0.0004)  loss_rpn_box_reg: 0.0020 (0.0020)  time: 0.2090  data: 0.0422  max mem: 4041
Epoch: [8]  [20/60]  eta: 0:00:08  lr: 0.000050  loss: 0.1708 (0.1671)  loss_classifier: 0.0221 (0.0232)  loss_box_reg: 0.0257 (0.0294)  loss_mask: 0.1112 (0.1121)  loss_objectness: 0.0002 (0.0004)  loss_rpn_box_reg: 0.0015 (0.0019)  time: 0.2186  data: 0.0454  max mem: 4041
Epoch: [8]  [30/60]  eta: 0:00:06  lr: 0.000050  loss: 0.1817 (0.1785)  loss_classifier: 0.0278 (0.0260)  loss_box_reg: 0.0364 (0.0339)  loss_mask: 0.1151 (0.1157)  loss_ob