In [1]:
import os
import sys

import numpy as np
import torch
import torchvision
from PIL import Image
from torchvision import transforms
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

In [2]:
# Temporarily add the full path of the directory that stores the helper
# modules to sys.path, so the `engine` / `utils` imports in the next cell
# can be resolved from there.
cwd = os.getcwd()
_detection_refs_dir = os.path.join(cwd, "vision", "references", "detection")
# Re-running this cell must not pile up duplicate sys.path entries.
if _detection_refs_dir not in sys.path:
    sys.path.append(_detection_refs_dir)

In [3]:
from engine import train_one_epoch, evaluate
import utils

<br>
<br>

In [4]:
class PennFudanDataset(object):
    """Penn-Fudan pedestrian dataset yielding ``(image, target)`` pairs.

    The dataset expects ``root`` to contain two sub-directories:
    ``PNGImages`` (the pictures) and ``PedMasks`` (one mask per picture).
    Files of both directories are sorted by name so that the i-th image is
    paired with the i-th mask.

    Args:
        root: Directory holding ``PNGImages`` and ``PedMasks``.
        transforms: Optional callable applied to the image ONLY. The target
            is returned untouched, so geometric transforms (flip, crop, ...)
            given here would desynchronise the image from its boxes/masks.

    Raises:
        ValueError: If the two directories hold a different number of files,
            in which case images and masks cannot be paired by position.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))
        if len(self.imgs) != len(self.masks):
            raise ValueError(
                "PNGImages holds %d files but PedMasks holds %d; "
                "images and masks cannot be paired"
                % (len(self.imgs), len(self.masks))
            )

    @staticmethod
    def _build_target(mask, idx):
        """Turn an instance-encoded mask array into a detection target dict.

        Args:
            mask: 2-D integer array; 0 is background and every other distinct
                value marks one instance.
            idx: Index of the sample, stored as ``image_id``.

        Returns:
            dict with the keys ``boxes`` (N x 4 float32, xmin/ymin/xmax/ymax),
            ``labels`` (N int64, all ones), ``masks`` (N x H x W uint8),
            ``image_id``, ``area`` (N float32) and ``iscrowd`` (N int64 zeros).
        """
        # instances are encoded as different colors; drop the background id
        # explicitly instead of assuming it is always the first unique value
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        # split the color-encoded mask into a set of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append(
                [int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())]
            )

        # reshape keeps the (N, 4) layout even when there is no instance,
        # so the column indexing below stays valid
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        return {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, target)``."""
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])

        # context managers release the file handles deterministically
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        # note that the mask is NOT converted to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        with Image.open(mask_path) as raw_mask:
            mask = np.array(raw_mask)

        target = self._build_target(mask, idx)

        if self.transforms is not None:
            # image-only transform; the target is left as built above
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Number of samples (one per file in ``PNGImages``)."""
        return len(self.imgs)

<br>
<br>

In [5]:
def get_model_instance_segmentation(num_classes):
    """Create a Mask R-CNN whose prediction heads are sized for ``num_classes``.

    Args:
        num_classes: Number of output classes for both new heads
            (the caller counts the background as one of them).

    Returns:
        The ``maskrcnn_resnet50_fpn`` model with freshly initialised box and
        mask predictors; everything else keeps the loaded pre-trained weights.
    """
    # start from the instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    roi_heads = model.roi_heads

    # box head: keep the input width of the old classifier, swap the predictor
    box_in_features = roi_heads.box_predictor.cls_score.in_features
    roi_heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # mask head: same idea, with a 256-channel hidden layer
    mask_in_channels = roi_heads.mask_predictor.conv5_mask.in_channels
    mask_hidden_channels = 256
    roi_heads.mask_predictor = MaskRCNNPredictor(
        mask_in_channels, mask_hidden_channels, num_classes
    )

    return model

<br>
<br>

In [6]:
from torchvision.transforms import functional as F
from torch import Tensor

from typing import List, Tuple, Dict, Optional, Union

def _flip_coco_person_keypoints(kps, width):
    """Mirror person keypoints horizontally for an image of ``width`` pixels.

    Assumes the COCO person layout, i.e. ``kps`` is indexed as
    (instance, 17 keypoints, (x, y, visibility)) -- TODO confirm against the
    dataset that supplies "keypoints".
    """
    # left/right joints trade places when the picture is mirrored
    flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
    flipped_data = kps[:, flip_inds]
    flipped_data[..., 0] = width - flipped_data[..., 0]
    # keep the convention that an invisible keypoint has x = y = 0
    inds = flipped_data[..., 2] == 0
    flipped_data[inds] = 0
    return flipped_data


class MyRandomHorizontalFlip(transforms.RandomHorizontalFlip):
    """Random horizontal flip that can also mirror a detection target.

    With probability ``self.p`` the image is flipped. When a ``target`` dict
    is supplied, its "boxes" are mirrored in place and its "masks" and
    "keypoints" entries (if present) are replaced by mirrored versions.
    """

    def forward(self,
                image: Tensor,
                target: Optional[Dict[str, Tensor]] = None
                ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
        """Flip ``image`` (and ``target``) with probability ``self.p``.

        Args:
            image: Image to flip; passed straight to ``F.hflip``.
            target: Optional annotation dict with a "boxes" entry and,
                optionally, "masks" and "keypoints".

        Returns:
            Always a ``(image, target)`` tuple, even when ``target`` is None.
        """
        if torch.rand(1) < self.p:
            image = F.hflip(image)
            if target is not None:
                _, _, width = F.get_dimensions(image)
                # x-coordinates swap roles: new xmin = width - old xmax, etc.
                target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]]
                if "masks" in target:
                    target["masks"] = target["masks"].flip(-1)
                if "keypoints" in target:
                    keypoints = target["keypoints"]
                    keypoints = _flip_coco_person_keypoints(keypoints, width)
                    target["keypoints"] = keypoints
        return image, target

<br>

In [7]:
class Compose(object):
    """Chain image-only transforms, always handing back a bare image.

    Some transforms in this notebook return an ``(image, target)`` tuple
    instead of just the image. Only the image part is forwarded to the next
    transform and returned to the caller.

    Targets are deliberately not passed through: feeding the target dict to
    the image transforms failed with
    ``TypeError: img should be PIL Image. Got <class 'dict'>``
    (see https://python.iitter.com/other/105872.html).

    Args:
        transforms: Iterable of callables, each taking a single image.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    @staticmethod
    def _image_only(result):
        """Return the image from either a bare image or an (image, ...) tuple."""
        if isinstance(result, tuple):
            return result[0]
        return result

    def __call__(self, image):
        """Apply every transform in order and return the resulting image."""
        image = self._image_only(image)
        for t in self.transforms:
            # unwrap right after the call so that a tuple produced by the
            # LAST transform cannot leak out to the caller either
            image = self._image_only(t(image))
        return image

In [8]:
def get_transform(train):
    """Build the image preprocessing pipeline handed to ``PennFudanDataset``.

    Args:
        train: Kept for interface compatibility; the training and evaluation
            pipelines are currently identical (see the note in the body).

    Returns:
        Compose: pipeline that converts the input image to a tensor.
    """
    my_transforms = []

    # NOTE: MyRandomHorizontalFlip is intentionally NOT added for training.
    # PennFudanDataset and Compose call the pipeline with the image only, so
    # a random flip here mirrors the picture while its boxes and masks stay
    # un-mirrored, i.e. roughly half of the training samples would carry
    # wrong annotations. Re-enable the flip only once the target is passed
    # through the pipeline together with the image.

    my_transforms.append(transforms.ToTensor())
    # ToTensor() has to come last, see:
    # https://discuss.pytorch.org/t/t-compose-typeerror-call-takes-2-positional-arguments-but-3-were-given/62529
    # https://python.iitter.com/other/105872.html

    return Compose(my_transforms)

<br>
<br>
<br>

In [9]:
def main(data_root='./data/PennFudanPed', num_epochs=10, num_test=50,
         batch_size=2, seed=None):
    """Fine-tune the instance segmentation model and evaluate it every epoch.

    Args:
        data_root: Directory passed to ``PennFudanDataset``.
        num_epochs: Number of training epochs.
        num_test: Number of samples held out for evaluation; the remaining
            ones are used for training.
        batch_size: Batch size of the training loader (evaluation uses 1).
        seed: Optional integer passed to ``torch.manual_seed`` so that the
            random train/test split is repeatable. ``None`` leaves the global
            RNG untouched.

    Raises:
        ValueError: If ``num_test`` does not leave at least one sample on
            each side of the split.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and defined transformations
    dataset = PennFudanDataset(data_root, get_transform(train=True))
    dataset_test = PennFudanDataset(data_root, get_transform(train=False))

    num_samples = len(dataset)
    if not 0 < num_test < num_samples:
        raise ValueError(
            "num_test must be between 1 and %d, got %r" % (num_samples - 1, num_test)
        )

    # split the dataset in train and test set; both subsets share one
    # permutation so that no sample ends up on both sides
    indices = torch.randperm(num_samples).tolist()
    split = num_samples - num_test
    dataset = torch.utils.data.Subset(dataset, indices[:split])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[split:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True, num_workers=0,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=0,
        collate_fn=utils.collate_fn)

    # get the model using our helper function
    model = get_model_instance_segmentation(num_classes)

    # move model to the right device
    model.to(device)

    # construct an optimizer over the trainable parameters only
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # and a learning rate scheduler: multiply the lr by 0.1 every 3 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    print("That's it!")

In [10]:
# Run the whole training / evaluation loop; progress is printed to the cell output.
main()

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Epoch: [0]  [ 0/60]  eta: 0:04:35  lr: 0.000090  loss: 3.3705 (3.3705)  loss_classifier: 0.9240 (0.9240)  loss_box_reg: 0.2794 (0.2794)  loss_mask: 2.1456 (2.1456)  loss_objectness: 0.0167 (0.0167)  loss_rpn_box_reg: 0.0049 (0.0049)  time: 4.5896  data: 0.0190  max mem: 2135
Epoch: [0]  [10/60]  eta: 0:00:40  lr: 0.000936  loss: 2.3859 (2.4675)  loss_classifier: 0.5348 (0.5664)  loss_box_reg: 0.1756 (0.1972)  loss_mask: 1.4276 (1.6468)  loss_objectness: 0.0533 (0.0486)  loss_rpn_box_reg: 0.0055 (0.0084)  time: 0.8026  data: 0.0200  max mem: 3876
Epoch: [0]  [20/60]  eta: 0:00:24  lr: 0.001783  loss: 1.1219 (1.7532)  loss_classifier: 0.1936 (0.3668)  loss_box_reg: 0.1637 (0.1818)  loss_mask: 0.8076 (1.1399)  loss_objectness: 0.0319 (0.0509)  loss_rpn_box_reg: 0.0095 (0.0139)  time: 0.4248  data: 0.0197  max mem: 3876
Epoch: [0]  [30/60]  eta: 0:00:16  lr: 0.002629  loss: 0.8934 (1.4774)  loss_classifier: 0.1221 (0.2889)  loss_box_reg: 0.1686 (0.1841)  loss_mask: 0.4902 (0.9512)  loss_ob

Epoch: [2]  [ 0/60]  eta: 0:00:28  lr: 0.005000  loss: 0.4179 (0.4179)  loss_classifier: 0.0469 (0.0469)  loss_box_reg: 0.0581 (0.0581)  loss_mask: 0.3038 (0.3038)  loss_objectness: 0.0077 (0.0077)  loss_rpn_box_reg: 0.0014 (0.0014)  time: 0.4735  data: 0.0210  max mem: 3876
Epoch: [2]  [10/60]  eta: 0:00:23  lr: 0.005000  loss: 0.6155 (0.6781)  loss_classifier: 0.0716 (0.0899)  loss_box_reg: 0.0999 (0.1353)  loss_mask: 0.3772 (0.4318)  loss_objectness: 0.0077 (0.0096)  loss_rpn_box_reg: 0.0074 (0.0116)  time: 0.4651  data: 0.0213  max mem: 3876
Epoch: [2]  [20/60]  eta: 0:00:17  lr: 0.005000  loss: 0.5891 (0.6083)  loss_classifier: 0.0716 (0.0825)  loss_box_reg: 0.0940 (0.1203)  loss_mask: 0.3715 (0.3880)  loss_objectness: 0.0048 (0.0080)  loss_rpn_box_reg: 0.0068 (0.0095)  time: 0.4394  data: 0.0199  max mem: 3876
Epoch: [2]  [30/60]  eta: 0:00:13  lr: 0.005000  loss: 0.5665 (0.6430)  loss_classifier: 0.0675 (0.0906)  loss_box_reg: 0.0940 (0.1310)  loss_mask: 0.3653 (0.4023)  loss_ob

Epoch: [4]  [ 0/60]  eta: 0:00:19  lr: 0.000500  loss: 0.5172 (0.5172)  loss_classifier: 0.0610 (0.0610)  loss_box_reg: 0.0715 (0.0715)  loss_mask: 0.3783 (0.3783)  loss_objectness: 0.0008 (0.0008)  loss_rpn_box_reg: 0.0056 (0.0056)  time: 0.3263  data: 0.0140  max mem: 3876
Epoch: [4]  [10/60]  eta: 0:00:21  lr: 0.000500  loss: 0.7057 (0.6765)  loss_classifier: 0.1000 (0.1021)  loss_box_reg: 0.1082 (0.1324)  loss_mask: 0.4238 (0.4223)  loss_objectness: 0.0071 (0.0093)  loss_rpn_box_reg: 0.0100 (0.0103)  time: 0.4201  data: 0.0236  max mem: 3876
Epoch: [4]  [20/60]  eta: 0:00:16  lr: 0.000500  loss: 0.5175 (0.5991)  loss_classifier: 0.0801 (0.0881)  loss_box_reg: 0.0897 (0.1146)  loss_mask: 0.3383 (0.3802)  loss_objectness: 0.0069 (0.0081)  loss_rpn_box_reg: 0.0043 (0.0080)  time: 0.4092  data: 0.0216  max mem: 3876
Epoch: [4]  [30/60]  eta: 0:00:12  lr: 0.000500  loss: 0.5175 (0.5829)  loss_classifier: 0.0668 (0.0834)  loss_box_reg: 0.0731 (0.1097)  loss_mask: 0.3383 (0.3753)  loss_ob

Epoch: [6]  [ 0/60]  eta: 0:00:29  lr: 0.000050  loss: 1.0509 (1.0509)  loss_classifier: 0.1322 (0.1322)  loss_box_reg: 0.2038 (0.2038)  loss_mask: 0.6973 (0.6973)  loss_objectness: 0.0067 (0.0067)  loss_rpn_box_reg: 0.0110 (0.0110)  time: 0.4960  data: 0.0280  max mem: 3876
Epoch: [6]  [10/60]  eta: 0:00:21  lr: 0.000050  loss: 0.7088 (0.6590)  loss_classifier: 0.0817 (0.0918)  loss_box_reg: 0.1038 (0.1315)  loss_mask: 0.4201 (0.4203)  loss_objectness: 0.0067 (0.0052)  loss_rpn_box_reg: 0.0086 (0.0101)  time: 0.4316  data: 0.0220  max mem: 3876
Epoch: [6]  [20/60]  eta: 0:00:16  lr: 0.000050  loss: 0.6236 (0.6687)  loss_classifier: 0.0817 (0.0956)  loss_box_reg: 0.0874 (0.1301)  loss_mask: 0.3929 (0.4265)  loss_objectness: 0.0068 (0.0067)  loss_rpn_box_reg: 0.0066 (0.0098)  time: 0.4066  data: 0.0189  max mem: 3876
Epoch: [6]  [30/60]  eta: 0:00:12  lr: 0.000050  loss: 0.5877 (0.6551)  loss_classifier: 0.0726 (0.0909)  loss_box_reg: 0.0874 (0.1233)  loss_mask: 0.3929 (0.4250)  loss_ob

Epoch: [8]  [ 0/60]  eta: 0:00:29  lr: 0.000050  loss: 0.8140 (0.8140)  loss_classifier: 0.1670 (0.1670)  loss_box_reg: 0.2430 (0.2430)  loss_mask: 0.3704 (0.3704)  loss_objectness: 0.0068 (0.0068)  loss_rpn_box_reg: 0.0269 (0.0269)  time: 0.4951  data: 0.0390  max mem: 3876
Epoch: [8]  [10/60]  eta: 0:00:21  lr: 0.000050  loss: 0.5692 (0.5615)  loss_classifier: 0.0799 (0.0870)  loss_box_reg: 0.0995 (0.1068)  loss_mask: 0.3464 (0.3502)  loss_objectness: 0.0033 (0.0084)  loss_rpn_box_reg: 0.0074 (0.0090)  time: 0.4209  data: 0.0211  max mem: 3876
Epoch: [8]  [20/60]  eta: 0:00:17  lr: 0.000050  loss: 0.5044 (0.4889)  loss_classifier: 0.0782 (0.0809)  loss_box_reg: 0.0832 (0.0897)  loss_mask: 0.2874 (0.3040)  loss_objectness: 0.0031 (0.0065)  loss_rpn_box_reg: 0.0048 (0.0077)  time: 0.4243  data: 0.0196  max mem: 3876
Epoch: [8]  [30/60]  eta: 0:00:12  lr: 0.000050  loss: 0.4920 (0.5283)  loss_classifier: 0.0735 (0.0822)  loss_box_reg: 0.0681 (0.0969)  loss_mask: 0.2721 (0.3359)  loss_ob

<br>
<br>
<br>