In [1]:
import os
import torch
import torchvision
from torchvision import transforms
from engine import train_one_epoch, evaluate
import utils
from coco import CocoSubset

root = 'E:/Resource/Dataset/COCO/SubCOCO'
annDir = os.path.join(root, 'annotations/instances_{}.json')
# coco = COCO(annDir.format('train2017'))

loading annotations into memory...
Done (t=2.01s)
creating index...
index created!
loading annotations into memory...
Done (t=0.10s)
creating index...
index created!
Amount of train images:
Dataset CocoSubset
    Number of datapoints: 19759
    Root location: E:/Resource/Dataset/COCO/SubCOCO\train2017
Amount of validation images:
Dataset CocoSubset
    Number of datapoints: 870
    Root location: E:/Resource/Dataset/COCO/SubCOCO\val2017
19759
<class 'coco.CocoSubset'>
<PIL.Image.Image image mode=RGB size=480x640 at 0x257BB0FDC48> {'labels': tensor([1, 4]), 'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 

## Creating Datasets

In [2]:
img_transform = {
    'train': transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(0.5)
    ]),
    'val':transforms.Compose([
        transforms.ToTensor()
    ])}

target_transform = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(0.5)
    ]),
    'val':transforms.Compose([])}

coco_train = CocoSubset(os.path.join(root, 'train2017'),
                        annDir.format('train2017'),
                        img_transform=img_transform['train'],
                        target_transform=target_transform['train'])

coco_val = CocoSubset(os.path.join(root, 'val2017'),
                      annDir.format('val2017'),
                      img_transform=img_transform['val'],
                      target_transform=target_transform['val'])

loading annotations into memory...
Done (t=1.68s)
creating index...
index created!
loading annotations into memory...
Done (t=0.06s)
creating index...
index created!


In [3]:
print(len(coco_train))
print(type(coco_train))
print(coco_train[0])

19759
<class 'coco.CocoSubset'>
(tensor([[[0.5216, 0.5176, 0.5412,  ..., 1.0000, 1.0000, 1.0000],
         [0.5451, 0.5176, 0.5294,  ..., 1.0000, 1.0000, 1.0000],
         [0.5373, 0.5098, 0.5216,  ..., 1.0000, 1.0000, 1.0000],
         ...,
         [0.4196, 0.4902, 0.4157,  ..., 0.6549, 0.5686, 0.5490],
         [0.4392, 0.4196, 0.4275,  ..., 0.7020, 0.6471, 0.5647],
         [0.4784, 0.4549, 0.4078,  ..., 0.6157, 0.6824, 0.6314]],

        [[0.4941, 0.5333, 0.5176,  ..., 1.0000, 1.0000, 1.0000],
         [0.5098, 0.5216, 0.5059,  ..., 1.0000, 1.0000, 1.0000],
         [0.5020, 0.5098, 0.5059,  ..., 1.0000, 1.0000, 1.0000],
         ...,
         [0.2902, 0.4235, 0.3569,  ..., 0.6039, 0.5647, 0.5059],
         [0.2980, 0.4196, 0.3529,  ..., 0.6510, 0.6392, 0.5098],
         [0.3059, 0.4667, 0.3490,  ..., 0.5843, 0.6745, 0.6078]],

        [[0.5216, 0.5373, 0.5255,  ..., 1.0000, 1.0000, 1.0000],
         [0.5294, 0.5294, 0.5137,  ..., 1.0000, 1.0000, 1.0000],
         [0.5216, 0.5176,

In [4]:
data_loader_train = torch.utils.data.DataLoader(coco_train,
                                         batch_size=2,
                                         shuffle=True,
                                         num_workers=8,
                                         collate_fn=utils.collate_fn)
data_loader_val = torch.utils.data.DataLoader(coco_val,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=8,
                                              collate_fn=utils.collate_fn)

In [5]:
data_loader_train

<torch.utils.data.dataloader.DataLoader at 0x257cc128708>

## Defining Model

In [6]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
torch.manual_seed(1)

def get_instance_segmentation_model(num_classes):
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                      hidden_layer,
                                                      num_classes)
    return model

In [7]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device('cpu')

num_classes = 5

model = get_instance_segmentation_model(num_classes)
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, 
                            lr=0.005,
                            momentum=0.9,
                            weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                              step_size=3,
                                              gamma=0.1)

In [8]:
num_epochs = 1

for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader_train, device, epoch, print_freq=10)
    lr_scheduler.step()
    evaluate(model, data_loader_val, device=device)

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:766.)
  keep = keep.nonzero().squeeze(1)


Epoch: [0]  [   0/9880]  eta: 36 days, 18:16:06  lr: 0.000010  loss: 5.0507 (5.0507)  loss_classifier: 1.5123 (1.5123)  loss_box_reg: 0.1240 (0.1240)  loss_mask: 2.8487 (2.8487)  loss_objectness: 0.5347 (0.5347)  loss_rpn_box_reg: 0.0311 (0.0311)  time: 321.4743  data: 129.1690  max mem: 2361
Epoch: [0]  [  10/9880]  eta: 3 days, 20:56:20  lr: 0.000060  loss: 4.4112 (4.5674)  loss_classifier: 1.4077 (1.3814)  loss_box_reg: 0.2502 (0.2804)  loss_mask: 2.7185 (2.5674)  loss_objectness: 0.0749 (0.2806)  loss_rpn_box_reg: 0.0311 (0.0575)  time: 33.8988  data: 11.9446  max mem: 3750
Epoch: [0]  [  20/9880]  eta: 2 days, 3:42:13  lr: 0.000110  loss: 3.3185 (3.7635)  loss_classifier: 1.1814 (1.1534)  loss_box_reg: 0.1733 (0.2475)  loss_mask: 1.7061 (2.0457)  loss_objectness: 0.0739 (0.2662)  loss_rpn_box_reg: 0.0191 (0.0507)  time: 3.7478  data: 0.1391  max mem: 3750
Epoch: [0]  [  30/9880]  eta: 1 day, 12:06:57  lr: 0.000160  loss: 2.1135 (3.1082)  loss_classifier: 0.6262 (0.9084)  loss_box_

ValueError: All bounding boxes should have positive height and width. Found invaid box [801.5234375, 328.3018798828125, 801.5234375, 328.3018798828125] for target at index 1.