# Object Detection by finetuning, "instance segmentation"

## Defining the Dataset

## Writing a custom dataset for PennFudan (for pedestrian detection and segmentation)

In [1]:
import os
import numpy as np
import torch
from PIL import Image

In [2]:
class PennFudanDataset(torch.utils.data.Dataset):
    """Penn-Fudan pedestrian dataset for detection and instance segmentation.

    ``root`` must contain two folders, ``PNGImages`` and ``PedMasks``. The file
    names of each folder are sorted and then paired by position, so the n-th
    image is matched with the n-th mask.

    Args:
        root: path of the dataset directory.
        transforms: callable ``(img, target) -> (img, target)`` applied to every
            sample, or ``None`` to return the sample untouched.
    """

    def __init__(self,root,transforms):
        self.root=root
        self.transforms=transforms
        # load all image files, sorting them to ensure that they are aligned
        self.imgs=list(sorted(os.listdir(os.path.join(root,"PNGImages"))))
        self.masks=list(sorted(os.listdir(os.path.join(root,"PedMasks"))))

    def __getitem__(self,idx):
        """Return ``(img, target)`` for sample ``idx``.

        ``target`` is a dict with the keys:
            boxes    -- float32 tensor (N, 4), one [xmin, ymin, xmax, ymax] per instance
            labels   -- int64 tensor (N,), all ones (single foreground class)
            masks    -- uint8 tensor (N, H, W), one binary mask per instance
            image_id -- tensor([idx])
            area     -- float32 tensor (N,), box height * box width
            iscrowd  -- int64 tensor (N,), all zeros
        N may be 0 when the mask contains no instance.
        """
        # load image and mask
        img_path=os.path.join(self.root,"PNGImages",self.imgs[idx])
        mask_path=os.path.join(self.root,"PedMasks",self.masks[idx])
        img=Image.open(img_path).convert("RGB")

        # the mask is deliberately NOT converted to RGB: each pixel value is an
        # instance id, with 0 being the background
        mask=Image.open(mask_path)
        # convert the PIL image into a numpy array of instance ids
        # (expected to be 2-D, indexed [row, column])
        mask=np.array(mask)
        # instances are encoded as different ids
        obj_ids=np.unique(mask)
        # drop the background id by value rather than by position, so a mask
        # without any background pixel does not lose its first instance
        obj_ids=obj_ids[obj_ids!=0]

        # split the id-encoded mask into a stack of binary masks, one per instance
        masks=mask==obj_ids[:,None,None]

        # get bounding box coordinates for each mask
        num_objs=len(obj_ids)
        boxes=[]
        for instance_mask in masks:
            rows,cols=np.where(instance_mask)
            boxes.append([np.min(cols),np.min(rows),np.max(cols),np.max(rows)])

        # convert everything into a torch.Tensor; the reshape keeps the (N, 4)
        # layout even when N == 0, so the column indexing below cannot fail
        boxes=torch.as_tensor(boxes,dtype=torch.float32).reshape(-1,4)
        # there is only one class
        labels=torch.ones((num_objs,),dtype=torch.int64)
        masks=torch.as_tensor(masks,dtype=torch.uint8)

        image_id=torch.tensor([idx])
        area=(boxes[:,3]-boxes[:,1])*(boxes[:,2]-boxes[:,0])
        # suppose all instances are not crowd
        iscrowd=torch.zeros((num_objs,),dtype=torch.int64)

        target={
            "boxes":boxes,
            "labels":labels,
            "masks":masks,
            "image_id":image_id,
            "area":area,
            "iscrowd":iscrowd,
        }

        if self.transforms is not None:
            img,target=self.transforms(img,target)

        return img,target

    def __len__(self):
        """Number of samples, i.e. number of files found in ``PNGImages``."""
        return len(self.imgs)

ここではMask R-CNNを実装する．処理の概要は次の通りである．まず元画像を畳み込んで特徴量マップを作り，そこからRoI（注目領域）に対応する部分だけを取り出して，その部分を詳しく調べる．取り出した領域の特徴量マップをプーリングやFC層などに通すことで，その領域がどのクラスに属するかと，そのbboxを得ることができる．

## 1. Finetuning from a pretrained model

In [3]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Two output categories: the single "person" class plus background.
num_classes=2

# Start from a Faster R-CNN (ResNet-50 + FPN) whose weights were pre-trained on COCO.
model=torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# The replacement head has to accept the same number of input features
# that the stock classifier layer consumes.
roi_heads=model.roi_heads
in_features=roi_heads.box_predictor.cls_score.in_features

# Swap the pre-trained box predictor for a fresh one sized for num_classes.
roi_heads.box_predictor=FastRCNNPredictor(in_features,num_classes)

## 2. Modifying the model to add a different backbone

In [4]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# RPN anchors: 5 sizes x 3 aspect ratios = 15 anchors per spatial location.
# Both arguments are Tuple[Tuple[...]] because every feature map may use its
# own sizes and ratios; a single feature map is used here.
anchor_generator=AnchorGenerator(
    sizes=((32,64,128,256,512),),
    aspect_ratios=((0.5,1.0,2.0),),
)

# RoI cropping: which feature maps to crop from and the size of each crop
# after rescaling. A backbone that returns a plain Tensor is addressed as
# feature map 0; a backbone that returns an OrderedDict[Tensor] lets you pick
# the maps by name.
# NOTE(review): newer torchvision releases may expect the string '0' here
# instead of the integer 0 -- confirm against the installed version.
roi_pooler=torchvision.ops.MultiScaleRoIAlign(
    featmap_names=[0],
    output_size=7,
    sampling_ratio=2,
)

# Feature extractor: the convolutional part of a classification network
# with pre-trained weights.
backbone=torchvision.models.mobilenet_v2(pretrained=True).features
# FasterRCNN reads the backbone's channel count from this attribute;
# for mobilenet_v2 it is 1280.
backbone.out_channels=1280

# Assemble the detector from the three pieces.
model=FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler,
)

### An instance segmentation model for PennFudan Dataset

In [5]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

def get_model_instace_segmentation(num_classes,hidden_layer=256,pretrained=True):
    """Build a Mask R-CNN (ResNet-50 + FPN) with heads resized for ``num_classes``.

    Both the box predictor and the mask predictor of the torchvision model are
    replaced by freshly initialised ones, so they have to be trained.

    Args:
        num_classes: number of output classes, background included.
        hidden_layer: channel count of the hidden layer in the new mask
            predictor (default 256, the value previously hard-coded).
        pretrained: forwarded unchanged to ``maskrcnn_resnet50_fpn``
            (default True, the previous behaviour).

    Returns:
        The modified Mask R-CNN model.
    """
    model=torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=pretrained)

    # new box head: same input width as the stock classifier, new class count
    in_features=model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor=FastRCNNPredictor(in_features,num_classes)

    # new mask head: same input channels as the stock mask classifier
    in_features_mask=model.roi_heads.mask_predictor.conv5_mask.in_channels
    model.roi_heads.mask_predictor=MaskRCNNPredictor(in_features_mask,hidden_layer,num_classes)

    return model

### Putting everything together

In [6]:
import HelperFunctions.transforms as T

def get_transform(train):
    """Compose the preprocessing pipeline for one dataset split.

    Tensor conversion is always applied; when ``train`` is truthy a random
    horizontal flip with probability 0.5 is added after it.
    """
    steps=[T.ToTensor()]
    steps+=[T.RandomHorizontalFlip(0.5)] if train else []
    return T.Compose(steps)

### Testing the forward() method (optional: see what the model returns)

In [7]:
import HelperFunctions.utils as utils

# Smoke test: call the detection model once in training mode and once in
# evaluation mode to see what forward() returns in each case.
model=torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
dataset=PennFudanDataset('./data/PennFudanPed',get_transform(train=True))
dataset_test=PennFudanDataset("./data/PennFudanPed",get_transform(train=False))

# one random permutation shared by both views, so the two subsets are disjoint;
# the last 50 samples are held out
indices=torch.randperm(len(dataset)).tolist()

dataset=torch.utils.data.Subset(dataset,indices[:-50])
dataset_test=torch.utils.data.Subset(dataset_test,indices[-50:])

data_loader=torch.utils.data.DataLoader(dataset,batch_size=2,shuffle=True,
                                       num_workers=4,collate_fn=utils.collate_fn)
data_loader_test=torch.utils.data.DataLoader(dataset_test,batch_size=1,shuffle=False,
                                       num_workers=4,collate_fn=utils.collate_fn)

# For testing: the model is still in training mode here (model.eval() has not
# been called yet), so passing images together with targets returns the losses.
images,targets=next(iter(data_loader_test))
images=list(images)
targets=[dict(t) for t in targets]
output=model(images,targets) # training mode: returns a dict of losses

# For inference: switch to eval mode and skip autograd bookkeeping, since no
# gradients are needed for predictions.
model.eval()
x=[torch.rand(3,300,400),torch.rand(3,500,400)]
with torch.no_grad():
    predictions=model(x)
print(predictions)

[{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward>), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=<IndexBackward>)}, {'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward>), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=<IndexBackward>)}]


In [12]:
from HelperFunctions.engine import train_one_epoch, evaluate
import HelperFunctions.utils as utils

def main(num_epochs=10,data_root="./data/PennFudanPed",num_test=50):
    """Fine-tune the instance segmentation model on Penn-Fudan and evaluate it.

    Args:
        num_epochs: number of training epochs (default 10).
        data_root: dataset directory handed to ``PennFudanDataset``
            (default "./data/PennFudanPed").
        num_test: number of randomly chosen samples held out for evaluation
            (default 50); the remaining samples are used for training.

    The model is evaluated on the held-out samples after every epoch.
    Runs on CUDA when it is available, otherwise on the CPU.
    """
    device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(device)

    # background + person
    num_classes=2

    # same files twice: the training view adds augmentation, the test view does not
    dataset=PennFudanDataset(data_root,get_transform(train=True))
    dataset_test=PennFudanDataset(data_root,get_transform(train=False))

    # split the dataset in train and test set; both subsets index into the same
    # permutation, so they do not overlap
    indices=torch.randperm(len(dataset)).tolist()
    dataset=torch.utils.data.Subset(dataset,indices[:-num_test])
    dataset_test=torch.utils.data.Subset(dataset_test,indices[-num_test:])

    data_loader=torch.utils.data.DataLoader(dataset,batch_size=2,shuffle=True,
                                           num_workers=4,collate_fn=utils.collate_fn)
    data_loader_test=torch.utils.data.DataLoader(dataset_test,batch_size=1,shuffle=False,
                                           num_workers=4,collate_fn=utils.collate_fn)

    model=get_model_instace_segmentation(num_classes)

    model.to(device)

    # optimise only the parameters that require gradients
    params=[p for p in model.parameters() if p.requires_grad]
    optimizer=torch.optim.SGD(params,lr=0.005,momentum=0.9,weight_decay=0.0005)
    # multiply the learning rate by 0.1 every 3 epochs
    lr_scheduler=torch.optim.lr_scheduler.StepLR(optimizer,step_size=3,gamma=0.1)

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model,optimizer,data_loader,device,epoch,print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model,data_loader_test,device=device)

    print("That's it!")

In [9]:
# Run the training/evaluation loop defined by main(); it prints the selected
# device first and "That's it!" when all epochs are done.
main()

cuda
Epoch: [0]  [ 0/60]  eta: 0:01:24  lr: 0.000090  loss: 3.7085 (3.7085)  loss_classifier: 0.6153 (0.6153)  loss_box_reg: 0.2626 (0.2626)  loss_mask: 2.7988 (2.7988)  loss_objectness: 0.0131 (0.0131)  loss_rpn_box_reg: 0.0187 (0.0187)  time: 1.4134  data: 0.6887  max mem: 2585
Epoch: [0]  [10/60]  eta: 0:00:16  lr: 0.000936  loss: 1.2064 (1.8924)  loss_classifier: 0.3996 (0.3959)  loss_box_reg: 0.1964 (0.1787)  loss_mask: 0.7177 (1.2865)  loss_objectness: 0.0133 (0.0236)  loss_rpn_box_reg: 0.0065 (0.0075)  time: 0.3302  data: 0.0657  max mem: 2863
Epoch: [0]  [20/60]  eta: 0:00:11  lr: 0.001783  loss: 0.8085 (1.3060)  loss_classifier: 0.2271 (0.2905)  loss_box_reg: 0.1674 (0.1743)  loss_mask: 0.3467 (0.8080)  loss_objectness: 0.0161 (0.0209)  loss_rpn_box_reg: 0.0082 (0.0123)  time: 0.2300  data: 0.0050  max mem: 2950
Epoch: [0]  [30/60]  eta: 0:00:08  lr: 0.002629  loss: 0.5560 (1.0486)  loss_classifier: 0.1075 (0.2259)  loss_box_reg: 0.1298 (0.1626)  loss_mask: 0.2508 (0.6301)  lo

Epoch: [2]  [ 0/60]  eta: 0:00:53  lr: 0.005000  loss: 0.2218 (0.2218)  loss_classifier: 0.0266 (0.0266)  loss_box_reg: 0.0120 (0.0120)  loss_mask: 0.1760 (0.1760)  loss_objectness: 0.0013 (0.0013)  loss_rpn_box_reg: 0.0058 (0.0058)  time: 0.8934  data: 0.6660  max mem: 3135
Epoch: [2]  [10/60]  eta: 0:00:14  lr: 0.005000  loss: 0.1923 (0.1898)  loss_classifier: 0.0254 (0.0269)  loss_box_reg: 0.0120 (0.0130)  loss_mask: 0.1415 (0.1397)  loss_objectness: 0.0011 (0.0014)  loss_rpn_box_reg: 0.0102 (0.0089)  time: 0.2950  data: 0.0650  max mem: 3135
Epoch: [2]  [20/60]  eta: 0:00:10  lr: 0.005000  loss: 0.1870 (0.2007)  loss_classifier: 0.0291 (0.0295)  loss_box_reg: 0.0150 (0.0161)  loss_mask: 0.1314 (0.1449)  loss_objectness: 0.0006 (0.0015)  loss_rpn_box_reg: 0.0071 (0.0087)  time: 0.2390  data: 0.0049  max mem: 3505
Epoch: [2]  [30/60]  eta: 0:00:07  lr: 0.005000  loss: 0.1903 (0.2025)  loss_classifier: 0.0328 (0.0311)  loss_box_reg: 0.0178 (0.0173)  loss_mask: 0.1269 (0.1425)  loss_ob

Epoch: [4]  [ 0/60]  eta: 0:00:58  lr: 0.000500  loss: 0.1299 (0.1299)  loss_classifier: 0.0229 (0.0229)  loss_box_reg: 0.0077 (0.0077)  loss_mask: 0.0954 (0.0954)  loss_objectness: 0.0001 (0.0001)  loss_rpn_box_reg: 0.0038 (0.0038)  time: 0.9795  data: 0.7282  max mem: 3557
Epoch: [4]  [10/60]  eta: 0:00:15  lr: 0.000500  loss: 0.1433 (0.1481)  loss_classifier: 0.0213 (0.0204)  loss_box_reg: 0.0096 (0.0101)  loss_mask: 0.1035 (0.1107)  loss_objectness: 0.0002 (0.0005)  loss_rpn_box_reg: 0.0064 (0.0064)  time: 0.3102  data: 0.0691  max mem: 3557
Epoch: [4]  [20/60]  eta: 0:00:10  lr: 0.000500  loss: 0.1499 (0.1598)  loss_classifier: 0.0213 (0.0247)  loss_box_reg: 0.0088 (0.0113)  loss_mask: 0.1066 (0.1155)  loss_objectness: 0.0004 (0.0008)  loss_rpn_box_reg: 0.0065 (0.0076)  time: 0.2396  data: 0.0041  max mem: 3557
Epoch: [4]  [30/60]  eta: 0:00:08  lr: 0.000500  loss: 0.1566 (0.1598)  loss_classifier: 0.0258 (0.0254)  loss_box_reg: 0.0088 (0.0117)  loss_mask: 0.1109 (0.1141)  loss_ob

Epoch: [6]  [ 0/60]  eta: 0:00:57  lr: 0.000050  loss: 0.3135 (0.3135)  loss_classifier: 0.0830 (0.0830)  loss_box_reg: 0.0428 (0.0428)  loss_mask: 0.1618 (0.1618)  loss_objectness: 0.0068 (0.0068)  loss_rpn_box_reg: 0.0191 (0.0191)  time: 0.9584  data: 0.7207  max mem: 3557
Epoch: [6]  [10/60]  eta: 0:00:14  lr: 0.000050  loss: 0.1417 (0.1589)  loss_classifier: 0.0219 (0.0242)  loss_box_reg: 0.0052 (0.0106)  loss_mask: 0.1084 (0.1166)  loss_objectness: 0.0002 (0.0012)  loss_rpn_box_reg: 0.0054 (0.0063)  time: 0.2933  data: 0.0680  max mem: 3557
Epoch: [6]  [20/60]  eta: 0:00:10  lr: 0.000050  loss: 0.1412 (0.1586)  loss_classifier: 0.0227 (0.0243)  loss_box_reg: 0.0069 (0.0102)  loss_mask: 0.1077 (0.1164)  loss_objectness: 0.0002 (0.0009)  loss_rpn_box_reg: 0.0055 (0.0068)  time: 0.2393  data: 0.0040  max mem: 3557
Epoch: [6]  [30/60]  eta: 0:00:07  lr: 0.000050  loss: 0.1433 (0.1623)  loss_classifier: 0.0259 (0.0265)  loss_box_reg: 0.0089 (0.0106)  loss_mask: 0.1063 (0.1160)  loss_ob

Epoch: [8]  [ 0/60]  eta: 0:00:53  lr: 0.000050  loss: 0.1348 (0.1348)  loss_classifier: 0.0133 (0.0133)  loss_box_reg: 0.0056 (0.0056)  loss_mask: 0.1070 (0.1070)  loss_objectness: 0.0013 (0.0013)  loss_rpn_box_reg: 0.0076 (0.0076)  time: 0.8959  data: 0.6788  max mem: 3557
Epoch: [8]  [10/60]  eta: 0:00:14  lr: 0.000050  loss: 0.1395 (0.1528)  loss_classifier: 0.0163 (0.0250)  loss_box_reg: 0.0056 (0.0107)  loss_mask: 0.1070 (0.1081)  loss_objectness: 0.0008 (0.0019)  loss_rpn_box_reg: 0.0066 (0.0070)  time: 0.2934  data: 0.0654  max mem: 3557
Epoch: [8]  [20/60]  eta: 0:00:10  lr: 0.000050  loss: 0.1452 (0.1570)  loss_classifier: 0.0238 (0.0260)  loss_box_reg: 0.0076 (0.0105)  loss_mask: 0.1027 (0.1116)  loss_objectness: 0.0006 (0.0015)  loss_rpn_box_reg: 0.0054 (0.0074)  time: 0.2365  data: 0.0044  max mem: 3557
Epoch: [8]  [30/60]  eta: 0:00:07  lr: 0.000050  loss: 0.1572 (0.1659)  loss_classifier: 0.0252 (0.0270)  loss_box_reg: 0.0092 (0.0116)  loss_mask: 0.1168 (0.1183)  loss_ob