# Train Mask R-CNN Net for Object Detection

## 1. Initialize device and dataset

In [None]:
# import libraries
import random
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import numpy as np
import torch.utils.data
import cv2
import torchvision.models.segmentation
import torch
import os

In [None]:
# initialize parameters of data
BATCH_SIZE = 2           # number of images used in each training iteration
IMAGE_SIZE = [600, 600]  # [width, height] that every image and mask is resized to

# define device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

``IMAGE_SIZE = [Width, Height]`` are the dimensions of the image used for training. All images during the training processes will be resized to this size.

``BATCH_SIZE`` is the number of images that will be used for each iteration of the training.

``BATCH_SIZE * Width * Height`` will be proportional to the memory requirement of the training. Depending on your hardware, it might be necessary to use a smaller BATCH_SIZE or image size to avoid out-of-memory problems.

Note that since only a single image size is used, the trained net is likely to work well only on images of this size. In most cases it is better to change the image size for each training batch.

``device`` is automatically set to the device on which the net will run (GPU or CPU). In practice, training without a strong GPU is extremely slow.

In [None]:
# root folder of the training set; every entry inside it is one sample
train_dir = 'LabPicsChemistry/Train'

# one path per entry of train_dir, in the order returned by os.listdir
imgs = [train_dir + '/' + pth + '//' for pth in os.listdir(train_dir)]

``train_dir`` is the LabPics V2 dataset train folder.

``imgs`` is the list of all images in the trainset

## 2. Create dataloader

Next, a data loader function is created that will allow for loading a batch of random images and their data for training. The data will contain the image and the masks of all the objects in the image. Each mask will be saved as a black-white (0/1) image.

These masks are images the same size as the RGB image where the region of the object instances is marked 1 and the rest are marked 0.

In [None]:
# dataloader
def load_data():
    """Build one random training batch of images and their targets.

    Draws random sample folders from ``imgs`` until ``BATCH_SIZE`` usable
    samples are collected. For each sample, ``Image.jpg`` is read and resized
    to ``IMAGE_SIZE``, and every file in the ``Vessels`` subfolder is read as
    a grayscale mask, binarized to 0/1 and resized to the same size. Samples
    that end up with no usable mask are skipped and another one is drawn.

    Returns:
        A tuple ``(batch_imgs, batch_data)``:
          * ``batch_imgs`` - float32 tensor laid out as
            ``[BATCH_SIZE, channels, height, width]``.
          * ``batch_data`` - list with one dict per image holding
            ``'boxes'`` (float32, ``[N, 4]`` as ``[x1, y1, x2, y2]``),
            ``'masks'`` (uint8, ``[N, height, width]``) and
            ``'labels'`` (int64, all ones, ``[N]``).

    Raises:
        ValueError: if ``imgs`` is empty.
        FileNotFoundError: if a sample's ``Image.jpg`` cannot be read.
        RuntimeError: if too many drawn samples have no usable mask.
    """
    if not imgs:
        raise ValueError('the list of training images (imgs) is empty')

    size = tuple(IMAGE_SIZE)  # cv2.resize expects dsize as (width, height)
    batch_imgs = []
    batch_data = []

    # bound the number of draws so a dataset without usable masks cannot
    # make this function loop forever
    attempts_left = 100 * BATCH_SIZE

    while len(batch_imgs) < BATCH_SIZE:
        if attempts_left == 0:
            raise RuntimeError('could not find enough samples with usable masks')
        attempts_left -= 1

        # pick a random sample folder from the list
        sample_dir = random.choice(imgs)

        # load the image and resize it to the standard size (IMAGE_SIZE)
        img_file = os.path.join(sample_dir, 'Image.jpg')
        img = cv2.imread(img_file)
        if img is None:
            raise FileNotFoundError('cannot read image: ' + img_file)
        # interpolation must be passed by keyword: the third positional
        # parameter of cv2.resize is dst, not the interpolation flag
        img = cv2.resize(img, size, interpolation=cv2.INTER_LINEAR)

        # subfolder where the vessel instance masks are stored
        mask_dir = os.path.join(sample_dir, 'Vessels')
        masks = []
        boxes = []

        for msk_name in os.listdir(mask_dir):
            ves_mask = cv2.imread(os.path.join(mask_dir, msk_name),
                                  cv2.IMREAD_GRAYSCALE)
            if ves_mask is None:
                # not a readable image file, ignore it
                continue

            # the mask is stored in 0-255 format; convert it to 0/1
            ves_mask = (ves_mask > 0).astype(np.uint8)

            # nearest-neighbour resizing keeps the mask strictly binary
            ves_mask = cv2.resize(ves_mask, size,
                                  interpolation=cv2.INTER_NEAREST)
            if not ves_mask.any():
                # an empty mask would give a zero-area bounding box
                continue

            # x, y: top-left corner of the bounding box; w, h: its size.
            # Mask R-CNN wants the top-left and bottom-right corners instead.
            x, y, w, h = cv2.boundingRect(ves_mask)
            masks.append(ves_mask)
            boxes.append([x, y, x + w, y + h])

        if not masks:
            # nothing to learn from this sample, draw another one
            continue

        # stack all the information about the image into one dictionary;
        # every object gets the same class label (1)
        data = {
            'boxes': torch.tensor(boxes, dtype=torch.float32),
            'masks': torch.as_tensor(np.stack(masks), dtype=torch.uint8),
            'labels': torch.ones((len(masks),), dtype=torch.int64),
        }

        batch_imgs.append(torch.as_tensor(img, dtype=torch.float32))
        batch_data.append(data)

    # [batch, height, width, channels] -> [batch, channels, height, width]
    batch_imgs = torch.stack(batch_imgs, 0).permute(0, 3, 1, 2)

    return batch_imgs, batch_data

## 3. Train the model

Now start building the net. First, load a mask RCNN model that was already pretrained on the COCO dataset:

In [None]:
# load a Mask R-CNN (ResNet-50 FPN) model with pretrained weights
build_mask_rcnn = torchvision.models.detection.maskrcnn_resnet50_fpn
model = build_mask_rcnn(pretrained=True)

A pretrained model that uses existing knowledge can learn new tasks and datasets much faster than a model that was not trained before.

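The COCO dataset contains 80 object categories (the COCO-pretrained torchvision detection models predict 91 class indices, including background). This project needs only two classes (background and object), so the final layer of the box predictor is replaced to predict two classes.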

In [None]:
# number of input features of the classification layer in the existing box head
cls_layer = model.roi_heads.box_predictor.cls_score
in_features = cls_layer.in_features

# swap in a fresh Fast R-CNN box predictor that outputs 2 classes
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 2)

# move the model to the training device (GPU or CPU)
model.to(device)

In [None]:
# the optimizer decides how the net weights are updated during training:
# AdamW over all model parameters with a learning rate of 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [None]:
# switch the model to training mode
model.train(mode=True)

In [None]:
# main training loop
NUM_STEPS = 10001  # steps 0..10000, so the last checkpoint is '10000.torch'
SAVE_EVERY = 200   # a checkpoint is written every SAVE_EVERY steps

for i in range(NUM_STEPS):

    # load a random batch using the data loader function
    images, targets = load_data()

    # move the data to the training device (CPU/GPU); the model takes
    # a list of image tensors and a list of target dictionaries
    images = [image.to(device) for image in images]
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

    # set gradients of all optimized tensors to zero
    optimizer.zero_grad()

    # in train mode the model returns a dictionary of losses
    loss_dict = model(images, targets)

    # the loss is composed of several parts (class loss, bounding box loss,
    # mask loss, ...); they are summed to get the total loss as one number
    losses = sum(loss_dict.values())

    # update the neural net weights using backpropagation
    losses.backward()
    optimizer.step()

    # print out losses
    print(i, 'loss:', losses.item())

    # save the trained model once every SAVE_EVERY (200) steps
    if i % SAVE_EVERY == 0:
        checkpoint_path = str(i) + '.torch'
        torch.save(model.state_dict(), checkpoint_path)
        print('Save model to:', checkpoint_path)

## 4. Test the model

Once the training is finished the model has to be tested.

The script is similar to the training script. The first part is simply loading the net as before.

In [None]:
# model for testing
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# rebuild the same architecture that was trained: Mask R-CNN with a
# 2-class box predictor (the weights are replaced by the checkpoint below)
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes=2)

# the only difference is to load the saved model,
# and set the model to evaluation state;
# map_location lets a checkpoint saved on the GPU be loaded on a CPU-only machine
model.load_state_dict(torch.load('10000.torch', map_location=device))
model.to(device)
model.eval()

In [None]:
# path of the image to run the model on; change it to point to your own image
img_path = 'Image.jpg'

# a single image is loaded, resized to standard size, and converted to PyTorch format
image = cv2.imread(img_path)
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError('cannot read image: ' + img_path)

# interpolation must be passed by keyword: the third positional
# parameter of cv2.resize is dst, not the interpolation flag
image = cv2.resize(image, tuple(IMAGE_SIZE), interpolation=cv2.INTER_LINEAR)

# [height, width, channels] -> [channels, height, width]
image = torch.as_tensor(image, dtype=torch.float32).permute(2, 0, 1)

# the model takes a list of image tensors
images = [image.to(device)]

In [None]:
# run the image through the net; no training happens here, so gradient
# tracking is switched off with torch.no_grad().
# pred is indexed per input image below (pred[0] for the single image)
with torch.no_grad():
    pred = model(images)

This runs the image through the net and gets a prediction for the objects in the image. Note that we are not training the net, so we do not need to collect gradients (``no_grad``); this makes the net run much faster.

The prediction is composed of several parts: “masks” which corresponds to the mask (regions) of every object in the image. “Scores” correspond to how likely the predicted mask is correct. In addition, there is the predicted bounding box and classes.

In [None]:
# get the image back as a [height, width, channels] uint8 array, and a copy to draw on
im = images[0].permute(1, 2, 0).detach().cpu().numpy().astype(np.uint8)
im2 = im.copy()

# title of the display window: the score of the last prediction, or a
# fixed text when the net returned no predictions at all
window_title = 'no predictions'

# go over all the predictions
# and display only those objects with "scores" larger than 0.8
prediction = pred[0]
for msk, scr in zip(prediction['masks'], prediction['scores']):

    # each predicted mask has a leading dimension of size 1; drop it
    msk = msk[0].detach().cpu().numpy()
    scr = scr.detach().cpu().numpy()
    window_title = str(scr)

    if scr > 0.8:
        # paint the pixels of this object with one random color
        im2[msk > 0.5] = [random.randint(0, 255) for _ in range(3)]

# show the original image and the annotated copy side by side
cv2.imshow(window_title, np.hstack([im, im2]))
cv2.waitKey()

Note that the predicted object ‘masks’ are saved as a matrix in the same size as the image with each pixel having a value that corresponds to how likely it is part of the object

Only pixels with values larger than 0.5 are likely to be part of the objects. This is displayed by marking these pixels with a different random color for each object.