In [1]:
import os
import numpy as np
import torch
from PIL import Image
import pycocotools
import matplotlib.pyplot as plt

In [2]:
if torch.cuda.is_available():  
    device = "cuda:0" 
else:  
    device = "cpu"  

In [3]:
torch.cuda.get_device_name(0)

'NVIDIA Quadro RTX 3000'

In [4]:
torch.cuda.current_device()

0

## Import data

In [5]:
traindatadir = 'D:\\preprocessedFull\\train_set2'
testdatadir = 'D:\\preprocessedFull\\test_set2'

In [6]:
import os
import pandas as pd
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T


# Class for a customized dataset
# In this case preprocessed CEM images combined in a 3-channel RGB .jpg format
# and the corresponding mask of present lesions in a 1-channel .png format
class CustomImageDataset(Dataset):
    def __init__(self, root, annotations_file, img_dir, mask_dir, transform=None, target_transform=None):
        # Read the .csv file with all the information
        self.img_labels = pd.read_csv(os.path.join(root, annotations_file))
        # Define the directories of the images and masks
        self.img_dir = os.path.join(root, img_dir)
        self.mask_dir = os.path.join(root, mask_dir)
        # Define whethet transformations are included
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        # Return the number of cases in the dataset
        # In this set, CC and MLO of the same breast are considered different cases
        return len(self.img_labels)

    def __getitem__(self, idx):
        # Read the image and the mask for a case from the directories
        img_path = self.img_labels.iloc[idx, 0]
        mask_path = self.img_labels.iloc[idx,6]
        image = read_image(img_path).float()
        mask = read_image(mask_path)

        # Resize so all images and masks have the same size
#         image = T.Resize([800,800])(image)
#         mask = T.Resize([800,800])(mask)    
#         resize_scale_x = 800/image.size()[1]
#         resize_scale_y = 800/image.size()[2]
             
        # Normalize image with mean and standard deviation per channel
        mean = torch.mean(image, dim=(1,2))
        stdev = torch.std(image, dim=(1,2))
        image = T.Normalize(mean, stdev)(image)
        
        # Rescale to [0,1] range per channel
        for dim in range(3) :
            image[dim] -= torch.min(image[dim])
            image[dim] /= torch.max(image[dim])  
        
        # Create separate channel in mask for each lesion
        mask_out = np.zeros((torch.max(mask).item(), mask.shape[-2], mask.shape[-1]))
        for lesion_idx in range(torch.max(mask).item()) :
            mask_out[lesion_idx][mask[0]==lesion_idx+1] = 1      
        
        # Read the location of the lesion bounding box from the .csv file
        xmin = self.img_labels.iloc[idx, 1]
        xmax = self.img_labels.iloc[idx, 2]
        ymin = self.img_labels.iloc[idx, 3]
        ymax = self.img_labels.iloc[idx, 4]
#         boxes = [[xmin*resize_scale_x, ymin*resize_scale_y, xmax*resize_scale_x, ymax*resize_scale_y]]
        boxes = [[xmin, ymin, xmax, ymax]]
        # Read the label of the lesion from the .csv file
        labels = self.img_labels.iloc[idx, 5]  
        
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        labels = torch.tensor([labels])
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        
        iscrowd = torch.zeros((2,), dtype=torch.int64)
        
        # Apply transformations if defined
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)  
            
        target = {}
        target["image_id"] = image_id
        target["masks"] = mask
        target["boxes"] = boxes
        target["area"] = area
        target["labels"] = labels
        target["iscrowd"] = iscrowd
            
        return image, target


In [7]:
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import utils

training_data = CustomImageDataset(traindatadir, 'annotations_train_full.csv', 'colored_to_jpg', 'mask_to_png_1dim')
train_dataloader = DataLoader(training_data, batch_size=2, shuffle=True, collate_fn=utils.collate_fn)

test_data = CustomImageDataset(testdatadir, 'annotations_train_full.csv', 'colored_to_jpg', 'mask_to_png_1dim')
test_dataloader = DataLoader(test_data, batch_size=2, shuffle=True, collate_fn=utils.collate_fn)


In [8]:
# For Training

train_images, train_targets = next(iter(train_dataloader))
train_image_list = list(image for image in train_images)
train_target_list = [{k: v for k, v in t.items()} for t in train_targets]

# For Testing

test_images, test_targets = next(iter(test_dataloader))
test_image_list = list(image for image in test_images)
test_target_list = [{k: v for k, v in t.items()} for t in test_targets]

In [9]:
train_target_list

[{'image_id': tensor([2]),
  'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           ...,
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8),
  'boxes': tensor([[1481.,  982., 1606., 1116.]]),
  'area': tensor([16750.]),
  'labels': tensor([0]),
  'iscrowd': tensor([0, 0])},
 {'image_id': tensor([3]),
  'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           ...,
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8),
  'boxes': tensor([[1753.,  999., 1867., 1136.]]),
  'area': tensor([15618.]),
  'labels': tensor([0]),
  'iscrowd': tensor([0, 0])}]

In [10]:
test_target_list

[{'image_id': tensor([0]),
  'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           ...,
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8),
  'boxes': tensor([[642., 946., 682., 987.]]),
  'area': tensor([1640.]),
  'labels': tensor([0]),
  'iscrowd': tensor([0, 0])},
 {'image_id': tensor([2]),
  'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           ...,
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8),
  'boxes': tensor([[ 939.,  175., 1021.,  259.]]),
  'area': tensor([6888.]),
  'labels': tensor([0]),
  'iscrowd': tensor([0, 0])}]

## Define model

In [11]:
import torchvision
from torchvision.models.detection import FasterRCNN, MaskRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# load a pre-trained model for classification and return
# only the features
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
# FasterRCNN needs to know the number of
# output channels in a backbone. For mobilenet_v2, it's 1280
# so we need to add it here
backbone.out_channels = 1280

# backbone = torchvision.models.resnet101(pretrained=True, progress=True)
# backbone.out_channels = 1280

# let's make the RPN generate 5 x 3 anchors per spatial
# location, with 5 different sizes and 3 different aspect
# ratios. We have a Tuple[Tuple[int]] because each feature
# map could potentially have different sizes and
# aspect ratios
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

# let's define what are the feature maps that we will
# use to perform the region of interest cropping, as well as
# the size of the crop after rescaling.
# if your backbone returns a Tensor, featmap_names is expected to
# be [0]. More generally, the backbone should return an
# OrderedDict[Tensor], and in featmap_names you can choose which
# feature maps to use.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=7,
                                                sampling_ratio=2)

mask_roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                     output_size=14,
                                                     sampling_ratio=2)

# put the pieces together inside a FasterRCNN model
model = FasterRCNN(backbone,
                   num_classes=3,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)

# put the pieces together inside a MaskRCNN model
model = MaskRCNN(backbone,
                 num_classes=3,
                 rpn_anchor_generator=anchor_generator,
                 box_roi_pool=roi_pooler,
                 mask_roi_pool=mask_roi_pooler,
                 min_size=200,
                 max_size=333)

# model.eval()
# x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
# predictions = model(x)

In [12]:
model.to(device)

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(200,), max_size=333, mode='bilinear')
  )
  (backbone): Sequential(
    (0): ConvNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
   

In [13]:
from engine import train_one_epoch, evaluate

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

# let's train it for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, train_dataloader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, test_dataloader, device=device)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Epoch: [0]  [0/6]  eta: 0:00:35  lr: 0.001004  loss: 1.8365 (1.8365)  loss_classifier: 1.0859 (1.0859)  loss_box_reg: 0.0000 (0.0000)  loss_mask: 0.0000 (0.0000)  loss_objectness: 0.7415 (0.7415)  loss_rpn_box_reg: 0.0090 (0.0090)  time: 5.9444  data: 0.6160  max mem: 1393
Epoch: [0]  [5/6]  eta: 0:00:03  lr: 0.005000  loss: 1.7043 (1.4901)  loss_classifier: 0.2891 (0.5544)  loss_box_reg: 0.0000 (0.0001)  loss_mask: 0.0000 (0.2378)  loss_objectness: 0.6826 (0.6784)  loss_rpn_box_reg: 0.0090 (0.0194)  time: 3.1045  data: 0.7401  max mem: 2113
Epoch: [0] Total time: 0:00:18 (3.1045 s / it)
creating index...
index created!
Test:  [0/3]  eta: 0:00:03  model_time: 0.4625 (0.4625)  evaluator_time: 0.0000 (0.0000)  time: 1.2580  data: 0.7799  max mem: 2113
Test:  [2/3]  eta: 0:00:01  model_time: 0.4530 (0.4510)  evaluator_time: 0.0000 (0.0000)  time: 1.2697  data: 0.8083  max mem: 2113
Test: Total time: 0:00:03 (1.2697 s / it)
Averaged stats: model_time: 0.4530 (0.4510)  evaluator_time: 0.000

Epoch: [3]  [0/6]  eta: 0:00:15  lr: 0.000500  loss: 0.1096 (0.1096)  loss_classifier: 0.0009 (0.0009)  loss_box_reg: 0.0000 (0.0000)  loss_mask: 0.0000 (0.0000)  loss_objectness: 0.0637 (0.0637)  loss_rpn_box_reg: 0.0450 (0.0450)  time: 2.6529  data: 0.8876  max mem: 2119
Epoch: [3]  [5/6]  eta: 0:00:02  lr: 0.000500  loss: 0.0712 (0.4219)  loss_classifier: 0.0039 (0.0179)  loss_box_reg: 0.0000 (0.0003)  loss_mask: 0.0001 (0.3426)  loss_objectness: 0.0396 (0.0450)  loss_rpn_box_reg: 0.0069 (0.0161)  time: 2.4605  data: 0.7341  max mem: 2119
Epoch: [3] Total time: 0:00:14 (2.4605 s / it)
creating index...
index created!
Test:  [0/3]  eta: 0:00:03  model_time: 0.4374 (0.4374)  evaluator_time: 0.0000 (0.0000)  time: 1.0581  data: 0.6051  max mem: 2119
Test:  [2/3]  eta: 0:00:01  model_time: 0.4217 (0.4165)  evaluator_time: 0.0000 (0.0000)  time: 1.2339  data: 0.8018  max mem: 2119
Test: Total time: 0:00:03 (1.2339 s / it)
Averaged stats: model_time: 0.4217 (0.4165)  evaluator_time: 0.000

Epoch: [6]  [0/6]  eta: 0:00:15  lr: 0.000050  loss: 0.0680 (0.0680)  loss_classifier: 0.0121 (0.0121)  loss_box_reg: 0.0000 (0.0000)  loss_mask: 0.0051 (0.0051)  loss_objectness: 0.0443 (0.0443)  loss_rpn_box_reg: 0.0065 (0.0065)  time: 2.6289  data: 0.8011  max mem: 2122
Epoch: [6]  [5/6]  eta: 0:00:02  lr: 0.000050  loss: 0.0680 (0.1687)  loss_classifier: 0.0121 (0.0206)  loss_box_reg: 0.0000 (0.0001)  loss_mask: 0.0000 (0.0853)  loss_objectness: 0.0443 (0.0482)  loss_rpn_box_reg: 0.0068 (0.0144)  time: 2.4737  data: 0.7346  max mem: 2122
Epoch: [6] Total time: 0:00:14 (2.4737 s / it)
creating index...
index created!
Test:  [0/3]  eta: 0:00:03  model_time: 0.4530 (0.4530)  evaluator_time: 0.0000 (0.0000)  time: 1.3315  data: 0.8629  max mem: 2122
Test:  [2/3]  eta: 0:00:01  model_time: 0.4530 (0.4582)  evaluator_time: 0.0000 (0.0052)  time: 1.2782  data: 0.8043  max mem: 2122
Test: Total time: 0:00:03 (1.2782 s / it)
Averaged stats: model_time: 0.4530 (0.4582)  evaluator_time: 0.000

Epoch: [9]  [0/6]  eta: 0:00:16  lr: 0.000005  loss: 0.0627 (0.0627)  loss_classifier: 0.0111 (0.0111)  loss_box_reg: 0.0000 (0.0000)  loss_mask: 0.0060 (0.0060)  loss_objectness: 0.0424 (0.0424)  loss_rpn_box_reg: 0.0032 (0.0032)  time: 2.6670  data: 0.8393  max mem: 2122
Epoch: [9]  [5/6]  eta: 0:00:02  lr: 0.000005  loss: 0.0778 (0.1558)  loss_classifier: 0.0012 (0.0157)  loss_box_reg: 0.0000 (0.0001)  loss_mask: 0.0000 (0.0776)  loss_objectness: 0.0432 (0.0480)  loss_rpn_box_reg: 0.0054 (0.0144)  time: 2.5280  data: 0.7471  max mem: 2122
Epoch: [9] Total time: 0:00:15 (2.5280 s / it)
creating index...
index created!
Test:  [0/3]  eta: 0:00:03  model_time: 0.4843 (0.4843)  evaluator_time: 0.0312 (0.0312)  time: 1.1354  data: 0.6043  max mem: 2122
Test:  [2/3]  eta: 0:00:01  model_time: 0.4686 (0.4634)  evaluator_time: 0.0000 (0.0104)  time: 1.3140  data: 0.8246  max mem: 2122
Test: Total time: 0:00:03 (1.3140 s / it)
Averaged stats: model_time: 0.4686 (0.4634)  evaluator_time: 0.000

In [7]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# replace the classifier with a new one, that has
# num_classes which is user-defined
num_classes = 2  # 1 class (lesion) + background
num_classes = 3  # 2 classes (lesion benign + lesion malignant) + background


# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [13]:
predictions

[{'boxes': tensor([[2.5803e+02, 2.3219e+02, 3.9800e+02, 2.9883e+02],
          [1.1064e+02, 1.4831e+02, 1.4184e+02, 1.6377e+02],
          [1.0749e+02, 1.3834e+02, 1.3233e+02, 1.7288e+02],
          [1.5622e+02, 1.9665e+02, 1.8711e+02, 2.1219e+02],
          [1.6788e+02, 1.8433e+02, 1.9201e+02, 2.2053e+02],
          [1.1911e+02, 1.3744e+02, 1.4418e+02, 1.7179e+02],
          [1.7898e+02, 1.8363e+02, 2.0447e+02, 2.1935e+02],
          [1.6970e+02, 1.9679e+02, 2.0021e+02, 2.1216e+02],
          [1.6757e+02, 1.7333e+02, 1.9111e+02, 2.0816e+02],
          [1.5799e+02, 1.8500e+02, 1.8792e+02, 2.0049e+02],
          [1.3170e+02, 1.3631e+02, 1.5699e+02, 1.7156e+02],
          [1.7809e+02, 2.9156e+02, 2.2092e+02, 3.0000e+02],
          [1.3280e+02, 1.4856e+02, 1.6463e+02, 1.6433e+02],
          [1.5361e+02, 2.9105e+02, 1.9505e+02, 3.0000e+02],
          [1.9327e+02, 1.7342e+02, 2.2308e+02, 1.8913e+02],
          [1.8191e+02, 1.9727e+02, 2.1213e+02, 2.1275e+02],
          [1.9056e+02, 1.0140e+

In [9]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)

    return model

In [10]:
import transforms as T

def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)

In [11]:
import utils as vision_utils

In [26]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.to(device)
output = model(image_list, target_list)   # Returns losses and detections

# For inference
model.eval()
x = [torch.rand(3,300,400), torch.rand(3,500,400)]
predictions = model(x)           # Returns predictions

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [13]:
predictions

[{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward0>),
  'labels': tensor([], dtype=torch.int64),
  'scores': tensor([], grad_fn=<IndexBackward0>)},
 {'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward0>),
  'labels': tensor([], dtype=torch.int64),
  'scores': tensor([], grad_fn=<IndexBackward0>)}]

In [14]:
output

{'loss_classifier': tensor(0.0169, grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.0004, grad_fn=<DivBackward0>),
 'loss_objectness': tensor(11.8512, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(35.1903, grad_fn=<DivBackward0>)}