# Importing Libraries

In [None]:
pip install torch



In [None]:
pip install pretrainedmodels



In [None]:
!pip install pydicom



In [None]:
import torch
import torchvision
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

import os
import time
import math
import numpy as np
import pandas as pd
import torch.utils.model_zoo as model_zoo
from pretrainedmodels.models import senet, pnasnet, xception

# Setting Device

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load Dataset 

Let's load the dataset files and extend each bounding box with `x_max` and `y_max`, so that the boxes follow the Pascal VOC format (`x_min`, `y_min`, `x_max`, `y_max`).

In [None]:
from google.colab import drive

# Make the Google Drive content visible under /content/drive.
drive.mount('/content/drive')

# Locations of the training images and of the two annotation tables.
IMAGES_DIR = '/content/drive/My Drive/Neural/dataset/stage_2_train_images'
LABELS_CSV = '/content/drive/My Drive/Neural/dataset/stage_2_train_labels.csv'
CLASS_INFO_CSV = '/content/drive/My Drive/Neural/dataset/stage_2_detailed_class_info.csv'

# Echo the configured locations, one per line.
for dataset_path in (IMAGES_DIR, LABELS_CSV, CLASS_INFO_CSV):
  print(dataset_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Neural/dataset/stage_2_train_images
/content/drive/My Drive/Neural/dataset/stage_2_train_labels.csv
/content/drive/My Drive/Neural/dataset/stage_2_detailed_class_info.csv


In [None]:
# Read both annotation tables and attach the detailed class to every label row.
train_labels_df = pd.read_csv(LABELS_CSV)
class_info_df = pd.read_csv(CLASS_INFO_CSV)
train_class_df = pd.merge(train_labels_df, class_info_df, on='patientId', how='inner')

# Non-null count per column, followed by a preview of the first rows.
print(train_class_df.count())
train_class_df.head(10)

patientId    37629
x            16957
y            16957
width        16957
height       16957
Target       37629
class        37629
dtype: int64


Unnamed: 0,patientId,x,y,width,height,Target,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1,Lung Opacity
5,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1,Lung Opacity
6,00436515-870c-4b36-a041-de91049b9ab4,562.0,152.0,256.0,453.0,1,Lung Opacity
7,00436515-870c-4b36-a041-de91049b9ab4,562.0,152.0,256.0,453.0,1,Lung Opacity
8,00569f44-917d-4c86-a842-81832af98c30,,,,,0,No Lung Opacity / Not Normal
9,006cec2e-6ce2-4549-bffa-eadfcd1e9970,,,,,0,No Lung Opacity / Not Normal


The merge above produces duplicated rows (see rows 4–7), so we need to remove the duplicates.

In [None]:
# Keep a single copy of every fully identical row.
train_class_df = train_class_df.drop_duplicates(keep='first')

# Non-null count per column, followed by a preview of the first rows.
print(train_class_df.count())
train_class_df.head(10)

patientId    30227
x             9555
y             9555
width         9555
height        9555
Target       30227
class        30227
dtype: int64


Unnamed: 0,patientId,x,y,width,height,Target,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1,Lung Opacity
6,00436515-870c-4b36-a041-de91049b9ab4,562.0,152.0,256.0,453.0,1,Lung Opacity
8,00569f44-917d-4c86-a842-81832af98c30,,,,,0,No Lung Opacity / Not Normal
9,006cec2e-6ce2-4549-bffa-eadfcd1e9970,,,,,0,No Lung Opacity / Not Normal
10,00704310-78a8-4b38-8475-49f4573b2dbb,323.0,577.0,160.0,104.0,1,Lung Opacity
12,00704310-78a8-4b38-8475-49f4573b2dbb,695.0,575.0,162.0,137.0,1,Lung Opacity


In [None]:
import pandas as pd
import numpy as np

# Rows without a bounding box (Target == 0) have NaN coordinates; encode them
# as an all-zero box.  The filled columns are assigned back explicitly:
# `df.col.fillna(..., inplace=True)` works on a temporary column object and is
# a silent no-op when pandas copy-on-write is active.
box_columns = ['x', 'y', 'width', 'height']
train_class_df[box_columns] = train_class_df[box_columns].fillna(0)

# add Pascal bounding boxes encoding (bottom-right corner of each box)
train_class_df['x_max'] = train_class_df['x'] + train_class_df['width']
train_class_df['y_max'] = train_class_df['y'] + train_class_df['height']

train_class_df.sample(10)

Unnamed: 0,patientId,x,y,width,height,Target,class,x_max,y_max
2247,17dc4e24-2228-4cdb-8052-9259a907f9c2,557.0,350.0,293.0,478.0,1,Lung Opacity,850.0,828.0
16187,8355ac5a-924d-4c43-99e0-f675b5f0e86c,0.0,0.0,0.0,0.0,0,No Lung Opacity / Not Normal,0.0,0.0
35284,fbe472aa-1dda-4c4d-af44-2d41966c819d,0.0,0.0,0.0,0.0,0,No Lung Opacity / Not Normal,0.0,0.0
27247,c2d86046-7af9-4019-b130-9e9bfde1abfe,0.0,0.0,0.0,0.0,0,Normal,0.0,0.0
15577,7f41d711-55ff-44a5-8857-b02eb300174b,0.0,0.0,0.0,0.0,0,Normal,0.0,0.0
21538,a694ac65-d011-4e71-b6f8-653258278b3f,0.0,0.0,0.0,0.0,0,Normal,0.0,0.0
19927,9bb53dc0-28ed-47ed-bff3-9e6cc3c03d62,650.0,473.0,275.0,325.0,1,Lung Opacity,925.0,798.0
12999,6e88a38a-9467-4ab9-b517-3664e98bc42b,0.0,0.0,0.0,0.0,0,No Lung Opacity / Not Normal,0.0,0.0
37284,2bf8a7ed-b4a5-4ea1-b8e8-cadc7f7adf2d,179.0,254.0,253.0,523.0,1,Lung Opacity,432.0,777.0
18626,92a0f940-652a-4a9e-aeee-26f003abda25,147.0,276.0,270.0,738.0,1,Lung Opacity,417.0,1014.0


## Augmentations definition

In [None]:
import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

def augmentation_pipeline(level):
  """Build the list of imgaug augmenters for the requested augmentation level.

  Args:
    level: one of 'resize_only', 'light', 'heavy' or 'heavy_with_rotations'.

  Returns:
    A list of imgaug augmenters; every pipeline starts with a resize to 512.

  Raises:
    ValueError: if `level` is not one of the supported names (previously this
      surfaced as an UnboundLocalError at the return statement).
  """
  known_levels = ('resize_only', 'light', 'heavy', 'heavy_with_rotations')
  if level not in known_levels:
    raise ValueError(f"Unknown augmentation level {level!r}; expected one of {known_levels}")

  if level == 'resize_only':
    return [iaa.Resize(512)]

  if level == 'light':
    return [
      iaa.Resize(512),
      iaa.Affine(
        scale=1.1,
        shear=(2.5, 2.5),
        rotate=(-5, 5),
      ),
    ]

  # 'heavy' and 'heavy_with_rotations' share every step; the latter only adds
  # a random rotation to the affine transform.
  affine_kwargs = dict(scale=1.15, shear=(4.0, 4.0))
  if level == 'heavy_with_rotations':
    affine_kwargs['rotate'] = (-6, 6)

  return [
    iaa.Resize(512),
    iaa.Affine(**affine_kwargs),
    iaa.Fliplr(0.2),  # horizontally flip 20% of the images
    iaa.Sometimes(0.1, iaa.CoarseSaltAndPepper(p=(0.01, 0.01), size_percent=(0.1, 0.2))),
    iaa.Sometimes(0.5, iaa.GaussianBlur(sigma=(0.0, 2.0))),
    iaa.Sometimes(0.5, iaa.AdditiveGaussianNoise(scale=(0, 0.04 * 255))),
  ]

## Custom dataset definition

In [None]:
from torch.utils.data import Dataset
from PIL import Image
import pydicom


def get_image_array(image_path):
  """Read a DICOM file and return its pixel data.

  Args:
    image_path: path of the .dcm file to read.

  Returns:
    The `pixel_array` of the DICOM dataset, or None when the file cannot be
    read or decoded (best-effort behaviour kept from the original).
  """
  import warnings

  try:
    # dcmread is the current name of the reader; read_file is a deprecated alias.
    return pydicom.dcmread(image_path).pixel_array
  except Exception as err:
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through,
    # and the warning makes the failure visible instead of silently ignored.
    warnings.warn(f"Could not read DICOM file {image_path!r}: {err}")
    return None

def parse_one_annot(box_coord, filename):
  """Return the Pascal-encoded boxes of one patient.

  Selects the rows of `box_coord` whose "patientId" equals `filename` and
  returns their ("x", "y", "x_max", "y_max") columns as a 2-D array with one
  row per matching annotation.
  """
  is_patient_row = box_coord["patientId"] == filename
  return box_coord.loc[is_patient_row, ["x", "y", "x_max", "y_max"]].values

In [None]:
class CXRimages(Dataset):
    """Dataset yielding one augmented image plus its boxes per annotation row.

    Args:
        csv_file: DataFrame with (at least) the columns 'patientId', 'class',
            'Target', 'x', 'y', 'x_max' and 'y_max'.
        images_dir: directory containing '<patientId>.dcm' files.
        augmentations: augmentation level name passed to `augmentation_pipeline`.
        transform: optional callable applied to the augmented image.
    """

    def __init__(self, csv_file, images_dir, augmentations='resize_only', transform=None):
        self.path = images_dir
        self.annotations = csv_file
        self.categories = ["No Lung Opacity / Not Normal", "Normal", "Lung Opacity"]
        self.augmentations = augmentation_pipeline(augmentations)    # augmentations with imgaug
        self.transform = transform                                   # optional post-processing of the image

    def num_classes(self):
        """Number of image-level categories."""
        return 3

    def __len__(self):
        """One sample per annotation row (a patient with N boxes appears N times)."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load, augment and encode the sample at position `idx`.

        Returns:
            dict with keys 'img' (augmented image), 'annot' (float array of
            shape (num_boxes, 5) holding x1, y1, x2, y2, target; empty when
            Target != 1), 'scale' (always 1.0) and 'category' (index into
            `self.categories`).

        Raises:
            RuntimeError: if the image of the sample cannot be read.
        """
        # Positional access: works whatever the index of the DataFrame is
        # (the original label-based access required an index running from 0).
        patient_id = self.annotations['patientId'].iloc[idx]
        category = self.categories.index(self.annotations['class'].iloc[idx])
        target = self.annotations['Target'].iloc[idx]

        # load image
        img_path = os.path.join(self.path, patient_id + '.dcm')
        img = get_image_array(img_path)
        if img is None:
            # get_image_array returns None on unreadable files; fail with a
            # clear message instead of an obscure error further down.
            raise RuntimeError(f"Could not load image for patient {patient_id!r} from {img_path!r}")

        # all boxes of this patient, as float32 (x1, y1, x2, y2)
        box_list = parse_one_annot(self.annotations, patient_id)
        boxes = np.asarray(box_list, dtype=np.float32)

        # box encoding expected by imgaug
        list_boxes = [
            BoundingBox(x1=float(x1), y1=float(y1), x2=float(x2), y2=float(y2))
            for x1, y1, x2, y2 in boxes
        ]
        bbs = BoundingBoxesOnImage(list_boxes, shape=img.shape)

        # augmentation is applied to the raw pixel array and its boxes together
        seq_training = iaa.Sequential(self.augmentations)
        image_aug, bbs_aug = seq_training(image=img, bounding_boxes=bbs)

        # boxes in the (x1, y1, x2, y2, target) encoding used downstream;
        # samples with Target != 1 carry no boxes
        if target == 1:
            final_boxes = np.array(
                [[box.x1, box.y1, box.x2, box.y2, target] for box in bbs_aug.bounding_boxes],
                dtype=np.float64,
            ).reshape(-1, 5)
        else:
            final_boxes = np.zeros((0, 5))

        if self.transform is not None:
            image_aug = self.transform(image_aug.copy())  # .copy() avoid negative values in tensor

        output = {"img": image_aug, "annot": final_boxes, "scale": 1.0, 'category': category}
        return output

## Split - Training - Validation - Test

In [None]:
# Reproducible random 80/20 split over annotation rows.
# NOTE(review): the mask is drawn per row, and a patient with several boxes
# has several rows, so one patient can end up in more than one split.
np.random.seed(13)
msk = np.random.rand(len(train_class_df)) < 0.8

# split train and val/test + add indexes from 0 as required by class definition
train_df = train_class_df[msk].reset_index()
val_train_df = train_class_df[~msk]

# split val/test: first half of the held-out rows -> validation, rest -> test
split_val = int(len(val_train_df)/2)
val_df = val_train_df.iloc[:split_val,:].reset_index()
test_df = val_train_df.iloc[split_val:,:].reset_index()

# The test-set count now reports len(test_df) (it used to print len(val_df)).
print(f'Samples in train set: {len(train_df)} \nSamples in validation set: {len(val_df)} \nSamples in test set: {len(test_df)} \n')

Samples in train set: 24085 
Samples in validation set: 3071 
Samples in test set: 3071 



## Load dataset

Remember to set the correct augmentation for the training set! You can select between:
- 'resize_only'
- 'light'
- 'heavy'
- 'heavy_with_rotations'

In [None]:
import torchvision.transforms as T

def To_tensor_tfms():
   """Return a torchvision `Compose` whose only step is `ToTensor`."""
   return T.Compose([T.ToTensor()])

# One dataset per split; all three currently use the 'resize_only' pipeline and
# no extra transform. Change the `augmentations` argument of the training set
# to enable augmentation.
train_dataset = CXRimages(train_df, IMAGES_DIR, augmentations='resize_only', transform=None)
val_dataset = CXRimages(val_df, IMAGES_DIR, augmentations='resize_only', transform=None)
test_dataset = CXRimages(test_df, IMAGES_DIR, augmentations='resize_only', transform=None)

print(f'Samples in train set: {len(train_dataset)} \nSamples in validation set: {len(val_dataset)} \nSamples in test set: {len(test_dataset)}')

Samples in train set: 24085 
Samples in validation set: 3071 
Samples in test set: 3071


In [None]:
def collater2d(data):
    """Collate dataset samples into one zero-padded batch.

    Args:
        data: list of sample dicts with keys 'img' (2-D numpy array),
            'annot' (numpy array with 5 columns), 'scale' and 'category'.

    Returns:
        dict with
          'img': float tensor (batch, 1, max_dim0, max_dim1), images placed in
                 the top-left corner and padded with zeros,
          'annot': float tensor (batch, max(1, max_boxes), 5) padded with -1,
          'scale': list of the per-sample scales,
          'category': tensor of the per-sample categories.
    """
    imgs = [sample['img'] for sample in data]
    annots = [sample['annot'] for sample in data]
    scales = [sample['scale'] for sample in data]
    cats = np.array([sample['category'] for sample in data])

    # pad every image to the largest extent found in the batch
    n_samples = len(imgs)
    largest_dim0 = max(int(im.shape[0]) for im in imgs)
    largest_dim1 = max(int(im.shape[1]) for im in imgs)

    padded_imgs = torch.zeros(n_samples, largest_dim0, largest_dim1, 1)
    for k, im in enumerate(imgs):
        padded_imgs[k, :int(im.shape[0]), :int(im.shape[1]), 0] = torch.from_numpy(im)

    # pad annotations with -1 rows; keep at least one row per sample
    max_num_annots = max(annot.shape[0] for annot in annots)
    annot_padded = torch.full((len(annots), max(max_num_annots, 1), 5), -1.0)
    for k, annot in enumerate(annots):
        if annot.shape[0] > 0:
            annot_padded[k, :annot.shape[0], :] = torch.from_numpy(annot)

    # channels-last -> channels-first
    padded_imgs = padded_imgs.permute(0, 3, 1, 2)

    return {'img': padded_imgs, 'annot': annot_padded, 'scale': scales, 'category': torch.from_numpy(cats)}

In [None]:
# set batch size
batch_size = 4

# Training: reshuffle every epoch and drop the last incomplete batch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collater2d)

# Evaluation: keep a fixed order and keep every sample, so that metrics are
# reproducible and computed on the complete validation / test sets.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False, collate_fn=collater2d)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False, collate_fn=collater2d)

# RetinaNet

## Anchors

In [None]:
class Anchors(nn.Module):
  """Generate the anchor boxes of every pyramid level for a batch of images.

  Args:
    pyramid_levels: pyramid levels to cover (default [3, 4, 5, 6, 7]).
    strides: stride per level (default 2 ** level).
    sizes: anchor base size per level (default 2 ** (level + 2)).
    ratios: anchor aspect ratios (default [0.5, 1, 2]).
    scales: anchor scales (default [2**0, 2**(1/3), 2**(2/3)]).
  """

  def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None):
    super(Anchors, self).__init__()

    # Every argument is stored, whether defaulted or user supplied (custom
    # values used to be dropped, leaving the attribute undefined).
    self.pyramid_levels = [3, 4, 5, 6, 7] if pyramid_levels is None else pyramid_levels
    self.strides = [2 ** x for x in self.pyramid_levels] if strides is None else strides
    self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] if sizes is None else sizes
    self.ratios = np.array([0.5, 1, 2]) if ratios is None else ratios
    self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) if scales is None else scales

  def forward(self, image):
    """Return a float32 tensor (1, num_anchors, 4) of (x1, y1, x2, y2) anchors.

    `image` is a tensor whose dimensions from index 2 onwards are the spatial
    size; the anchors are returned on the same device as `image`.
    """
    image_shape = np.array(image.shape[2:])
    # feature-map size of each level: ceil(image_size / 2 ** level)
    image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels]

    # compute anchors over all pyramid levels
    all_anchors = np.zeros((0, 4)).astype(np.float32)

    for idx, p in enumerate(self.pyramid_levels):
      anchors         = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales)
      shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors)
      all_anchors     = np.append(all_anchors, shifted_anchors, axis=0)

    all_anchors = np.expand_dims(all_anchors, axis=0)

    # follow the device of the input instead of forcing CUDA
    return torch.from_numpy(all_anchors.astype(np.float32)).to(image.device)
  

def generate_anchors(base_size=16, ratios=None, scales=None):
    """
    Enumerate the reference anchor boxes for every (ratio, scale) pair.

    Returns an array of shape (len(ratios) * len(scales), 4) holding
    (x1, y1, x2, y2) boxes centred on the origin. Rows are ordered ratio-major:
    all scales of the first ratio, then all scales of the second, and so on.
    """
    if ratios is None:
      ratios = np.array([0.5, 1, 2])

    if scales is None:
      scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])

    # one entry per anchor
    ratio_per_anchor = np.repeat(ratios, len(scales))
    scale_per_anchor = np.tile(scales, len(ratios))

    # square anchors of side base_size * scale fix the area of each anchor
    side = base_size * scale_per_anchor
    areas = side * side

    # reshape each square to the requested height / width ratio, keeping the area
    widths = np.sqrt(areas / ratio_per_anchor)
    heights = widths * ratio_per_anchor

    # (x_ctr, y_ctr, w, h) with centre (0, 0) -> (x1, y1, x2, y2)
    half_w = widths * 0.5
    half_h = heights * 0.5
    return np.stack([0.0 - half_w, 0.0 - half_h, widths - half_w, heights - half_h], axis=1)
  
def compute_shape(image_shape, pyramid_levels):
  """Compute the feature-map shape of every pyramid level.

  :param image_shape: image shape; only the first two entries are used.
  :param pyramid_levels: iterable of pyramid levels; None selects [3, 4, 5, 6, 7].
  :return: list with one 2-element array per level, equal to
           ceil(image_shape[:2] / 2 ** level).
  """
  if pyramid_levels is None:
    pyramid_levels = [3, 4, 5, 6, 7]

  image_shape = np.array(image_shape[:2])
  # Return the per-level shapes (the input shape used to be returned unchanged).
  return [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels]

def anchors_for_shape(
  image_shape,
  pyramid_levels=None,
  ratios=None,
  scales=None,
  strides=None,
  sizes=None,
  shapes_callback=None,
  ):
  """Generate all anchors for an image of the given shape.

  :param image_shape: image shape; only the first two entries are used.
  :param pyramid_levels: pyramid levels (default [3, 4, 5, 6, 7]).
  :param ratios: anchor ratios, forwarded to generate_anchors.
  :param scales: anchor scales, forwarded to generate_anchors.
  :param strides: stride per level (default 2 ** level).
  :param sizes: anchor base size per level (default 2 ** (level + 2)).
  :param shapes_callback: optional callable (image_shape, pyramid_levels) ->
         list of per-level feature-map shapes; when omitted the shapes are
         ceil(image_shape[:2] / 2 ** level).
  :return: array of shape (num_anchors, 4) with (x1, y1, x2, y2) anchors.
  """
  # Resolve the defaults (the same ones the Anchors module uses) so the
  # function can be called with just an image shape.
  if pyramid_levels is None:
    pyramid_levels = [3, 4, 5, 6, 7]
  if strides is None:
    strides = [2 ** x for x in pyramid_levels]
  if sizes is None:
    sizes = [2 ** (x + 2) for x in pyramid_levels]

  if shapes_callback is None:
    base_shape = np.array(image_shape[:2])
    image_shapes = [(base_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels]
  else:
    image_shapes = shapes_callback(image_shape, pyramid_levels)

  # compute anchors over all pyramid levels
  all_anchors = np.zeros((0, 4))
  for idx, p in enumerate(pyramid_levels):
    anchors         = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales)
    shifted_anchors = shift(image_shapes[idx], strides[idx], anchors)
    all_anchors     = np.append(all_anchors, shifted_anchors, axis=0)

  return all_anchors

def shift(shape, stride, anchors):
  """Replicate `anchors` at the centre of every cell of a feature map.

  `shape` is (rows, cols) of the feature map, `stride` the distance between
  neighbouring cell centres and `anchors` an (A, 4) array of reference boxes.
  Returns a (rows * cols * A, 4) array ordered cell by cell (row-major), with
  all A anchors of one cell stored consecutively.
  """
  centres_x = (np.arange(0, shape[1]) + 0.5) * stride
  centres_y = (np.arange(0, shape[0]) + 0.5) * stride

  grid_x, grid_y = np.meshgrid(centres_x, centres_y)
  flat_x = grid_x.ravel()
  flat_y = grid_y.ravel()

  # one (dx, dy, dx, dy) offset per cell
  offsets = np.stack([flat_x, flat_y, flat_x, flat_y], axis=1)

  num_anchors = anchors.shape[0]
  num_cells = offsets.shape[0]

  # broadcast (1, A, 4) + (K, 1, 4) -> (K, A, 4), then flatten to (K * A, 4)
  shifted = anchors.reshape((1, num_anchors, 4)) + offsets.reshape((num_cells, 1, 4))
  return shifted.reshape((num_cells * num_anchors, 4))
  

## Boxes

In [None]:
class BBoxTransform(nn.Module):
  """Decode regression deltas into boxes, relative to reference boxes.

  Args:
    mean: per-coordinate offset applied to the deltas (default zeros).
    std: per-coordinate scale applied to the deltas (default 0.1, 0.1, 0.2, 0.2).
  """

  def __init__(self, mean=None, std=None):
    super(BBoxTransform, self).__init__()
    # Defaults live on the GPU when there is one (as before) and on the CPU
    # otherwise, so the module can also be built on CPU-only machines.
    default_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if mean is None:
      self.mean = torch.tensor([0, 0, 0, 0], dtype=torch.float32, device=default_device)
    else:
      self.mean = mean
    if std is None:
      self.std = torch.tensor([0.1, 0.1, 0.2, 0.2], dtype=torch.float32, device=default_device)
    else:
      self.std = std

  def forward(self, boxes, deltas):
    """Apply `deltas` (B, N, 4: dx, dy, dw, dh) to `boxes` (B, N, 4: x1, y1, x2, y2).

    Returns a tensor (B, N, 4) of decoded (x1, y1, x2, y2) boxes.
    """
    # use the statistics on whatever device the predictions live on
    mean = torch.as_tensor(self.mean, device=deltas.device)
    std = torch.as_tensor(self.std, device=deltas.device)

    widths  = boxes[:, :, 2] - boxes[:, :, 0]
    heights = boxes[:, :, 3] - boxes[:, :, 1]
    ctr_x   = boxes[:, :, 0] + 0.5 * widths
    ctr_y   = boxes[:, :, 1] + 0.5 * heights

    # de-normalise the predicted deltas
    dx = deltas[:, :, 0] * std[0] + mean[0]
    dy = deltas[:, :, 1] * std[1] + mean[1]
    dw = deltas[:, :, 2] * std[2] + mean[2]
    dh = deltas[:, :, 3] * std[3] + mean[3]

    # centre shifts are relative to the box size; sizes are scaled by exp(delta)
    pred_ctr_x = ctr_x + dx * widths
    pred_ctr_y = ctr_y + dy * heights
    pred_w     = torch.exp(dw) * widths
    pred_h     = torch.exp(dh) * heights

    pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w
    pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h
    pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w
    pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h

    pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2)

    return pred_boxes

In [None]:
class ClipBoxes(nn.Module):
  """Clip (x1, y1, x2, y2) boxes to the extent of an image batch, in place."""

  def __init__(self, width=None, height=None):
    # `width` and `height` are accepted but not used.
    super().__init__()

  def forward(self, boxes, img):
    """Clamp x1/y1 to >= 0 and x2/y2 to the image width/height.

    `boxes` is (B, N, 4) and is modified in place; `img` is a 4-D
    (batch, channels, height, width) tensor. Returns `boxes`.
    """
    _, _, img_height, img_width = img.shape

    # top-left corner may not be negative
    boxes[:, :, 0] = boxes[:, :, 0].clamp(min=0)
    boxes[:, :, 1] = boxes[:, :, 1].clamp(min=0)

    # bottom-right corner may not exceed the image
    boxes[:, :, 2] = boxes[:, :, 2].clamp(max=img_width)
    boxes[:, :, 3] = boxes[:, :, 3].clamp(max=img_height)

    return boxes

## Encoders
Choose one of the following encoders as the backbone of the RetinaNet model. The feature maps returned by the encoder are the input to the FPN.

In [None]:
class Encoder(nn.Module):
  """Base class of the backbones.

  Subclasses fill `fpn_sizes` and implement `forward`.
  """

  def __init__(self):
    super(Encoder, self).__init__()
    # filled by subclasses; empty in the base class
    self.fpn_sizes = list()

  def forward(self, x):
    """
    :param x: input tensor
    :return: x1, x2, x3, x4 layer outputs
    """
    raise NotImplementedError

### ResNet50

In [None]:
class BasicBlock(nn.Module):
  """Residual block made of two 3x3 convolutions (channel expansion 1).

  NOTE(review): `conv3x3` is not defined in the part of the notebook visible
  here; it must be defined before this block is instantiated.
  """
  expansion = 1

  def __init__(self, inplanes, planes, stride=1, downsample=None):
    super().__init__()
    self.conv1 = conv3x3(inplanes, planes, stride)
    self.bn1 = nn.BatchNorm2d(planes)
    self.relu = nn.ReLU(inplace=True)
    self.conv2 = conv3x3(planes, planes)
    self.bn2 = nn.BatchNorm2d(planes)
    self.downsample = downsample   # optional projection applied to the shortcut
    self.stride = stride

  def forward(self, x):
    """Return relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))."""
    out = self.relu(self.bn1(self.conv1(x)))
    out = self.bn2(self.conv2(out))

    # identity shortcut unless a projection was supplied
    shortcut = x if self.downsample is None else self.downsample(x)

    out += shortcut
    return self.relu(out)

In [None]:
class Bottleneck(nn.Module):
  """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions, expansion 4."""
  expansion = 4

  def __init__(self, inplanes, planes, stride=1, downsample=None):
    super().__init__()
    expanded = planes * 4
    self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
    self.bn1 = nn.BatchNorm2d(planes)
    # the 3x3 convolution carries the stride of the block
    self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                           padding=1, bias=False)
    self.bn2 = nn.BatchNorm2d(planes)
    self.conv3 = nn.Conv2d(planes, expanded, kernel_size=1, bias=False)
    self.bn3 = nn.BatchNorm2d(expanded)
    self.relu = nn.ReLU(inplace=True)
    self.downsample = downsample   # optional projection applied to the shortcut
    self.stride = stride

  def forward(self, x):
    """Return relu(main_path(x) + shortcut(x))."""
    out = self.relu(self.bn1(self.conv1(x)))
    out = self.relu(self.bn2(self.conv2(out)))
    out = self.bn3(self.conv3(out))

    # identity shortcut unless a projection was supplied
    shortcut = x if self.downsample is None else self.downsample(x)

    out += shortcut
    return self.relu(out)

In [None]:
class ResNet(Encoder):
  """ResNet backbone returning the outputs of its four residual stages.

  Args:
    block: residual block class (BasicBlock or Bottleneck).
    layers: number of blocks in each of the four stages.
  """

  def __init__(self, block, layers):
    self.inplanes = 64   # channel count entering the next stage
    super().__init__()
    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

    # The channel count of each stage is read from the last convolution of its
    # last block; that convolution is named differently in the two block types.
    if block == BasicBlock:
      last_conv = 'conv2'
    elif block == Bottleneck:
      last_conv = 'conv3'
    else:
      last_conv = None

    if last_conv is not None:
      stages = (self.layer1, self.layer2, self.layer3, self.layer4)
      self.fpn_sizes = [
        getattr(stage[depth - 1], last_conv).out_channels
        for stage, depth in zip(stages, layers)
      ]

  def _make_layer(self, block, planes, blocks, stride=1):
    """Build one stage of `blocks` residual blocks; only the first may stride."""
    out_planes = planes * block.expansion

    # project the shortcut whenever the first block changes resolution or width
    downsample = None
    if stride != 1 or self.inplanes != out_planes:
        downsample = nn.Sequential(
          nn.Conv2d(self.inplanes, out_planes,
            kernel_size=1, stride=stride, bias=False),
          nn.BatchNorm2d(out_planes),
        )

    stage = [block(self.inplanes, planes, stride, downsample)]
    self.inplanes = out_planes
    for _ in range(1, blocks):
      stage.append(block(self.inplanes, planes))

    return nn.Sequential(*stage)

  def forward(self, inputs):
    """Return the feature maps (x1, x2, x3, x4) of the four stages.

    The input is concatenated with itself three times along the channel axis
    before the stem, which turns a 1-channel batch into the 3 channels conv1
    expects.
    """
    x = torch.cat([inputs, inputs, inputs], dim=1)

    x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

    x1 = self.layer1(x)
    x2 = self.layer2(x1)
    x3 = self.layer3(x2)
    x4 = self.layer4(x3)

    return x1, x2, x3, x4

In [None]:
def resnet50(num_classes, pretrained=True, **kwargs):
  """Build a RetinaNet on a ResNet-50 encoder.

  Args:
    num_classes: forwarded to RetinaNet.
    pretrained: when truthy, load the torchvision ResNet-50 weights into the
      encoder (non-strict, so keys that do not match are ignored).
    **kwargs: forwarded to RetinaNet.
  """
  encoder = ResNet(Bottleneck, [3, 4, 6, 3])

  if pretrained:
    weights = model_zoo.load_url('https://download.pytorch.org/models/resnet50-19c8e357.pth')
    encoder.load_state_dict(weights, strict=False)

  return RetinaNet(encoder=encoder, num_classes=num_classes, **kwargs)

### SE-ResNeXt50

In [None]:
class SElayer(nn.Module):
    """Squeeze-and-excitation layer: rescales each channel by a learned gate.

    Args:
        inplanes: number of input (and output) channels; the bottleneck of the
            gate has inplanes // 16 channels (at least 1).
    """

    def __init__(self, inplanes):
        super(SElayer, self).__init__()
        # Integer division: nn.Conv2d needs integer channel counts, and the
        # former `inplanes / 16` produced a float.
        reduced = max(inplanes // 16, 1)
        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(inplanes, reduced, kernel_size=1, stride=1)
        self.conv2 = nn.Conv2d(reduced, inplanes, kernel_size=1, stride=1)
        self.relu = nn.ReLU(inplace=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return `x` multiplied channel-wise by a gate in (0, 1)."""
        # squeeze: one value per channel
        out = self.global_avgpool(x)

        # excitation: bottleneck MLP implemented with 1x1 convolutions
        out = self.conv1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.sigmoid(out)

        return x * out

In [None]:
class BottleneckX(nn.Module):
    """ResNeXt bottleneck (grouped 3x3 convolution) followed by an SE layer."""
    expansion = 4

    def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None):
        super().__init__()
        mid_planes = planes * 2
        out_planes = planes * 4

        self.conv1 = nn.Conv2d(inplanes, mid_planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(mid_planes)

        # grouped convolution: `cardinality` groups; carries the block stride
        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride,
                               padding=1, groups=cardinality, bias=False)
        self.bn2 = nn.BatchNorm2d(mid_planes)

        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.selayer = SElayer(out_planes)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample   # optional projection applied to the shortcut
        self.stride = stride

    def forward(self, x):
        """Return relu(selayer(main_path(x)) + shortcut(x))."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # channel re-weighting happens before the residual addition
        out = self.selayer(out)

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

In [None]:
class SEResNeXt(Encoder):
    """SE-ResNeXt backbone returning the outputs of its four residual stages.

    Args:
        block: residual block class (e.g. BottleneckX).
        layers: number of blocks in each of the four stages.
        cardinality: number of groups of the grouped convolutions.
        num_classes: size of the classification head `fc`; `forward` does not
            use the head.
    """

    def __init__(self, block, layers, cardinality=32, num_classes=1000):
        super(SEResNeXt, self).__init__()
        self.cardinality = cardinality
        self.inplanes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Channel counts of the four returned feature maps. They used to be
        # left empty, unlike in the other encoders.
        self.fpn_sizes = [planes * block.expansion for planes in (64, 128, 256, 512)]

        # He-style initialisation of convolutions, identity batch norms
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of `blocks` residual blocks; only the first may stride."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # project the shortcut when resolution or width changes
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, self.cardinality, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, self.cardinality))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the feature maps (x1, x2, x3, x4) of the four stages.

        A single-channel input is replicated to the three channels expected by
        conv1 (as the other encoders do); 3-channel input is used unchanged.
        """
        if x.shape[1] == 1:
            x = torch.cat([x, x, x], dim=1)

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x1 = self.layer1(x)
        x2 = self.layer2(x1)
        x3 = self.layer3(x2)
        x4 = self.layer4(x3)

        return x1, x2, x3, x4

In [None]:
def se_resnext50(num_classes, pretrained=True, dropout=0.0, **kwargs):
    """Build a RetinaNet on a SE-ResNeXt-50 encoder.

    The body used `pretrained`, `num_classes` and `dropout` without defining
    them; they are now explicit parameters, in line with the other model
    factories of this notebook.

    Args:
        num_classes: forwarded to RetinaNet.
        pretrained: when truthy (True or 'imagenet'), load the ImageNet weights
            of se_resnext50_32x4d into the encoder (non-strict).
        dropout: forwarded to RetinaNet as dropout_cls and dropout_global_cls.
        **kwargs: forwarded to the SEResNeXt encoder (e.g. cardinality).
    """
    encoder = SEResNeXt(BottleneckX, [3, 4, 6, 3], **kwargs)
    if pretrained:
        encoder.load_state_dict(model_zoo.load_url(
            senet.pretrained_settings['se_resnext50_32x4d']['imagenet']['url'], model_dir='models'), strict=False)

    model = RetinaNet(encoder=encoder, num_classes=num_classes, dropout_cls=dropout, dropout_global_cls=dropout)
    return model

### PNasnet

In [None]:
class PNasnet(Encoder):
    """PNASNet-5-Large backbone returning four intermediate feature maps."""

    def __init__(self, **kwargs):
        super().__init__()
        self.encoder = pnasnet.PNASNet5Large(num_classes=1001)

        # channel counts of the four feature maps returned by forward
        self.fpn_sizes = [270, 1080, 2160, 4320]
        print(self.fpn_sizes)

    def forward(self, inputs):
        """Return the outputs of cell_stem_0, cell_3, cell_7 and cell_11.

        The input is concatenated with itself three times along the channel
        axis before entering the network.
        """
        x = torch.cat([inputs, inputs, inputs], dim=1)

        x_conv_0 = self.encoder.conv_0(x)
        x_stem_0 = self.encoder.cell_stem_0(x_conv_0)
        x_stem_1 = self.encoder.cell_stem_1(x_conv_0, x_stem_0)

        # every cell consumes the outputs of the two cells before it
        tapped = {}
        older, newer = x_stem_0, x_stem_1
        for k in range(12):
            cell = getattr(self.encoder, f'cell_{k}')
            older, newer = newer, cell(older, newer)
            if k in (3, 7, 11):
                tapped[k] = newer

        return x_stem_0, tapped[3], tapped[7], tapped[11]

In [None]:
def pnasnet5(num_classes, pretrained=True, dropout=0.0):
    """Build a RetinaNet on a PNASNet-5-Large encoder.
    Args:
        num_classes: forwarded to RetinaNet
        pretrained (bool): If True, load the 'imagenet+background' weights into the encoder (non-strict)
        dropout: forwarded to RetinaNet as dropout_cls and dropout_global_cls
    """
    encoder = PNasnet()

    if pretrained:
        weights_url = pnasnet.pretrained_settings['pnasnet5large']['imagenet+background']['url']
        weights = model_zoo.load_url(weights_url, model_dir='models')
        encoder.encoder.load_state_dict(weights, strict=False)

    return RetinaNet(encoder=encoder, num_classes=num_classes, dropout_cls=dropout, dropout_global_cls=dropout)

### Xception

In [None]:
class Xception(Encoder):
    """
    Xception backbone (https://arxiv.org/pdf/1610.02357.pdf) returning four
    intermediate feature maps.

    NOTE(review): `Block` and `SeparableConv2d` are not defined in the part of
    the notebook visible here; they must be in scope when this class is
    instantiated.
    """

    def __init__(self, num_classes=1000):
        """ Constructor
        Args:
            num_classes: size of the classification head `fc` (not used by forward)
        """
        super().__init__()
        self.num_classes = num_classes

        # stem
        self.conv1 = nn.Conv2d(3, 32, 3, 2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(32, 64, 3, bias=False, padding=1)
        self.bn2 = nn.BatchNorm2d(64)

        # entry blocks
        self.block1 = Block(64, 128, 2, 2, start_with_relu=False, grow_first=True)
        self.block2 = Block(128, 256, 2, 2, start_with_relu=True, grow_first=True)
        self.block3 = Block(256, 728, 2, 2, start_with_relu=True, grow_first=True)

        # block4 ... block11: eight identical blocks
        for idx in range(4, 12):
            setattr(self, f'block{idx}', Block(728, 728, 3, 1, start_with_relu=True, grow_first=True))

        # exit blocks
        self.block12 = Block(728, 1024, 2, 2, start_with_relu=True, grow_first=False)

        self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1)
        self.bn3 = nn.BatchNorm2d(1536)

        self.conv4 = SeparableConv2d(1536, 2048, 3, 1, 1)
        self.bn4 = nn.BatchNorm2d(2048)

        self.fc = nn.Linear(2048, num_classes)

        # channel counts of the four feature maps returned by forward
        self.fpn_sizes = [128, 256, 728, 2048]
        print(self.fpn_sizes)


    def forward(self, inputs):
        """Return [block1 out, block2 out, block11 out, bn4 out].

        The input is concatenated with itself three times along the channel
        axis before the stem.
        """
        x = torch.cat([inputs, inputs, inputs], dim=1)

        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))

        feat1 = self.block1(x)
        feat2 = self.block2(feat1)

        x = self.block3(feat2)
        for idx in range(4, 12):
            x = getattr(self, f'block{idx}')(x)
        feat3 = x

        x = self.block12(feat3)

        x = self.relu(self.bn3(self.conv3(x)))
        feat4 = self.bn4(self.conv4(x))

        return [feat1, feat2, feat3, feat4]

In [None]:
def xception(num_classes, pretrained=True, **kwargs):
    """Build a RetinaNet detector on top of an Xception encoder.

    Args:
        num_classes: number of classes forwarded to RetinaNet.
        pretrained (bool): if True, the encoder weights are loaded from the
            'imagenet' URL listed in pretrained_settings['xception'].
        **kwargs: extra keyword arguments forwarded to RetinaNet.
    """
    backbone = Xception(num_classes=1000)

    if pretrained:
        url = pretrained_settings['xception']['imagenet']['url']
        checkpoint = model_zoo.load_url(url, model_dir='models')

        # every 'pointwise' entry gets two trailing singleton dimensions
        # appended before the checkpoint is loaded into the encoder
        pointwise_keys = [key for key in checkpoint if 'pointwise' in key]
        for key in pointwise_keys:
            checkpoint[key] = checkpoint[key].unsqueeze(-1).unsqueeze(-1)

        backbone.load_state_dict(checkpoint)

    return RetinaNet(encoder=backbone, num_classes=num_classes, **kwargs)

## Feature Pyramid Network (FPN)

In [None]:
class PyramidFeatures(nn.Module):
  """Feature Pyramid Network on top of four backbone feature maps (C2..C5).

  Args:
    C2_size, C3_size, C4_size, C5_size: channel counts of the backbone maps.
    feature_size: channel count of every pyramid level produced.
    use_l2_features: if True, forward() also returns the P2 level.

  forward() expects `inputs` to unpack into (C2, C3, C4, C5) where each map
  has half the spatial size of the previous one, because every top-down step
  upsamples by a factor of 2 before the element-wise addition.
  """

  def __init__(self, C2_size, C3_size, C4_size, C5_size, feature_size=256, use_l2_features=True):
    super(PyramidFeatures, self).__init__()
    self.use_l2_features = use_l2_features

    # upsample C5 to get P5 from the FPN paper
    self.P5_1           = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0)
    self.P5_upsampled   = nn.Upsample(scale_factor=2, mode='nearest')
    self.P5_2           = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)

    # add P5 elementwise to C4
    self.P4_1           = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0)
    self.P4_upsampled   = nn.Upsample(scale_factor=2, mode='nearest')
    self.P4_2           = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)

    # add P4 elementwise to C3
    self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0)
    self.P3_upsampled = nn.Upsample(scale_factor=2, mode='nearest')
    self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)

    # add P3 elementwise to C2
    # (these layers are always created, even when use_l2_features is False,
    # so the set of parameters does not depend on the flag)
    self.P2_1 = nn.Conv2d(C2_size, feature_size, kernel_size=1, stride=1, padding=0)
    self.P2_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)

    # "P6 is obtained via a 3x3 stride-2 conv on C5"
    self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1)

    # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6"
    self.P7_1 = nn.ReLU()
    self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1)

  def forward(self, inputs):
    """Return [P2, P3, P4, P5, P6, P7] or, without L2 features, [P3, P4, P5, P6, P7]."""

    C2, C3, C4, C5 = inputs

    # top-down pathway: lateral 1x1 conv, add the upsampled coarser level,
    # then smooth with a 3x3 conv
    P5_x = self.P5_1(C5)
    P5_upsampled_x = self.P5_upsampled(P5_x)
    P5_x = self.P5_2(P5_x)

    P4_x = self.P4_1(C4)
    P4_x = P5_upsampled_x + P4_x
    P4_upsampled_x = self.P4_upsampled(P4_x)
    P4_x = self.P4_2(P4_x)

    P3_x = self.P3_1(C3)
    P3_x = P3_x + P4_upsampled_x
    if self.use_l2_features:
      # only needed to build P2; must be taken before the 3x3 smoothing conv
      P3_upsampled_x = self.P3_upsampled(P3_x)
    P3_x = self.P3_2(P3_x)

    # extra coarse levels are computed regardless of use_l2_features
    P6_x = self.P6(C5)

    P7_x = self.P7_1(P6_x)
    P7_x = self.P7_2(P7_x)

    if self.use_l2_features:
      P2_x = self.P2_1(C2)
      P2_x = P2_x + P3_upsampled_x
      P2_x = self.P2_2(P2_x)
      return [P2_x, P3_x, P4_x, P5_x, P6_x, P7_x]

    return [P3_x, P4_x, P5_x, P6_x, P7_x]

## Subnets multitask learning

### Box Regression Subnet

In [None]:
class RegressionModel(nn.Module):
  """Box-regression head: four 3x3 conv + ReLU stages followed by a 3x3 conv
  that emits 4 values per anchor at every spatial position.

  forward() returns a tensor of shape (batch, positions * num_anchors, 4).
  """

  def __init__(self, num_features_in, num_anchors=9, feature_size=256):
    super(RegressionModel, self).__init__()

    same_conv = dict(kernel_size=3, padding=1)  # 3x3, spatial size preserved

    self.conv1 = nn.Conv2d(num_features_in, feature_size, **same_conv)
    self.act1 = nn.ReLU()

    self.conv2 = nn.Conv2d(feature_size, feature_size, **same_conv)
    self.act2 = nn.ReLU()

    self.conv3 = nn.Conv2d(feature_size, feature_size, **same_conv)
    self.act3 = nn.ReLU()

    self.conv4 = nn.Conv2d(feature_size, feature_size, **same_conv)
    self.act4 = nn.ReLU()

    self.output = nn.Conv2d(feature_size, num_anchors*4, **same_conv)

  def forward(self, x):
    tower = (
      (self.conv1, self.act1),
      (self.conv2, self.act2),
      (self.conv3, self.act3),
      (self.conv4, self.act4),
    )

    feat = x
    for conv, act in tower:
      feat = act(conv(feat))

    # move the 4*num_anchors channel axis last so that the 4 values of one
    # anchor are adjacent, then flatten positions and anchors together
    deltas = self.output(feat).permute(0, 2, 3, 1)

    return deltas.contiguous().view(deltas.shape[0], -1, 4)

### Classification Subnet

In [None]:
class ClassificationModel(nn.Module):
  """Per-anchor classification head: four 3x3 conv + ReLU stages, optional
  dropout, then a 3x3 conv with sigmoid giving one score per (anchor, class).

  forward() returns a tensor of shape (batch, positions * num_anchors, num_classes).
  The `prior` argument is accepted but not used inside this class.
  """

  def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256, dropout=0.5):
    super(ClassificationModel, self).__init__()

    self.num_classes = num_classes
    self.num_anchors = num_anchors

    same_conv = dict(kernel_size=3, padding=1)  # 3x3, spatial size preserved

    self.conv1 = nn.Conv2d(num_features_in, feature_size, **same_conv)
    self.act1 = nn.ReLU()

    self.conv2 = nn.Conv2d(feature_size, feature_size, **same_conv)
    self.act2 = nn.ReLU()

    self.conv3 = nn.Conv2d(feature_size, feature_size, **same_conv)
    self.act3 = nn.ReLU()

    self.conv4 = nn.Conv2d(feature_size, feature_size, **same_conv)
    self.act4 = nn.ReLU()

    self.output = nn.Conv2d(feature_size, num_anchors*num_classes, **same_conv)
    self.output_act = nn.Sigmoid()

    self.dropout = dropout

  def forward(self, x):
    tower = (
      (self.conv1, self.act1),
      (self.conv2, self.act2),
      (self.conv3, self.act3),
      (self.conv4, self.act4),
    )

    feat = x
    for conv, act in tower:
      feat = act(conv(feat))

    # dropout is only active while the module is in training mode
    if self.dropout > 0:
      feat = F.dropout(feat, self.dropout, self.training)

    scores = self.output_act(self.output(feat))

    # the channel axis holds num_anchors * num_classes values; move it last
    # and split it into (anchor, class)
    scores = scores.permute(0, 2, 3, 1)
    n, dim1, dim2, _ = scores.shape
    per_anchor = scores.view(n, dim1, dim2, self.num_anchors, self.num_classes)

    return per_anchor.contiguous().view(x.shape[0], -1, self.num_classes)

### Global Classification

In [None]:
class GlobalClassificationModel(nn.Module):
  """Image-level classification head.

  The input map is max-pooled by 2, passed through an unpadded 3x3 conv and
  ReLU, reduced to one average and one maximum per channel, and classified by
  a linear layer. forward() returns log-probabilities (LogSoftmax output) of
  shape (batch, num_classes).
  """

  def __init__(self, num_features_in, num_classes=80, feature_size=256, dropout=0.5):
    super().__init__()

    self.num_classes = num_classes
    self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, dilation=1, padding=0)
    # the linear layer sees avg-pooled and max-pooled features side by side
    self.fc = nn.Linear(feature_size*2, num_classes)
    self.output_act = nn.LogSoftmax(dim=-1)

    self.dropout = dropout

  def forward(self, x):
    feat = F.relu(self.conv1(F.max_pool2d(x, 2)))

    # global pooling: the kernel covers the whole remaining spatial extent
    spatial = feat.shape[2:]
    pooled = torch.cat(
      (F.avg_pool2d(feat, spatial), F.max_pool2d(feat, spatial)),
      1,
    )
    flat = pooled.view(pooled.size(0), -1)

    # dropout is only active while the module is in training mode
    if self.dropout > 0:
      flat = F.dropout(flat, self.dropout, self.training)

    return self.output_act(self.fc(flat))

## Focal Loss

In [None]:
def calc_iou(a, b):
    """Pairwise intersection-over-union between two sets of boxes.

    Both `a` and `b` are indexed as 2-D tensors whose first four columns are
    (x1, y1, x2, y2). The result has one row per box in `a` and one column
    per box in `b`.
    """
    # column vectors for `a` so that every operation broadcasts against `b`
    a_x1, a_y1, a_x2, a_y2 = (a[:, col].unsqueeze(1) for col in range(4))
    b_x1, b_y1, b_x2, b_y2 = (b[:, col] for col in range(4))

    # overlap along each axis; negative values mean the boxes are disjoint
    inter_w = (torch.min(a_x2, b_x2) - torch.max(a_x1, b_x1)).clamp(min=0)
    inter_h = (torch.min(a_y2, b_y2) - torch.max(a_y1, b_y1)).clamp(min=0)
    intersection = inter_w * inter_h

    area_a = (a_x2 - a_x1) * (a_y2 - a_y1)
    area_b = (b_x2 - b_x1) * (b_y2 - b_y1)

    # lower bound on the union avoids a division by zero
    union = (area_a + area_b - intersection).clamp(min=1e-8)

    return intersection / union

In [None]:
class FocalLoss(nn.Module):
    """Focal classification loss plus smooth-L1 box-regression loss.

    forward() arguments:
        classifications: (batch, num_anchors, num_classes) scores in [0, 1].
        regressions:     (batch, num_anchors, 4) predicted (dx, dy, dw, dh).
        anchors:         tensor whose first entry `anchors[0]` is the
                         (num_anchors, 4) anchor list as (x1, y1, x2, y2).
        annotations:     (batch, max_boxes, 5) rows of (x1, y1, x2, y2, class);
                         rows whose class is -1 are ignored.

    Returns a pair of 1-element tensors: the classification loss and the
    regression loss, each averaged over the batch.

    All helper tensors are created on the device of `classifications`, so the
    loss runs on CPU as well as on CUDA.
    """

    def forward(self, classifications, regressions, anchors, annotations):
        alpha = 0.25
        gamma = 2.0
        device = classifications.device
        batch_size = classifications.shape[0]
        classification_losses = []
        regression_losses = []

        anchor = anchors[0, :, :]

        anchor_widths  = anchor[:, 2] - anchor[:, 0]
        anchor_heights = anchor[:, 3] - anchor[:, 1]
        anchor_ctr_x   = anchor[:, 0] + 0.5 * anchor_widths
        anchor_ctr_y   = anchor[:, 1] + 0.5 * anchor_heights

        # divisor applied to the (dx, dy, dw, dh) regression targets
        target_scale = torch.tensor([[0.1, 0.1, 0.2, 0.2]], device=device)

        for j in range(batch_size):

            classification = classifications[j, :, :]
            regression = regressions[j, :, :]

            # drop annotation rows marked with class -1
            bbox_annotation = annotations[j, :, :]
            bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]

            # keep the scores away from 0 and 1 so the logs below stay finite
            classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)

            if bbox_annotation.shape[0] == 0:
                # no boxes in this image: every anchor gets IoU 0 against a
                # single dummy annotation, so all anchors count as negatives
                IoU = torch.zeros(anchor.shape[0], 1, dtype=torch.float, device=device)
                bbox_annotation = torch.ones(1, 5, dtype=torch.float, device=device) * -1
            else:
                IoU = calc_iou(anchor, bbox_annotation[:, :4])  # num_anchors x num_annotations

            IoU_max, IoU_argmax = torch.max(IoU, dim=1)  # best annotation per anchor

            # compute the loss for classification
            # targets: -1 = ignored, 0 = negative, 1 = positive for that class
            targets = torch.ones(classification.shape, device=device) * -1

            targets[torch.lt(IoU_max, 0.4), :] = 0

            positive_indices = torch.ge(IoU_max, 0.5)

            num_positive_anchors = positive_indices.sum()

            assigned_annotations = bbox_annotation[IoU_argmax, :]

            targets[positive_indices, :] = 0
            targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1

            alpha_factor = torch.ones(targets.shape, device=device) * alpha

            alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor)
            focal_weight = torch.where(torch.eq(targets, 1.), 1. - classification, classification)
            focal_weight = alpha_factor * torch.pow(focal_weight, gamma)

            bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification))

            cls_loss = focal_weight * bce

            # anchors with target -1 (IoU between 0.4 and 0.5) contribute nothing
            cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros_like(cls_loss))

            # normalise by the number of positive anchors (at least 1)
            classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0))

            # compute the loss for regression

            if num_positive_anchors > 0:
                assigned_annotations = assigned_annotations[positive_indices, :]

                anchor_widths_pi = anchor_widths[positive_indices]
                anchor_heights_pi = anchor_heights[positive_indices]
                anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
                anchor_ctr_y_pi = anchor_ctr_y[positive_indices]

                gt_widths  = assigned_annotations[:, 2] - assigned_annotations[:, 0]
                gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1]
                gt_ctr_x   = assigned_annotations[:, 0] + 0.5 * gt_widths
                gt_ctr_y   = assigned_annotations[:, 1] + 0.5 * gt_heights

                # clip widths to 1
                gt_widths  = torch.clamp(gt_widths, min=1)
                gt_heights = torch.clamp(gt_heights, min=1)

                targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
                targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
                targets_dw = torch.log(gt_widths / anchor_widths_pi)
                targets_dh = torch.log(gt_heights / anchor_heights_pi)

                targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh))
                targets = targets.t()

                targets = targets / target_scale

                regression_diff = torch.abs(targets - regression[positive_indices, :])

                # smooth L1: quadratic below 1/9, linear above
                regression_loss = torch.where(
                    torch.le(regression_diff, 1.0 / 9.0),
                    0.5 * 9.0 * torch.pow(regression_diff, 2),
                    regression_diff - 0.5 / 9.0
                )
                regression_losses.append(regression_loss.mean())
            else:
                regression_losses.append(torch.tensor(0., device=device))

        return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True)

## Model

In [None]:
class RetinaNet(nn.Module):
  """Detector built from an encoder, a PyramidFeatures FPN, a box-regression
  head, a per-anchor classification head and an image-level ("global")
  classification head.

  Args:
    encoder: backbone module; must expose `fpn_sizes` (four channel counts)
      and its forward must return four feature maps.
    num_classes: number of classes for the per-anchor classification head.
    dropout_cls: dropout rate passed to ClassificationModel.
    dropout_global_cls: dropout rate passed to GlobalClassificationModel.
    use_l2_features: if True, the pyramid levels are 2..7, otherwise 3..7.
  """
  
  def __init__(self, encoder: Encoder, num_classes, dropout_cls=0.5,
              dropout_global_cls=0.5, use_l2_features=True):
    super(RetinaNet, self).__init__()
    
    fpn_sizes = encoder.fpn_sizes
    self.use_l2_features = use_l2_features

    self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2], fpn_sizes[3], use_l2_features=use_l2_features)

    # both heads take 256-channel inputs; the same head instance is applied
    # to every pyramid level in forward()
    self.regressionModel = RegressionModel(256)
    self.classificationModel = ClassificationModel(256, num_classes=num_classes, dropout=dropout_cls)
    # the global head reads the last encoder feature map and always has 3 classes
    self.globalClassificationModel = GlobalClassificationModel(fpn_sizes[-1], num_classes=3, feature_size=256, dropout=dropout_global_cls)
    # NLLLoss matches the log-probabilities produced by the global head
    self.globalClassificationLoss = nn.NLLLoss()

    if use_l2_features:
      pyramid_levels = [2, 3, 4, 5, 6, 7]
    else:
      pyramid_levels = [3, 4, 5, 6, 7]

    self.anchors = Anchors(pyramid_levels=pyramid_levels)

    self.regressBoxes = BBoxTransform()

    self.clipBoxes = ClipBoxes()

    self.focalLoss = FocalLoss()

    # Weight initialisation for the FPN and the heads. The encoder has not
    # been assigned to `self` yet, so its modules are not visited here and
    # its weights are left as they were passed in.
    for m in self.modules():
      if isinstance(m, nn.Conv2d):
        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
        m.weight.data.normal_(0, math.sqrt(2. / n))
      elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1)
        m.bias.data.zero_()

    # registered only after the initialisation loop above (order matters)
    self.encoder = encoder

    prior = 0.01

    # zero weights + this bias make the initial classification output
    # sigmoid(-log((1 - prior) / prior)) = prior for every anchor and class
    self.classificationModel.output.weight.data.fill_(0)
    self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior))

    # the regression head initially predicts all-zero values
    self.regressionModel.output.weight.data.fill_(0)
    self.regressionModel.output.bias.data.fill_(0)

    self.freeze_bn()
    
  def freeze_bn(self):
    """Freeze BatchNorm layers."""
    # switches every BatchNorm2d (encoder included) to eval mode
    for layer in self.modules():
      if isinstance(layer, nn.BatchNorm2d):
        layer.eval()
  
  def freeze_encoder(self):
    """Put the encoder in eval mode; `requires_grad` of its parameters is left unchanged."""
    self.encoder.eval()
    # correct version, but keep original as model has been trained this way
    # for param in self.encoder.parameters():
    #     param.requires_grad = False

  def unfreeze_encoder(self):
    """Set `requires_grad = True` on every encoder parameter."""
    for param in self.encoder.parameters():
      param.requires_grad = True
  
  def boxes(self, img_batch, regression, classification, global_classification, anchors):
    """Turn raw head outputs into final detections for the first image of the batch.

    Anchors are transformed with the regression output and clipped, anchors
    whose best class score is not above 0.025 are dropped, and `nms` is run
    on the rest. Thresholding and NMS only look at batch index 0.

    Returns a 3-element list: [scores of the kept boxes,
    global_classification (passed through unchanged), kept boxes].
    When no anchor passes the score threshold the first and last entries
    are empty tensors.
    """
    transformed_anchors = self.regressBoxes(anchors, regression)
    transformed_anchors = self.clipBoxes(transformed_anchors, img_batch)

    # best class score per anchor
    scores = torch.max(classification, dim=2, keepdim=True)[0]

    # boolean mask over anchors, taken from the first image only
    scores_over_thresh = (scores > 0.025)[0, :, 0]

    if scores_over_thresh.sum() == 0:
      # no boxes to NMS, just return
      return [torch.zeros(0), global_classification, torch.zeros(0, 4)]
    else:
      classification = classification[:, scores_over_thresh, :]
      transformed_anchors = transformed_anchors[:, scores_over_thresh, :]
      scores = scores[:, scores_over_thresh, :]

      # use very low threshold of 0.05 as boxes should not overlap
      anchors_nms_idx = nms(torch.cat([transformed_anchors, scores], dim=2)[0, :, :], 0.05)

      nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1)
      return [nms_scores, global_classification, transformed_anchors[0, anchors_nms_idx, :]]

  def forward(self, inputs, return_loss, return_boxes, return_raw=False):
    """Run the detector.

    Args:
      inputs: when `return_loss` is True, a sequence
        (img_batch, annotations, global_annotations); otherwise the image
        batch itself.
      return_loss: append [classification loss, regression loss,
        global classification loss] to the result.
      return_boxes: append the three entries returned by `boxes()`.
      return_raw: if True, return
        [regression, classification, exp(global_classification), anchors]
        immediately and ignore the two flags above.

    Returns:
      A list; its content depends on the flags as described above (losses
      first, then boxes).
    """
    
    if return_loss:
      img_batch, annotations, global_annotations = inputs
    else:
      img_batch = inputs

    x1, x2, x3, x4 = self.encoder.forward(img_batch)

    features = self.fpn([x1, x2, x3, x4])

    # the heads are applied to each pyramid level and concatenated along the anchor axis
    regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1)

    classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1)

    # log-probabilities computed from the last encoder feature map
    global_classification = self.globalClassificationModel(x4)
    
    anchors = self.anchors(img_batch)

    if return_raw:
      # exp() converts the log-probabilities of the global head to probabilities
      return [regression, classification, torch.exp(global_classification), anchors]

    res = []

    if return_loss:
      res += list(self.focalLoss(classification, regression, anchors, annotations))
      res += [self.globalClassificationLoss(global_classification, global_annotations)]

    if return_boxes:
      res += self.boxes(img_batch=img_batch,
                        regression=regression,
                        classification=classification,
                        global_classification=global_classification,
                        anchors=anchors)
    return res

# Training

In [None]:
def validation(
    retinanet: nn.Module,
    val_dataloader: DataLoader,
    epoch_num: int,
    predictions_dir: str,
    save_oof=True,
) -> tuple:
    """
    Validate the model at the end of an epoch.

    Args:
        retinanet: current model; called with
            ([img, annot, category], return_loss=True, return_boxes=True)
        val_dataloader: dataloader for the validation fold; every batch is a
            dict with the keys 'img', 'annot' and 'category'
        epoch_num: current epoch, only used in the progress-bar text
        predictions_dir: directory for saving predictions (saving is still a
            TODO, so the value is currently not used)
        save_oof: if True, collect out-of-fold (oof) predictions in memory

    Returns:
        loss_hist_valid: total validation loss per batch
        loss_cls_hist_valid: classification validation loss per batch
        loss_cls_global_hist_valid: global classification validation loss per batch
        loss_reg_hist_valid: regression validation loss per batch
    """
    # feed the batches to whatever device the model lives on
    first_param = next(retinanet.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():

        # Set dropout and batch normalization layers to evaluation mode before running inference.
        retinanet.eval()

        loss_hist_valid, loss_cls_hist_valid, loss_cls_global_hist_valid, loss_reg_hist_valid = [], [], [], []

        # Show a smart progress bar
        data_iter = tqdm(enumerate(val_dataloader), total=len(val_dataloader))

        if save_oof:
            oof = collections.defaultdict(list)

        for iter_num, data in data_iter:
            # Run model and save the return values of the model (see Model section for further info)
            (
                classification_loss,
                regression_loss,
                global_classification_loss,
                nms_scores,
                global_class,
                transformed_anchors,
            ) = retinanet(
                [
                  data['img'].to(model_device).float(),
                  data['annot'].to(model_device).float(),
                  data['category'].to(model_device)
                ],
                return_loss=True,
                return_boxes=True,
            )

            # Save out of fold predictions
            if save_oof:
                # predictions
                oof["gt_boxes"].append(data["annot"].cpu().numpy().copy())
                oof["gt_category"].append(data["category"].cpu().numpy().copy())
                oof["boxes"].append(transformed_anchors.cpu().numpy().copy())
                oof["scores"].append(nms_scores.cpu().numpy().copy())
                oof["category"].append(global_class.cpu().numpy().copy())

            # Compute losses (the global classification loss is weighted by 0.1)
            classification_loss = classification_loss.mean()
            regression_loss = regression_loss.mean()
            global_classification_loss = global_classification_loss.mean()
            loss = classification_loss + regression_loss + global_classification_loss * 0.1

            # Add the current losses result to the loss history (list)
            loss_hist_valid.append(float(loss))
            loss_cls_hist_valid.append(float(classification_loss))
            loss_cls_global_hist_valid.append(float(global_classification_loss))
            loss_reg_hist_valid.append(float(regression_loss))
            data_iter.set_description(
                f"{epoch_num} cls: {np.mean(loss_cls_hist_valid):1.4f} cls g: {np.mean(loss_cls_global_hist_valid):1.4f} Reg: {np.mean(loss_reg_hist_valid):1.4f} Loss {np.mean(loss_hist_valid):1.4f}"
            )
            del classification_loss
            del regression_loss

        ##TO DO
        ##if save_oof:  # save predictions
        ##    pickle.dump(oof, open(f"{predictions_dir}/{epoch_num:03}.pkl", "wb"))

    return loss_hist_valid, loss_cls_hist_valid, loss_cls_global_hist_valid, loss_reg_hist_valid


In [None]:
import torch.optim as optim
from tqdm import tqdm
import gc 

def training_f(
    model_name: str,
    epochs: int,
    resume_weights: str="",
    resume_epoch: int=0):
  """Train a RetinaNet model and save it as `<model_name>_final.pt`.

  Args:
    model_name: one of 'resnet50', 'se_resnext50', 'pnasnet5', 'xception';
      also used in the name of the saved checkpoint.
    epochs: number of epochs to run.
    resume_weights: path of a model saved with torch.save; when non-empty the
      model is loaded from it instead of being built from `model_name`.
    resume_epoch: currently not used (the loop always starts at epoch 0).

  Raises:
    ValueError: if `model_name` is unknown and no `resume_weights` is given.

  Relies on the notebook-level names `device`, `train_dataloader` and
  `val_dataloader`. The encoder is kept in eval mode during the first epoch
  and BatchNorm layers are kept in eval mode during every epoch.
  """
  pretrained = True

  if resume_weights != "":
    # load weights to continue training; building (and downloading) a
    # pretrained model first would be wasted work
    print("load model from: ", resume_weights)
    retinanet = torch.load(resume_weights)
    # checkpoints written by earlier versions of this function contain the
    # DataParallel wrapper; unwrap it so the model is not wrapped twice below
    if isinstance(retinanet, torch.nn.DataParallel):
      retinanet = retinanet.module
  elif model_name == 'resnet50':
    retinanet = resnet50(1, pretrained)
  elif model_name == 'se_resnext50':
    retinanet = se_resnext50(1, pretrained)
  elif model_name == 'pnasnet5':
    retinanet = pnasnet5(1, pretrained)
  elif model_name == 'xception':
    retinanet = xception(1, pretrained)
  else:
    raise ValueError(f"unknown model_name: {model_name!r}")

  # TODO create checkpoint folders
  checkpoints_dir = '/content/drive/My Drive/Neural/checkpoints'
  predictions_dir = '/content/drive/My Drive/Neural/predictions'
  os.makedirs(checkpoints_dir, exist_ok=True)
  os.makedirs(predictions_dir, exist_ok=True)

  retinanet = torch.nn.DataParallel(retinanet.to(device)).to(device)

  optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)

  # the scheduler is created but its step() call at the end of the epoch is
  # still commented out, so the learning rate stays constant for now
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, patience=4, verbose=True, factor=0.2
  )

  # ADDED FROM OTHER FILES
  loss_hist = []

  #for epoch_num in range(resume_epoch+1, epochs):
  for epoch_num in range(epochs):
    retinanet.train()

    if epoch_num < 1:
      # train FC layers with freezed encoder for the first epoch
      retinanet.module.freeze_encoder()
    else:
      retinanet.module.unfreeze_encoder()

    # train() above switched BatchNorm back to training mode; undo that
    retinanet.module.freeze_bn()

    # losses
    epoch_loss, loss_cls_hist, loss_cls_global_hist, loss_reg_hist = [], [], [], []

    with torch.set_grad_enabled(True):
      data_iter = tqdm(enumerate(train_dataloader), total = len(train_dataloader))

      for iter_num, data in data_iter:
        optimizer.zero_grad()

        inputs = [
                  data['img'].to(device).float(),      #image
                  data['annot'].to(device).float(),    #boxes
                  data['category'].to(device)
        ]

        (classification_loss, regression_loss, global_classification_loss,) = retinanet(
            inputs, return_loss=True, return_boxes=False
            )

        # DataParallel may return one value per replica, hence the means;
        # the global classification loss is weighted by 0.1
        classification_loss = classification_loss.mean()
        regression_loss = regression_loss.mean()
        global_classification_loss = global_classification_loss.mean()
        loss = classification_loss + regression_loss + global_classification_loss*0.1

        loss.backward()
        torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.05)
        optimizer.step()

        # loss history
        loss_cls_hist.append(float(classification_loss))
        loss_cls_global_hist.append(float(global_classification_loss))
        loss_reg_hist.append(float(regression_loss))

        loss_hist.append(float(loss)) # taken from another file
        epoch_loss.append(float(loss))

        print(
          'Epoch: {} | Iteration: {} | \n\t\tClassification loss: {:1.5f} | Regression loss: {:1.5f} | \n\t\tGlobal loss: {:1.5f} | Running loss: {:1.5f}'.format(
            epoch_num, iter_num, float(classification_loss), float(regression_loss), float(global_classification_loss), np.mean(loss_hist))
        )

        # print losses with tqdm interator
        #data_iter.set_description(
        #  f"{epoch_num} cls: {np.mean(loss_cls_hist):1.4f} cls g: {np.mean(loss_cls_global_hist):1.4f} Reg: {np.mean(loss_reg_hist):1.4f} Loss: {np.mean(epoch_loss):1.4f}"
        #)
        del classification_loss
        del regression_loss

        gc.collect()
        torch.cuda.empty_cache()

    # TODO save model checkpoints
    #torch.save(retinanet.module, f"{checkpoints_dir}/{model_name}_{epoch_num:03}.pt")

    # validation
    (
      loss_hist_valid,
      loss_cls_hist_valid,
      loss_cls_global_hist_valid,
      loss_reg_hist_valid,
    ) = validation(retinanet,
        val_dataloader,
        epoch_num,
        predictions_dir,
        save_oof=True,
    )

    # log validation loss history
    #logger.scalar_summary("loss_valid", np.mean(loss_hist_valid), epoch_num)
    #logger.scalar_summary("loss_valid_classification", np.mean(loss_cls_hist_valid), epoch_num)
    #logger.scalar_summary(
    #  "loss_valid_global_classification", np.mean(loss_cls_global_hist_valid), epoch_num,
    #)
    #logger.scalar_summary("loss_valid_regression", np.mean(loss_reg_hist_valid), epoch_num)

    #scheduler.step(np.mean(loss_reg_hist_valid))


  retinanet.eval()
  # save the bare model, not the DataParallel wrapper, so that the file can
  # be loaded for resuming (which wraps it again) and for testing
  torch.save(retinanet.module, f"{checkpoints_dir}/{model_name}_final.pt")

In [None]:
## RUNNER ##

# Backbone to train; the other supported names are
# "se_resnext50", "pnasnet5" and "xception".
model_name = "resnet50"

print("TRAINING STARTED")

# five epochs, starting from the pretrained weights (nothing to resume)
training_f(model_name, epochs=5, resume_weights='', resume_epoch=0)

print("TRAINING FINISHED\n")

TRAINING STARTED


Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))




  0%|          | 0/6021 [00:00<?, ?it/s]

TypeError: ignored

In [None]:
# Free memory after a training run: collect unreachable Python objects first,
# then ask PyTorch to release the CUDA memory it holds in its cache.
gc.collect()
torch.cuda.empty_cache()

# Test

In [None]:
def test(
    model_name: str,
    img_size: int,
    fold: int,
    debug: bool,
    checkpoint: str,
    pics_dir: str
    ):
    """
    Load a model from a checkpoint, run it over `test_dataloader`, and plot
    ground truth and predictions for every batch.

    Args:
        model_name : string name of the model configuration (not used by the body)
        img_size   : image size (not used by the body)
        fold       : evaluation fold number, 0-3 (not used by the body)
        debug      : if True, runs debugging on few images (not used by the body)
        checkpoint : path of the model file passed to torch.load
        pics_dir   : directory for saving prediction images (created if missing)

    Relies on the notebook-level names `device`, `test_dataloader`,
    `dataset_valid` and `plt`. Only the first image of each batch is plotted.
    Ground-truth boxes are drawn in blue; predicted boxes in red, yellow
    (score < 0.3) or green (score < 0.25). Drawing stops at the first
    prediction scoring below 0.1.
    """

    # Load model
    model = torch.load(checkpoint)
    model = model.to(device)
    model.eval()

    os.makedirs(pics_dir, exist_ok=True)

    # Show a smart progress bar
    data_iter = tqdm(enumerate(test_dataloader), total=len(test_dataloader))

    for iter_num, data in data_iter:

        # Run model and save the return values of the model (see Model section for further info)
        # Gradients are not needed for inference.
        with torch.no_grad():
            (
                classification_loss,
                regression_loss,
                global_classification_loss,
                nms_scores,
                nms_class,
                transformed_anchors,
            ) = model(
                [
                   data['img'].to(device).float(),
                   data['annot'].to(device).float(),
                   data['category'].to(device)
                ],
                return_loss=True,
                return_boxes=True,
            )

        nms_scores = nms_scores.cpu().detach().numpy()
        nms_class = nms_class.cpu().detach().numpy()
        transformed_anchors = transformed_anchors.cpu().detach().numpy()

        # Print results
        print(f"nms_scores {nms_scores}, transformed_anchors.shape {transformed_anchors.shape}")
        print(f"cls loss: {float(classification_loss)}, global cls loss: {global_classification_loss}, reg loss: {float(regression_loss)}")
        # single quotes inside the f-string: reusing the outer quote character
        # is a syntax error before Python 3.12
        print(f"category: {data['category'].numpy()[0]} {np.exp(nms_class[0])} {dataset_valid.categories[data['category'][0]]}")

        # PLOT RESULTS

        # plot data and ground truth
        plt.figure(iter_num, figsize=(6, 6))
        plt.cla()
        plt.imshow(data["img"][0, 0].cpu().detach().numpy(), cmap=plt.cm.gist_gray)
        plt.axis("off")
        gt = data["annot"].cpu().detach().numpy()[0]
        for i in range(gt.shape[0]):
            if np.all(np.isfinite(gt[i])):
                p0 = gt[i, 0:2]
                p1 = gt[i, 2:4]
                plt.gca().add_patch(
                    plt.Rectangle(
                        p0,
                        width=(p1 - p0)[0],
                        height=(p1 - p0)[1],
                        fill=False,
                        edgecolor="b",
                        linewidth=2,
                    )
                )

        # add predicted boxes to the plot
        for i in range(len(nms_scores)):
            nms_score = nms_scores[i]
            if nms_score < 0.1:
                break
            p0 = transformed_anchors[i, 0:2]
            p1 = transformed_anchors[i, 2:4]
            color = "r"
            if nms_score < 0.3:
                color = "y"
            if nms_score < 0.25:
                color = "g"
            plt.gca().add_patch(
                plt.Rectangle(
                    p0,
                    width=(p1 - p0)[0],
                    height=(p1 - p0)[1],
                    fill=False,
                    edgecolor=color,
                    linewidth=2,
                )
            )
            plt.gca().text(p0[0], p0[1], f"{nms_score:.3f}", color=color)

        # save before showing: with inline backends the figure is gone after
        # plt.show(), so saving afterwards writes an empty image
        plt.savefig(
            f"{pics_dir}/predict_{iter_num}.eps", dpi=300, bbox_inches="tight", pad_inches=0,
        )
        plt.savefig(
            f"{pics_dir}/predict_{iter_num}.png", dpi=300, bbox_inches="tight", pad_inches=0,
        )
        plt.show()
        plt.close()

        print(nms_scores)