## Try out our model here.

We test our mutli-modal Faster R-CNN with MIMIC dataset here.

In [1]:
import pandas as pd
import numpy as np

import utils.print as print_f

from utils.coco_eval import get_eval_params_dict
from utils.engine import xami_train_one_epoch, xami_evaluate, get_iou_types
from utils.plot import plot_losses, plot_train_val_ap_ars, get_ap_ar_for_train_val
from utils.save import get_data_from_metric_logger
from utils.coco_utils import get_cocos

from models.setup import ModelSetup
from models.build import create_model_from_setup
from models.train import TrainingInfo
from utils.save import check_best, end_train
from data.load import get_datasets, get_dataloaders

from IPython.display import clear_output
from utils.eval import get_ar_ap
from utils.train import get_optimiser, get_lr_scheduler, print_params_setup
from utils.init import reproducibility, clean_memory_get_device
from data.constants import DEFAULT_REFLACX_LABEL_COLS, XAMI_MIMIC_PATH
from  datetime import datetime
import torch.optim as optim
from collections import OrderedDict

## Suppress the assignement warning from pandas.r
pd.options.mode.chained_assignment = None  # default='warn'

## Supress user warning
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

%matplotlib inline

  assert (


In [2]:
device = clean_memory_get_device()
reproducibility()

This notebook will running on device: [CUDA]


### Define your MIMIC folde path here.

In [3]:
use_iobb = True
io_type_str = "IoBB" if use_iobb else "IoU"
labels_cols = DEFAULT_REFLACX_LABEL_COLS
iou_thrs = np.array([0.5])


model_setup =  ModelSetup(
        name="CXR_Clinical",
        use_clinical=True,
        use_custom_model=True,
        use_early_stop_model=True,
        best_ar_val_model_path=None,
        best_ap_val_model_path=None,
        final_model_path=None,
        backbone="resnet50",
        optimiser="sgd",
        lr=1e-2,
        # lr=1e-4,
        # weight_decay=0.001,
        weight_decay=0,
        pretrained=True,
        record_training_performance=True,
        dataset_mode="unified",
        image_size=256,
        backbone_out_channels=16,
        batch_size=4,
        warmup_epochs=0,
        lr_scheduler="ReduceLROnPlateau",
        # lr_scheduler=None,
        reduceLROnPlateau_factor=0.1,
        reduceLROnPlateau_patience=10,
        reduceLROnPlateau_full_stop=False,
        multiStepLR_milestones=[30, 50, 70, 90],
        multiStepLR_gamma=0.1,
        representation_size=32,
        mask_hidden_layers=256,
        using_fpn=True,
        use_mask=False,
        clinical_expand_dropout_rate=0,
        clinical_conv_dropout_rate=0,
        clinical_input_channels=32,
        clinical_num_len=9,
        clinical_conv_channels=256,
        fuse_conv_channels=32,
        fuse_dropout_rate=0,
        box_head_dropout_rate=0,
        fuse_depth=4,
        fusion_strategy="add",
        fusion_residule=False,
    )



# Initiate datasets and dataloaders
The batch size is also defined in this section. For testing purpose, we only set it as 2.

In [4]:
dataset_params_dict = {
    "XAMI_MIMIC_PATH": XAMI_MIMIC_PATH,
    "with_clinical": model_setup.use_clinical,
    "dataset_mode": model_setup.dataset_mode,
    "bbox_to_mask": model_setup.use_mask,
    "labels_cols": DEFAULT_REFLACX_LABEL_COLS,
}

detect_eval_dataset, train_dataset, val_dataset, test_dataset = get_datasets(
    dataset_params_dict=dataset_params_dict
)

train_dataloader, val_dataloader, test_dataloader = get_dataloaders(
    train_dataset, val_dataset, test_dataset, batch_size=4
)

In [5]:
print(f"We used to have {len(detect_eval_dataset.df.dicom_id)}, after unifying, we will have {len(detect_eval_dataset.df.dicom_id.unique())}.")

We used to have 590, after unifying, we will have 590.


## Example instance from dataset
We show what's inside a single instance. It will provide:

- Images
- Clinical data
- Targets (Dictionary)

And, inside the target, there're:

- boxes (bounding boxes of abnormality)
- lable (disease index (Note: the class **0** means the background))
- image_id (idx to get that image)
- area (the areas that bouding boxes contain)
- iscrowd (if it's a place with multiple bouding boxes, we assume all the the bouding boxes are not crowd.)

In [6]:
train_dataset[0]

(tensor([[[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.7686, 0.7686, 0.7647,  ..., 1.0000, 1.0000, 1.0000],
          [0.7608, 0.7647, 0.7608,  ..., 1.0000, 1.0000, 1.0000],
          [0.7529, 0.7569, 0.7569,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.7686, 0.7686, 0.7647,  ..., 1.0000, 1.0000, 1.0000],
          [0.7608, 0.7647, 0.7608,  ..., 1.0000, 1.0000, 1.0000],
          [0.7529, 0.7569, 0.7569,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ...,

## Define Model.

We define he models here. Two backbone examples are in the below code section. The MobileNet is a light weight network, and ResNet is heavier, but usually perform better. In our case, the calculation is not the most important factor; therefore, we chose ResNet with feature pyramid networks (FPN) backbone.

In [7]:
model = create_model_from_setup(
    labels_cols,
    model_setup,
    rpn_nms_thresh=0.3,
    box_detections_per_img=10,
    box_nms_thresh=0.2,
    rpn_score_thresh=0.0,
    box_score_thresh=0.05,
    # image_size=model_setup.image_size,
    # clinical_conv_channels=64,
)

model.to(device)
model.train()


Load custom model
Using ResNet as backbone
Using pretrained backbone. resnet50
Not using pretrained MaksRCNN model.


MultimodalMaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
       

In [8]:
from utils.train import print_params_setup
print_params_setup(model)

[model]: 40,891,931
[model.backbone]: 26,799,296
[model.rpn]: 593,935
[model.roi_heads]: 403,486
[model.roi_heads.box_head]: 402,496
[model.roi_heads.box_head.fc6]: 401,440
[model.roi_heads.box_head.fc7]: 1,056
[model.roi_heads.box_predictor]: 990
[model.clinical_convs]: 3,543,552
[model.fuse_convs]: 2,952,960


## Prepare data to feed

We prepare three main data to test the model:

- CXR image
- Clinical data
- Target

And, for each data, we adjust the format to what the model expect.

In [9]:
data = next(iter(train_dataloader))
data = train_dataset.prepare_input_from_data(data, device)

# Test Feedforawrd (Training)

In [10]:
# labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
# import torch

In [11]:
data[-1][0]['labels']

tensor([2, 1, 3, 3], device='cuda:0')

In [12]:
# images, targets = data
# images, targets = model.transform(images, targets)

In [13]:
model.train()
loss_dict, outputs = model(*data[:-1], targets=data[-1])

In [38]:
loss_dict

{'loss_classifier': tensor(1.8202, device='cuda:0', grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.0120, device='cuda:0', grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.6944, device='cuda:0',
        grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0041, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)}

In [39]:
outputs

[{'boxes': tensor([[ 727.8102, 1637.8724,  986.2892, 2099.2532],
          [1223.2861, 2047.7300, 1629.6429, 2353.9402],
          [1358.6228, 1034.7659, 1909.6292, 1239.0436],
          [1039.1136, 2289.4778, 1447.1873, 2544.0000],
          [ 610.3279, 1914.5939,  878.5331, 2401.5198],
          [   0.0000, 1394.8024,  927.4928, 1840.9017],
          [1195.2316, 1532.8962, 1458.2839, 1984.2397],
          [   0.0000, 1599.1731,  126.5366, 2048.1160],
          [2754.4048,   87.3961, 3048.7983,  395.0801],
          [2259.4878,  775.1384, 2637.0986, 1098.3540]], device='cuda:0',
         grad_fn=<StackBackward0>),
  'labels': tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0'),
  'scores': tensor([0.2139, 0.2126, 0.2122, 0.2118, 0.2110, 0.2086, 0.2084, 0.2080, 0.2079,
          0.2078], device='cuda:0', grad_fn=<IndexBackward0>)},
 {'boxes': tensor([[ 281.3226, 2651.7107,  510.8072, 3056.0000],
          [ 447.5430, 2615.5415,  671.3224, 3056.0000],
          [2114.5415, 2726.0354

In [14]:
for k, v in model.img_features.items():
    print(f"[{k}]: {v.shape}")

[0]: torch.Size([4, 256, 64, 64])
[1]: torch.Size([4, 256, 32, 32])
[2]: torch.Size([4, 256, 16, 16])
[3]: torch.Size([4, 256, 8, 8])
[pool]: torch.Size([4, 256, 4, 4])


In [15]:
for k, v in model.clinical_features.items():
    print(f"[{k}]: {v.shape}")

[0]: torch.Size([4, 256, 64, 64])
[1]: torch.Size([4, 256, 32, 32])
[2]: torch.Size([4, 256, 16, 16])
[3]: torch.Size([4, 256, 8, 8])
[pool]: torch.Size([4, 256, 4, 4])


In [16]:
model.clinical_features['0'].shape

torch.Size([4, 256, 64, 64])

In [17]:
model.fuse_convs['0'](model.clinical_features['0']).shape

torch.Size([4, 256, 32, 32])

In [18]:
clinical_features = OrderedDict({})

clinical_input = model.clinical_features['0']

In [19]:
clinical_input.shape

torch.Size([4, 256, 64, 64])

In [20]:
model.clinical_convs.keys()

odict_keys(['0', '1', '2', '3', 'pool'])

In [21]:
model.clinical_convs['2']

Sequential(
  (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)

In [22]:
clinical_input = model.clinical_convs['0'](clinical_input)

In [23]:
clinical_input.shape # shrink

torch.Size([4, 256, 16, 16])

In [24]:
clinical_features[k] = clinical_input


In [25]:
model.clinical_features['0'].shape

torch.Size([4, 256, 64, 64])

In [26]:
for k, v in clinical_features.items():
    print(f"[{k}]: {v.shape}")

[pool]: torch.Size([4, 256, 16, 16])


In [27]:

clinical_features = OrderedDict({})

clinical_input = model.clinical_features['0']

for k in model.clinical_convs.keys():
    clinical_input = model.clinical_convs[k](clinical_input)
    clinical_features[k] = clinical_input


In [28]:
clinical_input.shape

torch.Size([4, 256, 1, 1])

In [29]:
for k, v in clinical_features.items():
    print(f"[{k}]: {v.shape}")

[0]: torch.Size([4, 256, 16, 16])
[1]: torch.Size([4, 256, 8, 8])
[2]: torch.Size([4, 256, 4, 4])
[3]: torch.Size([4, 256, 2, 2])
[pool]: torch.Size([4, 256, 1, 1])


In [30]:
clinical_input = model.clinical_convs['0'](clinical_input)

In [31]:
clinical_input.shape

torch.Size([4, 256, 1, 1])

In [32]:
clinical_features[k] = clinical_input

In [33]:
clinical_features[k].shape

torch.Size([4, 256, 1, 1])

In [34]:
clinical_features['0'][-1][-1][-1]

tensor([0.3652, 0.0000, 0.0000, 0.4546, 0.0000, 0.8485, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.2708, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [35]:
clinical_features['1'][-1][-1][-1]

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.7062, 0.0000, 0.0000, 0.0000],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [36]:
for k, v in model.img_features.items():
    print(f"[{k}]: {v.shape}")

[0]: torch.Size([4, 256, 64, 64])
[1]: torch.Size([4, 256, 32, 32])
[2]: torch.Size([4, 256, 16, 16])
[3]: torch.Size([4, 256, 8, 8])
[pool]: torch.Size([4, 256, 4, 4])


In [37]:
images, targets = data

ValueError: too many values to unpack (expected 2)

In [None]:
original_image_sizes= []
for img in images:
    val = img.shape[-2:]
    assert len(val) == 2
    original_image_sizes.append((val[0], val[1]))

images, targets = model.transform(images, targets)

In [None]:
img_features = model.backbone(images.tensors)

In [None]:
list(img_features.keys())[-1]

'pool'

In [None]:
for k, v in img_features.items():
    print(f"[{k}]: {v.shape}")

[0]: torch.Size([4, 256, 64, 64])
[1]: torch.Size([4, 256, 32, 32])
[2]: torch.Size([4, 256, 16, 16])
[3]: torch.Size([4, 256, 8, 8])
[pool]: torch.Size([4, 256, 4, 4])


## Results we get.
Four different losses are given in the result, we will use these losses to optimise the network while training. 

# Test Feedforawrd

### Detection.

A detection contain *boxes*, *lables*, and *scores*.

- *boxes*: All the bounding boxes for this image. 
- *lables*: Labels corresponded to the bounding boxes.
- *score*: Score (Confidence) for each boudning box.

In [None]:
loss_dict, outputs 

({'loss_classifier': tensor(1.8701, device='cuda:0', grad_fn=<NllLossBackward0>),
  'loss_box_reg': tensor(0.0179, device='cuda:0', grad_fn=<DivBackward0>),
  'loss_objectness': tensor(0.6847, device='cuda:0',
         grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
  'loss_rpn_box_reg': tensor(0.0115, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)},
 [{'boxes': tensor([[1.6144e+03, 1.2710e+03, 1.8540e+03, 1.7224e+03],
           [1.4235e+03, 1.2276e+03, 1.6649e+03, 1.6887e+03],
           [2.5160e+03, 2.3314e+03, 2.9126e+03, 2.5440e+03],
           [1.2205e+03, 1.3219e+03, 1.4771e+03, 1.7699e+03],
           [7.5401e+02, 2.3299e+03, 1.1488e+03, 2.5440e+03],
           [5.5990e+02, 5.8346e-01, 8.1155e+02, 2.4206e+02],
           [3.1914e+02, 2.3218e+03, 7.2241e+02, 2.5429e+03],
           [1.5083e+03, 1.5982e+03, 2.3042e+03, 2.2397e+03],
           [1.5508e+03, 2.0036e+03, 1.8119e+03, 2.4510e+03],
           [2.3595e+03, 4.5017e+00, 2.6145e+03, 2.6230e+02]], device='cu