## Try out our model here.

We test our mutli-modal Faster R-CNN with MIMIC dataset here.

In [32]:
import pandas as pd
import numpy as np

import utils.print as print_f

from utils.coco_eval import get_eval_params_dict
from utils.engine import xami_train_one_epoch, xami_evaluate, get_iou_types
from utils.plot import plot_losses, plot_train_val_ap_ars, get_ap_ar_for_train_val
from utils.save import get_data_from_metric_logger
from utils.coco_utils import get_cocos

from models.setup import ModelSetup
from models.build import create_model_from_setup
from models.train import TrainingInfo
from utils.save import check_best, end_train
from data.load import get_datasets, get_dataloaders

from IPython.display import clear_output
from utils.eval import get_ar_ap
from utils.train import get_optimiser, get_lr_scheduler, print_params_setup
from utils.init import reproducibility, clean_memory_get_device
from data.constants import DEFAULT_REFLACX_LABEL_COLS, XAMI_MIMIC_PATH
from  datetime import datetime
import torch.optim as optim
from collections import OrderedDict

## Suppress the assignement warning from pandas.r
pd.options.mode.chained_assignment = None  # default='warn'

## Supress user warning
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

%matplotlib inline

In [33]:
device = clean_memory_get_device()
reproducibility()

This notebook will running on device: [CUDA]


### Define your MIMIC folde path here.

In [34]:
use_iobb = True
io_type_str = "IoBB" if use_iobb else "IoU"
labels_cols = DEFAULT_REFLACX_LABEL_COLS
iou_thrs = np.array([0.5])


model_setup =   ModelSetup(
        name="CXR_Clinical",
        use_clinical=True,
        use_custom_model=True,
        use_early_stop_model=True,
        best_ar_val_model_path=None,
        best_ap_val_model_path=None,
        final_model_path=None,
        backbone="resnet50",
        using_fpn=True,
        optimiser="sgd",
        lr=1e-3,
        # lr=1e-4,
        # weight_decay=0.001,
        weight_decay=0,
        pretrained=True,
        record_training_performance=True,
        dataset_mode="unified",
        image_size=512, 
        batch_size=4,
        warmup_epochs=0,
        # lr_scheduler="ReduceLROnPlateau",
        lr_scheduler=None,
        reduceLROnPlateau_factor=0.1,
        reduceLROnPlateau_patience=10,
        reduceLROnPlateau_full_stop=False,
        multiStepLR_milestones=[100],
        multiStepLR_gamma=0.1,
        mask_hidden_layers=256,
        use_mask=False,
        clinical_expand_dropout_rate=0,
        clinical_conv_dropout_rate=0,
        clinical_num_len=9,
        fuse_conv_channels=32,
        fuse_dropout_rate=0,
        fuse_depth=0,
        fusion_residule=False,

        # if it still have overfitting issue, then we don't need to increase the model size (we should rather to decrease it.)
        fusion_strategy="add",

        clinical_input_channels=64,

        clinical_expand_conv_channels= 64,

        representation_size=128, # 32

        box_head_dropout_rate=0,

        clinical_conv_channels=256,

        backbone_out_channels=16,

        gt_in_train_till=50,
    )


# Initiate datasets and dataloaders
The batch size is also defined in this section. For testing purpose, we only set it as 2.

In [35]:
dataset_params_dict = {
    "XAMI_MIMIC_PATH": XAMI_MIMIC_PATH,
    "with_clinical": model_setup.use_clinical,
    "dataset_mode": model_setup.dataset_mode,
    "bbox_to_mask": model_setup.use_mask,
    "labels_cols": DEFAULT_REFLACX_LABEL_COLS,
}

detect_eval_dataset, train_dataset, val_dataset, test_dataset = get_datasets(
    dataset_params_dict=dataset_params_dict
)

train_dataloader, val_dataloader, test_dataloader = get_dataloaders(
    train_dataset, val_dataset, test_dataset, batch_size=4
)

In [36]:
print(f"We used to have {len(detect_eval_dataset.df.dicom_id)}, after unifying, we will have {len(detect_eval_dataset.df.dicom_id.unique())}.")

We used to have 590, after unifying, we will have 590.


## Example instance from dataset
We show what's inside a single instance. It will provide:

- Images
- Clinical data
- Targets (Dictionary)

And, inside the target, there're:

- boxes (bounding boxes of abnormality)
- lable (disease index (Note: the class **0** means the background))
- image_id (idx to get that image)
- area (the areas that bouding boxes contain)
- iscrowd (if it's a place with multiple bouding boxes, we assume all the the bouding boxes are not crowd.)

In [37]:
train_dataset[0]

(tensor([[[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.7686, 0.7686, 0.7647,  ..., 1.0000, 1.0000, 1.0000],
          [0.7608, 0.7647, 0.7608,  ..., 1.0000, 1.0000, 1.0000],
          [0.7529, 0.7569, 0.7569,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.7686, 0.7686, 0.7647,  ..., 1.0000, 1.0000, 1.0000],
          [0.7608, 0.7647, 0.7608,  ..., 1.0000, 1.0000, 1.0000],
          [0.7529, 0.7569, 0.7569,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ...,

## Define Model.

We define he models here. Two backbone examples are in the below code section. The MobileNet is a light weight network, and ResNet is heavier, but usually perform better. In our case, the calculation is not the most important factor; therefore, we chose ResNet with feature pyramid networks (FPN) backbone.

In [38]:
model = create_model_from_setup(
    labels_cols,
    model_setup,
    rpn_nms_thresh=0.3,
    box_detections_per_img=10,
    box_nms_thresh=0.2,
    rpn_score_thresh=0.0,
    box_score_thresh=0.05,
    # image_size=model_setup.image_size,
    # clinical_conv_channels=64,
)

model.to(device)
model.train()


Load custom model
Using ResNet as backbone
Using pretrained backbone. resnet50
Using ResNet as clinical backbone
Not using pretrained MaksRCNN model.


MultimodalMaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
       

In [39]:
# from utils.train import print_params_setup
# print_params_setup(model)

## Prepare data to feed

We prepare three main data to test the model:

- CXR image
- Clinical data
- Target

And, for each data, we adjust the format to what the model expect.

In [40]:
data = next(iter(train_dataloader))
data = train_dataset.prepare_input_from_data(data, device)

# Test Feedforawrd (Training)

In [41]:
# labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
# import torch

In [42]:
data[-1][0]['labels']

tensor([2, 1, 3, 3], device='cuda:0')

In [43]:
# images, targets = data
# images, targets = model.transform(images, targets)

In [44]:
model.train()
loss_dict, outputs = model(*data[:-1], targets=data[-1])

In [45]:
[p.shape for p in model.roi_heads.proposals]

[torch.Size([472, 4]),
 torch.Size([411, 4]),
 torch.Size([437, 4]),
 torch.Size([442, 4])]

In [46]:
len(outputs)

4

In [47]:
import torch

f_clinical_data = torch.randn(4, 64)

In [48]:
torch.concat([f_clinical_data[i].repeat(p.shape[0], 1) for i, p in enumerate(model.roi_heads.proposals)], axis=0).shape

torch.Size([1762, 64])

In [49]:
len(model.roi_heads.pred_boxes)

4

In [50]:
model.roi_heads.head_out.shape

torch.Size([1762, 128])

In [51]:
model.roi_heads.box_head.x

tensor([[[[-1.5807e+00, -8.1208e-01, -8.7276e-01,  ..., -7.6924e-01,
           -2.8922e-01,  5.6094e-01],
          [-1.6409e+00, -9.2595e-01, -1.0593e+00,  ..., -6.0094e-01,
           -2.0629e-01,  5.4258e-01],
          [-1.6641e+00, -1.0060e+00, -1.0673e+00,  ..., -5.4746e-01,
           -2.2616e-01,  4.6571e-01],
          ...,
          [-1.5301e+00, -8.2996e-01, -5.6490e-01,  ..., -5.1849e-01,
           -6.1227e-02,  8.8286e-01],
          [-1.6655e+00, -8.3758e-01, -5.4293e-01,  ..., -4.0153e-01,
           -6.5609e-02,  6.7216e-01],
          [-1.2125e+00, -4.0748e-01, -2.7191e-01,  ...,  3.1971e-02,
            4.3191e-02,  7.1303e-01]],

         [[ 1.2257e+00,  1.5283e+00,  1.2250e+00,  ...,  1.1240e+00,
            1.3083e+00,  1.7230e+00],
          [ 1.1727e+00,  1.5417e+00,  1.3335e+00,  ...,  9.8297e-01,
            1.3541e+00,  1.6718e+00],
          [ 1.1940e+00,  1.5465e+00,  1.3681e+00,  ...,  1.0641e+00,
            1.2731e+00,  1.5643e+00],
          ...,
     

In [67]:
x = torch.concat([ model.roi_heads.box_head.x.flatten(start_dim=1), model.roi_heads.box_head.clinical_input], axis=1)

In [68]:
x.shape

torch.Size([1762, 12608])

In [53]:
# flatten that (channel, resolution, resolution)

In [54]:
1762/7

251.71428571428572

In [55]:
1762/4

440.5

In [56]:
model.roi_heads.box_head.clinical_input.shape

torch.Size([1762, 64])

In [57]:
loss_dict

{'loss_classifier': tensor(1.6793, device='cuda:0', grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.0074, device='cuda:0', grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.7035, device='cuda:0',
        grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0122, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)}

In [58]:
outputs

[{'boxes': tensor([[2718.5696, 1695.6396, 2971.0417, 2170.2285],
          [ 596.9617,    0.0000, 1122.3660,  450.0497],
          [2781.0330, 1411.9908, 2973.8511, 1572.9620],
          [ 759.5148, 1402.0880, 1026.3601, 1522.4780],
          [ 471.2844, 1760.8203,  734.3671, 1886.5193],
          [2320.0142, 1111.7979, 2444.2578, 1343.8405],
          [ 699.6614, 1311.3632,  954.8306, 1794.6321],
          [1428.4680, 2279.8499, 1680.8052, 2404.1248],
          [ 851.4467,  161.3954,  981.0978,  384.7087],
          [2787.1787, 1092.7142, 2979.2561, 1254.6187]], device='cuda:0',
         grad_fn=<StackBackward0>),
  'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
  'scores': tensor([0.2213, 0.2183, 0.2170, 0.2162, 0.2160, 0.2160, 0.2152, 0.2151, 0.2147,
          0.2144], device='cuda:0', grad_fn=<IndexBackward0>)},
 {'boxes': tensor([[190.1391, 223.7963, 415.7574, 367.7264],
          [349.2446, 294.4765, 573.0487, 431.7177],
          [217.8559, 364.9738, 436.8429

In [59]:
for k, v in model.img_features.items():
    print(f"[{k}]: {v.shape}")

[0]: torch.Size([4, 256, 128, 128])
[1]: torch.Size([4, 256, 64, 64])
[2]: torch.Size([4, 256, 32, 32])
[3]: torch.Size([4, 256, 16, 16])
[pool]: torch.Size([4, 256, 8, 8])


In [60]:
for k, v in model.clinical_features.items():
    print(f"[{k}]: {v.shape}")

[0]: torch.Size([4, 256, 128, 128])
[1]: torch.Size([4, 256, 64, 64])
[2]: torch.Size([4, 256, 32, 32])
[3]: torch.Size([4, 256, 16, 16])
[pool]: torch.Size([4, 256, 8, 8])


In [61]:
model.clinical_features['0'].shape

torch.Size([4, 256, 128, 128])

In [62]:
model.fuse_convs['0'](model.clinical_features['0']).shape

AttributeError: 'MultimodalMaskRCNN' object has no attribute 'fuse_convs'

In [None]:
clinical_features = OrderedDict({})

clinical_input = model.clinical_features['0']

In [None]:
clinical_input.shape

In [None]:
model.clinical_convs.keys()

In [None]:
model.clinical_convs['2']

In [None]:
clinical_input = model.clinical_convs['0'](clinical_input)

In [None]:
clinical_input.shape # shrink

In [None]:
clinical_features[k] = clinical_input


In [None]:
model.clinical_features['0'].shape

In [None]:
for k, v in clinical_features.items():
    print(f"[{k}]: {v.shape}")

In [None]:

clinical_features = OrderedDict({})

clinical_input = model.clinical_features['0']

for k in model.clinical_convs.keys():
    clinical_input = model.clinical_convs[k](clinical_input)
    clinical_features[k] = clinical_input


In [None]:
clinical_input.shape

In [None]:
for k, v in clinical_features.items():
    print(f"[{k}]: {v.shape}")

In [None]:
clinical_input = model.clinical_convs['0'](clinical_input)

In [None]:
clinical_input.shape

In [None]:
clinical_features[k] = clinical_input

In [None]:
clinical_features[k].shape

In [None]:
clinical_features['0'][-1][-1][-1]

In [None]:
clinical_features['1'][-1][-1][-1]

In [None]:
for k, v in model.img_features.items():
    print(f"[{k}]: {v.shape}")

In [None]:
images, targets = data

In [None]:
original_image_sizes= []
for img in images:
    val = img.shape[-2:]
    assert len(val) == 2
    original_image_sizes.append((val[0], val[1]))

images, targets = model.transform(images, targets)

In [None]:
img_features = model.backbone(images.tensors)

In [None]:
list(img_features.keys())[-1]

In [None]:
for k, v in img_features.items():
    print(f"[{k}]: {v.shape}")

## Results we get.
Four different losses are given in the result, we will use these losses to optimise the network while training. 

# Test Feedforawrd

### Detection.

A detection contain *boxes*, *lables*, and *scores*.

- *boxes*: All the bounding boxes for this image. 
- *lables*: Labels corresponded to the bounding boxes.
- *score*: Score (Confidence) for each boudning box.

In [None]:
loss_dict, outputs 