## Try out our model here.

We test our multi-modal Faster R-CNN with the MIMIC dataset here.

In [1]:
import torch, gc
import pandas as pd

from models.load import ModelSetup, create_model_from_setup
from utils.data import get_datasets, get_dataloaders

# Silence the chained-assignment warning that pandas emits.
pd.set_option("mode.chained_assignment", None)  # pandas default is "warn"

In [2]:
# Run a garbage-collection pass before touching the GPU.
gc.collect()
# Tip: torch.cuda.memory_summary(device=None, abbreviated=False) reports GPU memory usage.

# Decide once where the notebook runs; later cells reuse `use_gpu` and `device`.
use_gpu = torch.cuda.is_available()
if use_gpu:
    device = 'cuda'
else:
    device = 'cpu'

banner = f"This notebook will running on device: [{device.upper()}]"
print(banner)

# Only meaningful when CUDA is present: drop cached allocator memory.
if use_gpu:
    torch.cuda.empty_cache()

This notebook will running on device: [CUDA]


### Define your MIMIC folder path here.

In [3]:
# Root folder of the XAMI-MIMIC dataset. A raw string keeps the Windows
# backslash literal; a plain string would contain the invalid escape "\X",
# which newer Python versions warn about.
XAMI_MIMIC_PATH = r"D:\XAMI-MIMIC"

# Name of the overlap metric to report: "IoBB" when enabled, "IoU" otherwise.
use_iobb = True
io_type_str = "IoBB" if use_iobb else "IoU"

# Model configuration; it is consumed later when the datasets and the model are built.
model_setup = ModelSetup(
    name="custom_with_clinical",
    backbone="swin",
    use_custom_model=True,
    use_clinical=True,
    use_early_stop_model=True,
)

# Initiate datasets and dataloaders
The batch size is also defined in this section. For testing purposes, we set it to 4.

In [4]:
# Disease classes used as detection labels.
labels_cols = [
    "Enlarged cardiac silhouette",
    "Atelectasis",
    "Pleural abnormality",
    "Consolidation",
    "Pulmonary edema",
    # "Groundglass opacity",  # 6th disease (currently disabled).
]

# Options handed to `get_datasets`; clinical data is included only when the
# model setup asks for it.
dataset_params_dict = dict(
    XAMI_MIMIC_PATH=XAMI_MIMIC_PATH,
    with_clinical=model_setup.use_clinical,
    dataset_mode="normal",
    bbox_to_mask=True,
    labels_cols=labels_cols,
)

# Build the four datasets, then name each one.
datasets = get_datasets(dataset_params_dict=dataset_params_dict)
detect_eval_dataset, train_dataset, val_dataset, test_dataset = datasets

# Wrap the train/val/test splits in dataloaders that yield 4 samples per batch.
dataloaders = get_dataloaders(train_dataset, val_dataset, test_dataset, batch_size=4)
train_dataloader, val_dataloader, test_dataloader = dataloaders

In [5]:
# Compare the number of rows with the number of distinct DICOM ids.
dicom_ids = detect_eval_dataset.df.dicom_id
print(
    f"We used to have {len(dicom_ids)}, "
    f"after unifying, we will have {len(dicom_ids.unique())}."
)

We used to have 590, after unifying, we will have 590.


## Example instance from dataset
We show what's inside a single instance. It will provide:

- Images
- Clinical data
- Targets (Dictionary)

And, inside the target, there are:

- boxes (bounding boxes of abnormality)
- labels (disease index (Note: the class **0** means the background))
- image_id (idx to get that image)
- area (the areas that the bounding boxes contain)
- iscrowd (if it's a place with multiple bounding boxes, we assume all the bounding boxes are not crowd.)

In [6]:
# Display the first training sample (index 0) as returned by the dataset.
train_dataset[0]

(tensor([[[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.7686, 0.7686, 0.7647,  ..., 1.0000, 1.0000, 1.0000],
          [0.7608, 0.7647, 0.7608,  ..., 1.0000, 1.0000, 1.0000],
          [0.7529, 0.7569, 0.7569,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.7686, 0.7686, 0.7647,  ..., 1.0000, 1.0000, 1.0000],
          [0.7608, 0.7647, 0.7608,  ..., 1.0000, 1.0000, 1.0000],
          [0.7529, 0.7569, 0.7569,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[0.5608, 0.5608, 0.5569,  ..., 0.0000, 0.0000, 0.0000],
          [0.5569, 0.5647, 0.5647,  ..., 0.0000, 0.0000, 0.0000],
          [0.5490, 0.5569, 0.5608,  ...,

## Define Model.

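We define the models here. Two backbone examples are in the below code section. MobileNet is a lightweight network, while ResNet is heavier but usually performs better. In our case, the computation cost is not the most important factor; therefore, we chose ResNet with a feature pyramid network (FPN) backbone. Note: the `model_setup` defined earlier in this notebook passes `backbone='swin'`, so the model printed below uses a Swin Transformer with FPN (`SwinFPN`) as its backbone.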

In [7]:
# Build the model from `model_setup`, passing the evaluation dataset and image_size=512.
# NOTE(review): what the dataset is used for inside the factory is not visible here — confirm in `models.load`.
model = create_model_from_setup(detect_eval_dataset, model_setup, image_size=512)
# Move the model to the selected device; as the last expression of the cell, the result is displayed.
model.to(device)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


MultimodalMaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): SwinFPN(
    (swin): SwinTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
        (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (layers): ModuleList(
        (0): BasicLayer(
          (blocks): ModuleList(
            (0): SwinTransformerBlock(
              (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attn): WindowAttention(
                (qkv): Linear(in_features=96, out_features=288, bias=True)
                (attn_drop): Dropout(p=0.0, inplace=False)
                (proj): Linear(in_features=96, out_features=96, bias=True)
                (proj_drop): Dropout(p=0.0, inplace=False)
                (sof

## Prepare data to feed

We prepare three main pieces of data to test the model:

- CXR image
- Clinical data
- Target

And, for each of them, we adjust the format to what the model expects.

In [8]:
# Take one batch from the training loader, then let the dataset convert it
# into the inputs the model consumes, placed on `device`.
batch = next(iter(train_dataloader))
data = train_dataset.prepare_input_from_data(batch, device)

In [9]:
# Display the last element of the prepared batch: the list of per-image target dicts.
data[-1]

[{'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           ...,
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0]]], device='cuda:0', dtype=torch.uint8),
  'image_path': 'D:\\XAMI-MIMIC\\patient_14207656\\CXR-JPG\\s58855997\\f31af046-d69a7327-999db5bc-f939baa6-084a9064.jpg',
  'dicom_id': 'f31af046-d69a7327-999db5bc-f939baa6-084a9064',
  'iscrowd': tensor([0], device='cuda:0'),
  'area': tensor([388388.], device='cuda:0', dtype=torch.float64),
  'image_id': tensor([321], device='cuda:0'),
  'labels': tensor([2], device='cuda:0'),
  'boxes': tensor([[1241., 1317., 1920., 1889.]], device='cuda:0', dtype=torch.float64)},
 {'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           ...,
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0, 0],
           [0, 0, 0,  ..., 0, 0,

# Test Feedforward (Training)

In [10]:
# Forward pass with targets supplied: every element of `data` except the last
# is a model input, and the last element holds the targets.
model_inputs, batch_targets = data[:-1], data[-1]
output = model(*model_inputs, targets=batch_targets)

In [11]:
# Unpack the prepared batch into its four parts.
images, clinical_num, clinical_cat, targets = data
# Run the model's own `transform` stage on the images and targets.
images, targets = model.transform(images, targets)

In [12]:
# Pass the batched image tensor through the backbone to obtain the image feature maps.
img_features = model.backbone(images.tensors)

In [13]:
# Report the tensor shape of every feature level produced by the backbone.
for level, feature_map in img_features.items():
    print(f"[{level}]: {feature_map.shape}")


[0]: torch.Size([4, 256, 128, 128])
[1]: torch.Size([4, 256, 64, 64])
[2]: torch.Size([4, 256, 32, 32])
[3]: torch.Size([4, 256, 16, 16])
[pool]: torch.Size([4, 256, 8, 8])


In [14]:
# Compute the clinical feature maps from the (numerical, categorical) clinical inputs.
clinical_features = model.get_clinical_features((clinical_num, clinical_cat))

In [15]:
# Report the tensor shape of every clinical feature level.
for level, feature_map in clinical_features.items():
    print(f"[{level}]: {feature_map.shape}")


[pool]: torch.Size([4, 256, 8, 8])
[3]: torch.Size([4, 256, 16, 16])
[2]: torch.Size([4, 256, 32, 32])
[1]: torch.Size([4, 256, 64, 64])
[0]: torch.Size([4, 256, 128, 128])


## Results we get.
Five different losses are given in the result; we will use these losses to optimise the network while training. 

In [16]:
# Display the dictionary of losses returned by the forward pass with targets.
output

{'loss_classifier': tensor(6.4265, device='cuda:0', grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.0205, device='cuda:0', grad_fn=<DivBackward0>),
 'loss_mask': tensor(0.6231, device='cuda:0',
        grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_objectness': tensor(0.6912, device='cuda:0',
        grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0101, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)}

# Test Feedforward (Evaluation)

In [17]:
# Switch the model to evaluation mode; called without targets it returns
# detections instead of losses.
model.eval()
# No backward pass follows, so disable autograd: this avoids building a
# computation graph and holding the extra GPU memory it needs.
with torch.no_grad():
    pred = model(*data[:-1])

## Results we get.
If we set the model to evaluation mode and don't pass the targets to the forward function, the model will output predictions (detections). In the sections below, we show what's inside the detection of the second instance (idx=1).

### Detection.

A detection contains *boxes*, *labels*, *scores*, and *masks*.

- *boxes*: All the bounding boxes for this image. 
- *labels*: Labels corresponding to the bounding boxes.
- *scores*: Score (confidence) for each bounding box.
- *masks*: Mask predicted for each detection.

In [18]:
# List the fields available in the detection for the batch item at index 1.
pred[1].keys()

dict_keys(['boxes', 'labels', 'scores', 'masks'])

In [19]:
# Bounding boxes predicted for the batch item at index 1.
pred[1]['boxes']

tensor([], device='cuda:0', size=(0, 4), grad_fn=<StackBackward0>)

In [20]:
# Class labels predicted for the batch item at index 1.
pred[1]['labels']

tensor([], device='cuda:0', dtype=torch.int64)

In [21]:
# Confidence scores predicted for the batch item at index 1.
pred[1]['scores']

tensor([], device='cuda:0', grad_fn=<IndexBackward0>)