In [1]:
import pandas as pd
import numpy as np

import utils.print as print_f

from utils.coco_eval import get_eval_params_dict
from utils.engine import xami_train_one_epoch, xami_evaluate, get_iou_types
from utils.plot import plot_losses, plot_train_val_ap_ars, get_ap_ar_for_train_val
from utils.save import get_data_from_metric_logger
from utils.coco_utils import get_cocos

from models.setup import ModelSetup
from models.build import create_model_from_setup
from models.train import TrainingInfo
from utils.save import check_best, end_train
from data.load import get_datasets, get_dataloaders

from IPython.display import clear_output
from utils.eval import get_ar_ap
from utils.train import get_optimiser, get_lr_scheduler, print_params_setup
from utils.init import reproducibility, clean_memory_get_device
from data.constants import DEFAULT_REFLACX_LABEL_COLS, XAMI_MIMIC_PATH
from  datetime import datetime
import torch.optim as optim
import torch

## Suppress the assignement warning from pandas.r
pd.options.mode.chained_assignment = None  # default='warn'

## Supress user warning
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

%matplotlib inline

In [2]:
torch.set_default_dtype(torch.float64)

In [3]:
device = clean_memory_get_device()
reproducibility()

This notebook will running on device: [CUDA]


In [4]:
import torch 
torch.randn(2,3)

# pytorch reproducibility work.

tensor([[ 1.5410, -0.2934, -2.1788],
        [ 0.5684, -1.0845, -1.3986]])

In [5]:
## then we prepare the data. 
use_iobb = True
io_type_str = "IoBB" if use_iobb else "IoU"
labels_cols = DEFAULT_REFLACX_LABEL_COLS
iou_thrs = np.array([0.5])


model_setup =  ModelSetup(
        name="CXR+Clinical",
        use_clinical=True,
        use_custom_model=True,
        use_early_stop_model=True,
        best_ar_val_model_path=None,
        best_ap_val_model_path=None,
        final_model_path=None,
        backbone="mobilenet_v3",
        optimiser="sgd",
        lr=1e-3,
        # lr=1e-4,
        # weight_decay=0.001,
        weight_decay=0,
        pretrained=True,
        record_training_performance=True,
        dataset_mode="unified",
        image_size=256,
        backbone_out_channels=16,
        batch_size=4,
        warmup_epochs=0,
        lr_scheduler="ReduceLROnPlateau",
        # lr_scheduler=None,
        reduceLROnPlateau_factor=0.1,
        reduceLROnPlateau_patience=5,
        reduceLROnPlateau_full_stop=True,
        multiStepLR_milestones=[30, 50, 70, 90],
        multiStepLR_gamma=0.1,
        representation_size=32,
        mask_hidden_layers=256,
        using_fpn=False,
        use_mask=False,
        clinical_expand_dropout_rate=0,
        clinical_conv_dropout_rate=0,
        clinical_input_channels=32,
        clinical_num_len=9,
        clinical_conv_channels=32,
        fuse_conv_channels=32,
        fuse_dropout_rate=0,
        box_head_dropout_rate=0,
        fuse_depth=4,
        fusion_strategy="concat",
        fusion_residule=False,
    )


In [6]:
from data.transforms import get_transform
from data.datasets import ReflacxDataset, collate_fn
from data.load import seed_worker, get_dataloader_g
from torch.utils.data import DataLoader
from utils.coco_utils import get_coco_api_from_dataset

################ Datasets ################
dataset_params_dict = {
    "XAMI_MIMIC_PATH": XAMI_MIMIC_PATH,
    "with_clinical": model_setup.use_clinical,
    "dataset_mode": model_setup.dataset_mode,
    "bbox_to_mask": model_setup.use_mask,
    "labels_cols": labels_cols,
}

train_dataset = ReflacxDataset(
        **dataset_params_dict, split_str="train", transforms=get_transform(train=True), 
    )

train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
    worker_init_fn=seed_worker,
    generator=get_dataloader_g(0),
)

train_coco = get_coco_api_from_dataset(train_dataloader.dataset)

eval_params_dict = get_eval_params_dict(
    train_dataset, iou_thrs=iou_thrs, use_iobb=use_iobb,
)

############### Model ###############

model = create_model_from_setup(
    labels_cols,
    model_setup,
    rpn_nms_thresh=0.3,
    box_detections_per_img=10,
    box_nms_thresh=0.2,
    rpn_score_thresh=0.0,
    box_score_thresh=0.05,
)

model.to(device)

creating index...
index created!
creating index...
index created!
Load custom model
Using pretrained backbone. mobilenet_v3


MultimodalMaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): Sequential(
    (0): Sequential(
      (0): ConvNormActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
            (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1))
    

In [7]:
## then we check whether the dataloader laod the same data.
data = next(iter(train_dataloader))
data = train_dataset.prepare_input_from_data(data, device)

In [8]:
print(data[-1][0]['image_id'])
print(data[-1][0]['dicom_id'])

tensor([293], device='cuda:0')
7c6edd07-7c4c8c3e-d8b216c7-88d2d002-975f90e3


In [9]:
## the dataload correctly.

In [10]:
# model.train()
# loss_dict, outputs = model(*data[:-1], targets=data[-1])

In [11]:
# loss_dict

In [12]:
# initial model is correct.

# outputs

In [13]:
# {'loss_classifier': tensor(1.8194, device='cuda:0', grad_fn=<NllLossBackward0>),
#  'loss_box_reg': tensor(0.0357, device='cuda:0', grad_fn=<DivBackward0>),
#  'loss_objectness': tensor(0.6932, device='cuda:0',
#         grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
#  'loss_rpn_box_reg': tensor(0.4562, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)}


# [{'boxes': tensor([[2.5204e+00, 1.0799e+00, 2.7798e+02, 1.0702e+02],
#           [9.9822e+00, 4.4080e+00, 1.1009e+03, 4.3683e+02],
#           [2.7997e+01, 2.5075e+01, 3.0560e+03, 2.4849e+03],
#           [2.3693e+03, 1.1362e+03, 2.6742e+03, 1.4973e+03],
#           [4.9516e+02, 9.4079e+02, 8.7172e+02, 1.5129e+03],
#           [9.3348e+02, 9.3862e+02, 1.4424e+03, 1.6008e+03],
#           [4.9111e+02, 9.4757e+02, 8.7426e+02, 1.5319e+03],
#           [0.0000e+00, 1.1112e-01, 2.8200e+02, 1.1193e+02],
#           [0.0000e+00, 4.5356e-01, 1.1169e+03, 4.5688e+02],
#           [0.0000e+00, 2.5801e+00, 3.0560e+03, 2.5440e+03]], device='cuda:0',
#          grad_fn=<StackBackward0>),
#   'labels': tensor([4, 4, 4, 4, 4, 4, 3, 5, 5, 5], device='cuda:0'),
#   'scores': tensor([0.1903, 0.1903, 0.1903, 0.1897, 0.1883, 0.1866, 0.1768, 0.1765, 0.1765,
#           0.1765], device='cuda:0', grad_fn=<IndexBackward0>)},
#  {'boxes': tensor([[2.8612e+00, 1.1996e+01, 2.3124e+02, 5.2541e+02],
#           [1.5962e+01, 3.4023e+01, 1.2900e+03, 1.4902e+03],
#           [0.0000e+00, 6.5438e-01, 2.3528e+02, 5.5139e+02],
#           [0.0000e+00, 1.8560e+00, 1.3125e+03, 1.5639e+03],
#           [1.9556e+00, 8.2432e+00, 2.3229e+02, 5.3670e+02],
#           [1.0909e+01, 2.3380e+01, 1.2958e+03, 1.5222e+03],
#           [0.0000e+00, 0.0000e+00, 2.2641e+02, 5.5311e+02],
#           [0.0000e+00, 0.0000e+00, 1.2630e+03, 1.5688e+03],
#           [0.0000e+00, 1.3181e+01, 2.3259e+02, 5.3898e+02],
#           [0.0000e+00, 3.7384e+01, 1.2975e+03, 1.5287e+03]], device='cuda:0',
#          grad_fn=<StackBackward0>),
# ...
#           [7.4653e+02, 1.4055e+03, 1.1647e+03, 1.8839e+03]], device='cuda:0',
#          grad_fn=<StackBackward0>),
#   'labels': tensor([4, 4, 4, 4, 3, 3, 3, 3, 5, 5], device='cuda:0'),
#   'scores': tensor([0.1931, 0.1911, 0.1891, 0.1891, 0.1834, 0.1834, 0.1824, 0.1818, 0.1684,
#           0.1673], device='cu da:0', grad_fn=<IndexBackward0>)}]

In [14]:
dynamic_loss_weight = None
params = [p for p in model.parameters() if p.requires_grad]
if dynamic_loss_weight:
    params += [p for p in dynamic_loss_weight.parameters() if p.requires_grad]

iou_types = get_iou_types(model, model_setup)
optimizer = torch.optim.SGD(
            params,
            lr=torch.tensor(0.01)

            
        )
lr_scheduler = get_lr_scheduler(optimizer, model_setup)

# train_evaluator, train_loger = xami_train_one_epoch(
#     model=model,
#     optimizer=optimizer,
#     data_loader=train_dataloader,
#     device=device,
#     epoch=0,
#     print_freq=10,
#     iou_types=iou_types,
#     coco=train_coco,
#     score_thres=None,
#     evaluate_on_run=True,
#     params_dict=eval_params_dict,
#     dynamic_loss_weight=dynamic_loss_weight,
# )

In [15]:
# optimizer.param_groups[0]['lr'].item()
# #0.009999999776482582

In [16]:
# model.train()
# loss_dict, outputs = model(*data[:-1], targets=data[-1])

In [17]:
def loss_multiplier(loss_dict):
    loss_dict["loss_classifier"] *= 10
    loss_dict["loss_box_reg"] *= 5

    loss_dict["loss_objectness"] *= 1e-5
    loss_dict["loss_rpn_box_reg"] *= 1e-5

    return loss_dict


In [18]:
# with torch.cuda.amp.autocast(enabled=False):
#         loss_dict, outputs = model(*data[:-1], targets=data[-1])
#         loss_dict = loss_multiplier(loss_dict)

#         if dynamic_loss_weight:
#             # loss_dict["loss_objectness"] *= 4
#             # loss_dict["loss_rpn_box_reg"] *= 2
#             losses = dynamic_loss_weight(loss_dict)
#         else:
#             losses = sum(loss for loss in loss_dict.values())

In [19]:
# losses

In [20]:
# tensor(18.3728, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)
# tensor(18.3728, device='cuda:0', dtype=torch.float64, grad_fn=<AddBackward0>)

In [21]:
# optimizer.zero_grad()
# losses.backward()
# optimizer.step()

In [22]:
from utils import detect_utils

In [23]:
# for i, data in enumerate(train_dataloader):
#     if i==0:
#         first_data = data
#     elif i == 1:
#         second_data = data
#     else:
#         raise StopIteration()

# [target['dicom_id'] for target in first_data[-1]]

# ['4f3b79f0-4c4d27f8-23240d1e-515d66ca-6fde3e41',
#  '4a629500-9c3281ca-90bab490-9b6ac9c1-e5e6a580',
#  'a586a4ad-997f80a9-38bbdd65-cb92a6cc-51c8520e',
#  'a5264d2c-5d74521a-9fc6b28a-4991fbd0-ad4c15d1']


# [target['dicom_id'] for target in second_data[-1]]
## the dataset is checked.

# ['4f4218c0-7e3de34f-abade5db-964b2d47-addcc964',
#  'b967d8e3-c164811b-04ff781e-85201e77-94f258ef',
#  '4de3b906-58b53e1d-27ad92d5-ca85ff56-83cffd81',
#  '1ffc255f-2064c19a-a5cfad02-c22fa939-5db5f9c1']
    

In [24]:
model.train()
# if epoch == 1:
#     print("start wariming up ")
#     warmup_factor = 1.0 / 1000
#     warmup_iters = min(1000, len(data_loader) - 1)

#     lr_scheduler = torch.optim.lr_scheduler.LinearLR(
#         optimizer, start_factor=warmup_factor, total_iters=warmup_iters
#     )

for i, data in enumerate(train_dataloader):
    data = train_dataloader.dataset.prepare_input_from_data(data, device)
    with torch.cuda.amp.autocast(enabled=False, dtype=torch.float32):
        loss_dict, outputs = model(*data[:-1], targets=data[-1])
        loss_dict = loss_multiplier(loss_dict)

        if dynamic_loss_weight: 
            # loss_dict["loss_objectness"] *= 4
            # loss_dict["loss_rpn_box_reg"] *= 2
            losses = dynamic_loss_weight(loss_dict)
        else:
            losses = sum(loss for loss in loss_dict.values())


    print_f.print_title(f"Iter {i}")
    print(losses)
    print(losses.item())


    print(model.fuse_convs[0].weight.data[0][0])
    print(model.fuse_convs[0].weight.data[0][0][0][0].item())


    optimizer.zero_grad()

    losses.backward()
    
    print(model.fuse_convs[0].weight.grad[0][0])
    print(model.fuse_convs[0].weight.grad[0][0][0][0].item())

    # the gradient is not the same.

    optimizer.step()

    if i >= 1:
        raise StopIteration()

#18.0247
#18.0230

tensor(18.5139, device='cuda:0', grad_fn=<AddBackward0>)
18.51391808342778
tensor([[ 0.0113,  0.0396, -0.0356],
        [ 0.0101, -0.0355, -0.0446],
        [-0.0054,  0.0215,  0.0006]], device='cuda:0')
0.011276589760136277
tensor([[-0.0270, -0.1334, -0.1873],
        [-0.1515, -0.1765, -0.1383],
        [-0.0960, -0.0688, -0.0258]], device='cuda:0')
-0.02701888356451491
tensor(16.4996, device='cuda:0', grad_fn=<AddBackward0>)
16.499612341058622
tensor([[ 0.0115,  0.0410, -0.0337],
        [ 0.0116, -0.0337, -0.0432],
        [-0.0045,  0.0222,  0.0009]], device='cuda:0')
0.011546778595781427
tensor([[-0.0706, -0.0488, -0.1707],
        [-0.3516, -0.0866,  0.0581],
        [-0.0635, -0.0512,  0.0034]], device='cuda:0')
-0.07057392702294056


StopIteration: 

In [None]:
# print(model.fuse_convs[0].weight.data[0][0])
# print(model.fuse_convs[0].weight.data[0][0][0][0].item())


tensor([[ 0.0155, -0.0358, -0.0260],
        [ 0.0014,  0.0559, -0.0069],
        [ 0.0526, -0.0463,  0.0237]], device='cuda:0')
0.01546361856162548


In [None]:
losses.backward()

NameError: name 'losses' is not defined

In [None]:
model.fuse_convs[0].weight.grad[0][0][0][0].item()

0.24338267743587494

In [None]:
raise StopIteration()

StopIteration: 

In [None]:
# see if the weight are updated to the same.
model.fuse_convs[0].weight.data[0][0]

tensor([[-0.0365, -0.0433, -0.0091],
        [-0.0491,  0.0281, -0.0107],
        [ 0.0365, -0.0679, -0.0029]], device='cuda:0')

In [None]:
# if it's different, it caused by the optimization process.

# tensor([[ 0.0134, -0.0374, -0.0271],
#         [-0.0006,  0.0555, -0.0072],
#         [ 0.0528, -0.0461,  0.0255]], device='cuda:0')


# tensor([[ 0.0134, -0.0374, -0.0271],
#         [-0.0006,  0.0555, -0.0072],
#         [ 0.0528, -0.0461,  0.0255]], device='cuda:0')


## updated differently using train_one_epoch

# tensor([[-0.0365, -0.0433, -0.0091],
#         [-0.0491,  0.0281, -0.0107],
#         [ 0.0365, -0.0679, -0.0029]], device='cuda:0')




In [None]:
train_ar, train_ap = get_ar_ap(train_evaluator)
print(train_ar, train_ap)

0.5078463399602832 0.1031347743811897


In [None]:
get_data_from_metric_logger(train_loger)

{'lr': 0.0010000000474974513,
 'loss': 3.6032652854919434,
 'loss_classifier': 3.570329189300537,
 'loss_box_reg': 0.03292705863714218,
 'loss_objectness': 6.931795269338181e-06,
 'loss_rpn_box_reg': 2.338313151994953e-06}

In [None]:
 
## it's inside xami_train_one_epoch

# 0.5061725923301151, 0.09079530820259449

# {'lr': 0.0010000000474974513,
#  'loss': 3.5543720722198486,
#  'loss_classifier': 3.5296542644500732,
#  'loss_box_reg': 0.02470848336815834,
#  'loss_objectness': 6.9316301960498095e-06,
#  'loss_rpn_box_reg': 2.338271542612347e-06}

In [None]:
# then we move it out to test which part is wrong.

