In [1]:
import torch
import mlflow
import gc

from visualize_image import visualize_one_xy_slice_in_3d_image
from loading_dataset import load_data
from model import load_model
import numpy as np
from monai.data import box_utils
from monai.apps.detection.metrics.matching import matching_batch
from warmup_scheduler import GradualWarmupScheduler
from monai.apps.detection.metrics.coco import COCOMetric

In [2]:
import os
from getpass import getpass

# MLflow picks up its tracking credentials and the experiment name from these
# environment variables. Credentials must not be stored in the notebook, so
# they are taken from the environment when already set and requested
# interactively otherwise (the password prompt does not echo the input).
if not os.environ.get('MLFLOW_TRACKING_USERNAME'):
    os.environ['MLFLOW_TRACKING_USERNAME'] = input('MLflow tracking username: ')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('MLflow tracking password: ')

os.environ['MLFLOW_EXPERIMENT_NAME'] = 'MSD'

# Free-text description attached to the MLflow run started in the training cell.
description = "MSD test to verify if there is anything wrong with HC and MSD"

In [3]:
# ---- Data / loader settings (passed to load_data) --------------------------
# NOTE(review): the recorded output of the training cell warns
# "Num foregrounds 0 ... unable to generate class balanced samples" on every
# step of the first epoch and all AP/AR values stay at 0.0 -- worth confirming
# that this box mode matches the boxes stored in the data list.
gt_box_mode = 'cccwhd'  # the other mode tried here: 'xyzxyz'
batch_size = 8
patch_size = [96, 96, 40]

# Active dataset: MSD. Other locations previously used in this notebook:
#   data lists: /data/output/hc_train_val3_resampled.json
#               /data/output/LUNA16_datasplit/dataset_fold0.json
#               /data/output/LUNA16-mini.json
#   image dirs: /data/HC_Images_resample/
#               /data/LUNA16_Images_resample/
data_list_file_path = '/data/output/msd_train_val3.json'
data_base_dir = '/data/MSD_Images_resample/'

# Mixed precision switch; also selects compute_dtype below.
amp = True

# ---- Detector settings (passed to load_model) ------------------------------
returned_layers = [1, 2]
base_anchor_shapes = [[6, 8, 4], [8, 6, 5], [10, 10, 6]]
conv1_t_stride = [2, 2, 1]
n_input_channels = 1
spatial_dims = 3
fg_labels = [0]
verbose = False
balanced_sampler_pos_fraction = 0.3
score_thresh = 0.02
nms_thresh = 0.22
val_patch_size = [256, 256, 104]

# ---- Optimisation / schedule ------------------------------------------------
lr = 1e-2
max_epochs = 20
w_cls = 1.0  # weight applied to the classification loss in the training cell

# ---- Validation / model selection ------------------------------------------
val_interval = 1  # validate every N epochs (5 was also tried)
coco_metric = COCOMetric(classes=["nodule"], iou_list=[0.1], max_detection=[100])
best_val_epoch_metric = 0.0
best_val_epoch = -1

# float16 when mixed precision is enabled, float32 otherwise.
compute_dtype = torch.float16 if amp else torch.float32

torch.backends.cudnn.benchmark = True

### Load the dataset

In [4]:
# Build the training and validation loaders from the configured data list.
# `len_train_ds` is what the training cell divides by the loader batch size
# to get the number of steps per epoch.
train_loader, val_loader, len_train_ds = load_data(
    gt_box_mode,
    patch_size,
    batch_size,
    amp,
    data_list_file_path,
    data_base_dir,
)

monai.transforms.io.dictionary LoadImaged.__init__:image_only: Current default value of argument `image_only=False` has been deprecated since version 1.1. It will be changed to `image_only=True` in version 1.3.


### Load the model

In [5]:
# Build the detector from the configuration cell and get the device that the
# training and validation tensors are moved to.
detector, device = load_model(
    returned_layers,
    base_anchor_shapes,
    conv1_t_stride,
    n_input_channels,
    spatial_dims,
    fg_labels,
    verbose,
    balanced_sampler_pos_fraction,
    score_thresh,
    nms_thresh,
    val_patch_size,
)

### Initialize training

In [6]:
# SGD with Nesterov momentum over the detector network's parameters.
optimizer = torch.optim.SGD(
    detector.network.parameters(), lr, momentum=0.9, weight_decay=3e-5, nesterov=True
)

# Learning-rate schedule: a 10-epoch gradual warm-up that hands over to a
# step decay (x0.1 every 150 epochs) once the warm-up has finished.
after_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.1)
scheduler_warmup = GradualWarmupScheduler(
    optimizer, multiplier=1, total_epoch=10, after_scheduler=after_scheduler
)

# A gradient scaler is only created when mixed precision is enabled.
scaler = None
if amp:
    scaler = torch.cuda.amp.GradScaler()

# One no-op optimizer step issued after the schedulers exist and before the
# training cell calls scheduler_warmup.step() -- presumably to avoid PyTorch's
# "lr_scheduler.step() before optimizer.step()" warning; TODO confirm.
optimizer.zero_grad()
optimizer.step()

### Train

In [7]:
# Train the detector for `max_epochs` epochs inside a single MLflow run.
# Every epoch logs the per-step and average training losses plus the current
# learning rate and stores the latest network as the "model" artifact. Every
# `val_interval` epochs the validation set is evaluated with the COCO metric,
# one validation slice is logged as an image, and the network is stored as
# "best_model" whenever the averaged validation metric improves.

# Number of optimisation steps per epoch; used for progress printing and as
# the stride of the global MLflow step index.
epoch_len = len_train_ds // train_loader.batch_size

# Model-selection state is reset here so that re-running this cell (which
# opens a new MLflow run) does not compare against the best score of an
# earlier run and thereby never logs "best_model" for the new run.
best_val_epoch_metric = 0.0
best_val_epoch = -1

# Number of voxels in one validation patch; images with fewer voxels than this
# are passed to the detector directly, without the inferer.
val_patch_numel = np.prod(val_patch_size)

with mlflow.start_run(description=description) as run:

    # Record the run configuration (logged in this order).
    run_params = {
        "gt_box_mode": gt_box_mode,
        "batch_size": batch_size,
        "patch_size": patch_size,
        "data_list_file_path": data_list_file_path,
        "data_base_dir": data_base_dir,
        "amp": amp,
        "n_input_channels": n_input_channels,
        "spatial_dims": spatial_dims,
        "balanced_sampler_pos_fraction": balanced_sampler_pos_fraction,
        "score_thresh": score_thresh,
        "nms_thresh": nms_thresh,
        "initial_lr": lr,
        "val_interval": val_interval,
        "max_epochs": max_epochs,
        "w_cls": w_cls,
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    for epoch in range(max_epochs):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{max_epochs}")
        detector.train()
        epoch_loss = 0
        epoch_cls_loss = 0
        epoch_box_reg_loss = 0
        step = 0
        # The learning-rate schedule advances once per epoch, before training.
        scheduler_warmup.step()

        for batch_data in train_loader:
            step += 1
            # Each loader item is itself a list of samples; flatten both levels
            # into one list of images and one list of target dicts.
            inputs = [
                batch_data_ii["image"].to(device) for batch_data_i in batch_data for batch_data_ii in batch_data_i
            ]
            targets = [
                dict(
                    label=batch_data_ii["label"].to(device),
                    box=batch_data_ii["box"].to(device)
                )
                for batch_data_i in batch_data
                for batch_data_ii in batch_data_i
            ]

            # Clear gradients by dropping them instead of zero-filling.
            for param in detector.network.parameters():
                param.grad = None

            if amp and (scaler is not None):
                with torch.cuda.amp.autocast():
                    outputs = detector(inputs, targets)
                    loss = w_cls * outputs[detector.cls_key] + outputs[detector.box_reg_key]
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = detector(inputs, targets)
                loss = w_cls * outputs[detector.cls_key] + outputs[detector.box_reg_key]
                loss.backward()
                optimizer.step()

            # saving into mlflow
            # .item() synchronises with the device, so read each value once.
            loss_value = loss.detach().item()
            epoch_loss += loss_value
            epoch_cls_loss += outputs[detector.cls_key].detach().item()
            epoch_box_reg_loss += outputs[detector.box_reg_key].detach().item()
            print(f"{step}/{epoch_len}, train_loss: {loss_value:.4f}")
            mlflow.log_metric("train_loss", loss_value, epoch_len * epoch + step)

        # An empty loader would otherwise surface as a NameError on `del` or a
        # ZeroDivisionError on the averages below.
        if step == 0:
            raise RuntimeError("train_loader yielded no batches; nothing to train on.")

        # Release the last training batch before validation.
        del inputs, batch_data
        torch.cuda.empty_cache()
        gc.collect()

        epoch_loss /= step
        epoch_cls_loss /= step
        epoch_box_reg_loss /= step
        print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
        mlflow.log_metric("avg_train_loss", epoch_loss, epoch + 1)
        mlflow.log_metric("avg_train_cls_loss", epoch_cls_loss, epoch + 1)
        mlflow.log_metric("avg_train_box_reg_loss", epoch_box_reg_loss, epoch + 1)
        mlflow.log_metric("train_lr", optimizer.param_groups[0]["lr"], epoch + 1)

        # saving last trained model
        mlflow.pytorch.log_model(detector.network, "model")

        # validation for model selection
        if (epoch + 1) % val_interval == 0:
            detector.eval()
            val_outputs_all = []
            val_targets_all = []

            with torch.no_grad():
                for val_data in val_loader:
                    # if all val_data_i["image"] smaller than val_patch_size, no need to use inferer
                    # otherwise, need inferer to handle large input images.
                    use_inferer = not all(
                        val_data_i["image"][0, ...].numel() < val_patch_numel for val_data_i in val_data
                    )
                    # The images are popped so that only the targets remain in
                    # the dicts collected in val_targets_all.
                    val_inputs = [val_data_i.pop("image").to(device) for val_data_i in val_data]

                    # autocast(enabled=False) is a no-op, so one call covers
                    # both the mixed-precision and the full-precision case.
                    with torch.cuda.amp.autocast(enabled=amp):
                        val_outputs = detector(val_inputs, use_inferer=use_inferer)

                    # save outputs for evaluation
                    val_outputs_all += val_outputs
                    val_targets_all += val_data

            if not val_outputs_all:
                raise RuntimeError("val_loader yielded no batches; nothing to validate on.")

            # visualize an inference image and boxes
            # (first sample of the last validation batch seen in the loop above)
            print(val_data[0]["image_meta_dict"]["filename_or_obj"])
            draw_img = visualize_one_xy_slice_in_3d_image(
                gt_boxes=val_data[0]["box"].cpu().detach().numpy(),
                image=val_inputs[0][0, ...].cpu().detach().numpy(),
                pred_boxes=val_outputs[0][detector.target_box_key].cpu().detach().numpy(),
            )
            mlflow.log_image(draw_img, str(epoch + 1) + "_val_img_xy.png")

            # compute metrics
            del val_inputs
            torch.cuda.empty_cache()
            results_metric = matching_batch(
                iou_fn=box_utils.box_iou,
                iou_thresholds=coco_metric.iou_thresholds,
                pred_boxes=[
                    val_data_i[detector.target_box_key].cpu().detach().numpy() for val_data_i in val_outputs_all
                ],
                pred_classes=[
                    val_data_i[detector.target_label_key].cpu().detach().numpy() for val_data_i in val_outputs_all
                ],
                pred_scores=[
                    val_data_i[detector.pred_score_key].cpu().detach().numpy() for val_data_i in val_outputs_all
                ],
                gt_boxes=[val_data_i[detector.target_box_key].cpu().detach().numpy() for val_data_i in val_targets_all],
                gt_classes=[
                    val_data_i[detector.target_label_key].cpu().detach().numpy() for val_data_i in val_targets_all
                ]
            )
            val_epoch_metric_dict = coco_metric(results_metric)[0]
            print(val_epoch_metric_dict)

            # write metrics
            for k in val_epoch_metric_dict.keys():
                mlflow.log_metric("val_" + k, val_epoch_metric_dict[k], epoch + 1)
            # The model-selection score is the plain mean of all reported metrics.
            val_epoch_metric = val_epoch_metric_dict.values()
            val_epoch_metric = sum(val_epoch_metric) / len(val_epoch_metric)
            mlflow.log_metric("val_metric", val_epoch_metric, epoch + 1)

            # save best trained model
            # (strict ">" against the 0.0 start value: a run whose score never
            # rises above zero logs no "best_model")
            if val_epoch_metric > best_val_epoch_metric:
                best_val_epoch_metric = val_epoch_metric
                best_val_epoch = epoch + 1
                mlflow.pytorch.log_model(detector.network, "best_model")
            print(
                "current epoch: {} current metric: {:.4f} "
                "best metric: {:.4f} at epoch {}".format(
                    epoch + 1, val_epoch_metric, best_val_epoch_metric, best_val_epoch
                )
            )

    print(f"train completed, best_metric: {best_val_epoch_metric:.4f} " f"at epoch: {best_val_epoch}")

----------
epoch 1/20


Num foregrounds 0, Num backgrounds 85181751, unable to generate class balanced samples, setting `pos_ratio` to 0.


1/38, train_loss: 0.0419


Num foregrounds 0, Num backgrounds 114827436, unable to generate class balanced samples, setting `pos_ratio` to 0.


2/38, train_loss: 0.0391


Num foregrounds 0, Num backgrounds 72644120, unable to generate class balanced samples, setting `pos_ratio` to 0.


3/38, train_loss: 0.0285


Num foregrounds 0, Num backgrounds 133490588, unable to generate class balanced samples, setting `pos_ratio` to 0.


4/38, train_loss: 0.0213


Num foregrounds 0, Num backgrounds 103063388, unable to generate class balanced samples, setting `pos_ratio` to 0.


5/38, train_loss: 0.0174


Num foregrounds 0, Num backgrounds 101898488, unable to generate class balanced samples, setting `pos_ratio` to 0.


6/38, train_loss: 0.0145


Num foregrounds 0, Num backgrounds 92309931, unable to generate class balanced samples, setting `pos_ratio` to 0.


7/38, train_loss: 0.0140


Num foregrounds 0, Num backgrounds 126176220, unable to generate class balanced samples, setting `pos_ratio` to 0.


8/38, train_loss: 0.0128


Num foregrounds 0, Num backgrounds 114582250, unable to generate class balanced samples, setting `pos_ratio` to 0.


9/38, train_loss: 0.0118


Num foregrounds 0, Num backgrounds 98360876, unable to generate class balanced samples, setting `pos_ratio` to 0.


10/38, train_loss: 0.0111


Num foregrounds 0, Num backgrounds 107279109, unable to generate class balanced samples, setting `pos_ratio` to 0.


11/38, train_loss: 0.0106


Num foregrounds 0, Num backgrounds 116505364, unable to generate class balanced samples, setting `pos_ratio` to 0.


12/38, train_loss: 0.0101


Num foregrounds 0, Num backgrounds 94050240, unable to generate class balanced samples, setting `pos_ratio` to 0.


13/38, train_loss: 0.0098


Num foregrounds 0, Num backgrounds 79127289, unable to generate class balanced samples, setting `pos_ratio` to 0.


14/38, train_loss: 0.0096


Num foregrounds 0, Num backgrounds 64749568, unable to generate class balanced samples, setting `pos_ratio` to 0.


15/38, train_loss: 0.0094


Num foregrounds 0, Num backgrounds 73493747, unable to generate class balanced samples, setting `pos_ratio` to 0.


16/38, train_loss: 0.0092


Num foregrounds 0, Num backgrounds 61253036, unable to generate class balanced samples, setting `pos_ratio` to 0.


17/38, train_loss: 0.0089


Num foregrounds 0, Num backgrounds 82559055, unable to generate class balanced samples, setting `pos_ratio` to 0.


18/38, train_loss: 0.0090


Num foregrounds 0, Num backgrounds 90725442, unable to generate class balanced samples, setting `pos_ratio` to 0.


19/38, train_loss: 0.0087


Num foregrounds 0, Num backgrounds 73169986, unable to generate class balanced samples, setting `pos_ratio` to 0.


20/38, train_loss: 0.0086


Num foregrounds 0, Num backgrounds 166536656, unable to generate class balanced samples, setting `pos_ratio` to 0.


21/38, train_loss: 0.0081


Num foregrounds 0, Num backgrounds 90936098, unable to generate class balanced samples, setting `pos_ratio` to 0.


22/38, train_loss: 0.0081


Num foregrounds 0, Num backgrounds 110117156, unable to generate class balanced samples, setting `pos_ratio` to 0.


23/38, train_loss: 0.0085


Num foregrounds 0, Num backgrounds 106982148, unable to generate class balanced samples, setting `pos_ratio` to 0.


24/38, train_loss: 0.0080


Num foregrounds 0, Num backgrounds 76283904, unable to generate class balanced samples, setting `pos_ratio` to 0.


25/38, train_loss: 0.0081


Num foregrounds 0, Num backgrounds 71359455, unable to generate class balanced samples, setting `pos_ratio` to 0.


26/38, train_loss: 0.0079


Num foregrounds 0, Num backgrounds 94804794, unable to generate class balanced samples, setting `pos_ratio` to 0.


27/38, train_loss: 0.0078


Num foregrounds 0, Num backgrounds 109561000, unable to generate class balanced samples, setting `pos_ratio` to 0.


28/38, train_loss: 0.0078


Num foregrounds 0, Num backgrounds 81264011, unable to generate class balanced samples, setting `pos_ratio` to 0.


29/38, train_loss: 0.0078


Num foregrounds 0, Num backgrounds 96381824, unable to generate class balanced samples, setting `pos_ratio` to 0.


30/38, train_loss: 0.0076


Num foregrounds 0, Num backgrounds 75497472, unable to generate class balanced samples, setting `pos_ratio` to 0.


31/38, train_loss: 0.0073


Num foregrounds 0, Num backgrounds 143567964, unable to generate class balanced samples, setting `pos_ratio` to 0.


32/38, train_loss: 0.0071


Num foregrounds 0, Num backgrounds 99992475, unable to generate class balanced samples, setting `pos_ratio` to 0.


33/38, train_loss: 0.0071


Num foregrounds 0, Num backgrounds 81225000, unable to generate class balanced samples, setting `pos_ratio` to 0.


34/38, train_loss: 0.0073


Num foregrounds 0, Num backgrounds 101966700, unable to generate class balanced samples, setting `pos_ratio` to 0.


35/38, train_loss: 0.0077


Num foregrounds 0, Num backgrounds 70738161, unable to generate class balanced samples, setting `pos_ratio` to 0.


36/38, train_loss: 0.0077


Num foregrounds 0, Num backgrounds 80616489, unable to generate class balanced samples, setting `pos_ratio` to 0.


37/38, train_loss: 0.0075


Num foregrounds 0, Num backgrounds 100042149, unable to generate class balanced samples, setting `pos_ratio` to 0.


38/38, train_loss: 0.0072
epoch 1 average loss: 0.0117




/data/MSD_Images_resample/lung_036/lung_036.nii.gz
voxel coordinate of expected: [134, 272, -101, 207, 327, -88]
{'mAP_IoU_0.10_0.50_0.05_MaxDet_100': 0.0, 'nodule_mAP_IoU_0.10_0.50_0.05_MaxDet_100': 0.0, 'AP_IoU_0.10_MaxDet_100': 0.0, 'nodule_AP_IoU_0.10_MaxDet_100': 0.0, 'mAR_IoU_0.10_0.50_0.05_MaxDet_100': 0.0, 'nodule_mAR_IoU_0.10_0.50_0.05_MaxDet_100': 0.0, 'AR_IoU_0.10_MaxDet_100': 0.0, 'nodule_AR_IoU_0.10_MaxDet_100': 0.0}
current epoch: 1 current metric: 0.0000 best metric: 0.0000 at epoch -1
----------
epoch 2/20
1/38, train_loss: 0.0072
2/38, train_loss: 0.0070
3/38, train_loss: 0.0069
4/38, train_loss: 0.0069
5/38, train_loss: 0.0070
6/38, train_loss: 0.0067
7/38, train_loss: 0.0066
8/38, train_loss: 0.0071
9/38, train_loss: 0.0065
10/38, train_loss: 0.0067
11/38, train_loss: 0.0067
12/38, train_loss: 0.0064
13/38, train_loss: 0.0066
14/38, train_loss: 0.0061
15/38, train_loss: 0.0063
16/38, train_loss: 0.0063
17/38, train_loss: 0.0063
18/38, train_loss: 0.0066
19/38, train_



/data/MSD_Images_resample/lung_036/lung_036.nii.gz
voxel coordinate of expected: [134, 272, -101, 207, 327, -88]
{'mAP_IoU_0.10_0.50_0.05_MaxDet_100': 0.0, 'nodule_mAP_IoU_0.10_0.50_0.05_MaxDet_100': 0.0, 'AP_IoU_0.10_MaxDet_100': 0.0, 'nodule_AP_IoU_0.10_MaxDet_100': 0.0, 'mAR_IoU_0.10_0.50_0.05_MaxDet_100': 0.0, 'nodule_mAR_IoU_0.10_0.50_0.05_MaxDet_100': 0.0, 'AR_IoU_0.10_MaxDet_100': 0.0, 'nodule_AR_IoU_0.10_MaxDet_100': 0.0}
current epoch: 2 current metric: 0.0000 best metric: 0.0000 at epoch -1
----------
epoch 3/20
1/38, train_loss: 0.0052
2/38, train_loss: 0.0052
3/38, train_loss: 0.0044
4/38, train_loss: 0.0051
5/38, train_loss: 0.0045
6/38, train_loss: 0.0045
7/38, train_loss: 0.0048
8/38, train_loss: 0.0046
9/38, train_loss: 0.0044
10/38, train_loss: 0.0042
11/38, train_loss: 0.0043
12/38, train_loss: 0.0042
13/38, train_loss: 0.0040
14/38, train_loss: 0.0044
15/38, train_loss: 0.0039
16/38, train_loss: 0.0040
17/38, train_loss: 0.0040
18/38, train_loss: 0.0041
19/38, train_

KeyboardInterrupt: 