In [1]:
import torch
import mlflow
import gc

from visualize_image import visualize_one_xy_slice_in_3d_image
from loading_dataset import load_data
from model import load_model
import numpy as np
from monai.data import box_utils
from monai.apps.detection.metrics.matching import matching_batch
from warmup_scheduler import GradualWarmupScheduler
from monai.apps.detection.metrics.coco import COCOMetric

In [2]:
import getpass
import os

# MLflow reads its tracking credentials and the experiment name from these
# environment variables.
# The password must never be stored in the notebook: reuse the value already
# present in the environment, otherwise ask for it interactively (the input is
# not echoed and does not end up in the saved notebook).
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'Eddy')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass.getpass('MLflow tracking password: ')
os.environ['MLFLOW_EXPERIMENT_NAME'] = 'HC'

In [3]:
# ---------------------------------------------------------------------------
# Run configuration. Every value a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Arguments forwarded to load_data(...).
# Alternative dataset paths (currently disabled):
#   data_list_file_path = '/data/output/hc_train_val_resampled.json'
#   data_base_dir = '/data/HC_Images_resample/'
data_list_file_path = '/data/output/LUNA16_datasplit/dataset_fold0.json'
data_base_dir = '/data/LUNA16_Images_resample/'
gt_box_mode = 'cccwhd'
patch_size = [96, 96, 40]
batch_size = 4
amp = True

# Arguments forwarded to load_model(...).
returned_layers = [1, 2]
base_anchor_shapes = [[6, 8, 4], [8, 6, 5], [10, 10, 6]]
conv1_t_stride = [2, 2, 1]
n_input_channels = 1
spatial_dims = 3
fg_labels = [0]
verbose = False
balanced_sampler_pos_fraction = 0.3
score_thresh = 0.02
nms_thresh = 0.22
val_patch_size = [256, 256, 104]

# Optimisation schedule and loss weighting used by the training cell.
lr = 1e-2
max_epochs = 20
val_interval = 5  # validation runs every `val_interval` epochs
w_cls = 1.0       # weight applied to the classification loss term

# Validation metric and the running "best so far" bookkeeping.
coco_metric = COCOMetric(classes=["nodule"], iou_list=[0.1], max_detection=[100])
best_val_epoch_metric = 0.0
best_val_epoch = -1

# Half precision when automatic mixed precision is enabled, full precision otherwise.
compute_dtype = torch.float16 if amp else torch.float32

# monai.config.print_config()
torch.backends.cudnn.benchmark = True

### Load the dataset

In [4]:
# Build the training and validation loaders from the configured data list;
# the third returned value is kept as `len_train_ds`.
(train_loader, val_loader, len_train_ds) = load_data(
    gt_box_mode,
    patch_size,
    batch_size,
    amp,
    data_list_file_path,
    data_base_dir,
)

monai.transforms.io.dictionary LoadImaged.__init__:image_only: Current default value of argument `image_only=False` has been deprecated since version 1.1. It will be changed to `image_only=True` in version 1.3.


### Load the model

In [5]:
# Instantiate the detector from the configuration values and get back the
# device that inputs are later moved to.
(detector, device) = load_model(
    returned_layers,
    base_anchor_shapes,
    conv1_t_stride,
    n_input_channels,
    spatial_dims,
    fg_labels,
    verbose,
    balanced_sampler_pos_fraction,
    score_thresh,
    nms_thresh,
    val_patch_size,
)

### Initialize training

In [6]:
# SGD with Nesterov momentum over the detector network's parameters.
optimizer = torch.optim.SGD(
    detector.network.parameters(), lr=lr, momentum=0.9, weight_decay=3e-5, nesterov=True
)

# Learning-rate schedule: a gradual warm-up over the first 10 epochs that then
# hands over to a step decay (x0.1 every 150 scheduler steps).
after_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.1)
scheduler_warmup = GradualWarmupScheduler(
    optimizer,
    multiplier=1,
    total_epoch=10,
    after_scheduler=after_scheduler,
)

# A gradient scaler is only created when mixed precision is enabled.
if amp:
    scaler = torch.cuda.amp.GradScaler()
else:
    scaler = None

# One optimiser step with cleared gradients before training starts.
# NOTE(review): presumably here so that the scheduler, which the training cell
# steps at the start of every epoch, is never stepped before the optimiser --
# confirm before removing.
optimizer.zero_grad()
optimizer.step()

### Train

In [None]:
# Training cell: runs `max_epochs` epochs inside one MLflow run, logging the
# per-step and per-epoch losses, saving the latest network after every epoch,
# and validating every `val_interval` epochs to keep track of the best network.

# Nominal number of steps per epoch; only used for the "step/epoch_len" printout.
epoch_len = len_train_ds // train_loader.batch_size

# Reset the model-selection state here so that re-running this cell starts a
# clean comparison instead of reusing the best score left by a previous run
# (which could prevent "best_model" from ever being logged in the new run).
best_val_epoch_metric = 0.0
best_val_epoch = -1


def _to_numpy(tensor):
    """Move a tensor to the CPU, detach it and return it as a NumPy array."""
    return tensor.cpu().detach().numpy()


with mlflow.start_run() as run:

    # Record the run configuration in a single batched call.
    mlflow.log_params(
        {
            "gt_box_mode": gt_box_mode,
            "batch_size": batch_size,
            "patch_size": patch_size,
            "data_list_file_path": data_list_file_path,
            "data_base_dir": data_base_dir,
            "amp": amp,
            "n_input_channels": n_input_channels,
            "spatial_dims": spatial_dims,
            "balanced_sampler_pos_fraction": balanced_sampler_pos_fraction,
            "score_thresh": score_thresh,
            "nms_thresh": nms_thresh,
            "initial_lr": lr,
            "val_interval": val_interval,
            "max_epochs": max_epochs,
            "w_cls": w_cls,
        }
    )

    # Running step counter used as the x-axis of the per-step training loss.
    # A counter stays strictly increasing even if the loader yields a different
    # number of batches than `epoch_len`, whereas `epoch_len * epoch + step`
    # would repeat step indices across epochs in that case.
    global_step = 0

    for epoch in range(max_epochs):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{max_epochs}")
        detector.train()
        epoch_loss = 0
        epoch_cls_loss = 0
        epoch_box_reg_loss = 0
        step = 0
        # The learning-rate schedule advances once per epoch, before any batch is seen.
        scheduler_warmup.step()

        for batch_data in train_loader:
            step += 1
            global_step += 1

            # Flatten the nested batch structure into one list of images and
            # one list of {label, box} targets, all moved to the model device.
            inputs = [
                batch_data_ii["image"].to(device) for batch_data_i in batch_data for batch_data_ii in batch_data_i
            ]
            targets = [
                dict(
                    label=batch_data_ii["label"].to(device),
                    box=batch_data_ii["box"].to(device)
                )
                for batch_data_i in batch_data
                for batch_data_ii in batch_data_i
            ]

            # Clear the gradients by dropping them.
            for param in detector.network.parameters():
                param.grad = None

            if amp and (scaler is not None):
                # Mixed precision: forward under autocast, scaled backward/step.
                with torch.cuda.amp.autocast():
                    outputs = detector(inputs, targets)
                    loss = w_cls * outputs[detector.cls_key] + outputs[detector.box_reg_key]
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = detector(inputs, targets)
                loss = w_cls * outputs[detector.cls_key] + outputs[detector.box_reg_key]
                loss.backward()
                optimizer.step()

            # Read the total loss back from the device once and reuse the
            # Python float for the accumulator, the printout and MLflow.
            loss_value = loss.detach().item()
            epoch_loss += loss_value
            epoch_cls_loss += outputs[detector.cls_key].detach().item()
            epoch_box_reg_loss += outputs[detector.box_reg_key].detach().item()
            print(f"{step}/{epoch_len}, train_loss: {loss_value:.4f}")
            mlflow.log_metric("train_loss", loss_value, global_step)

        # Without this guard an empty loader surfaces as a NameError on the
        # `del` below (or a division by zero) instead of a clear message.
        if step == 0:
            raise RuntimeError("train_loader yielded no batches; cannot compute epoch averages")

        # Drop the references to the last batch and release cached GPU memory.
        del inputs, batch_data
        torch.cuda.empty_cache()
        gc.collect()

        epoch_loss /= step
        epoch_cls_loss /= step
        epoch_box_reg_loss /= step
        print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
        mlflow.log_metric("avg_train_loss", epoch_loss, epoch + 1)
        mlflow.log_metric("avg_train_cls_loss", epoch_cls_loss, epoch + 1)
        mlflow.log_metric("avg_train_box_reg_loss", epoch_box_reg_loss, epoch + 1)
        mlflow.log_metric("train_lr", optimizer.param_groups[0]["lr"], epoch + 1)

        # saving last trained model
        mlflow.pytorch.log_model(detector.network, "model")

        # validation for model selection
        if (epoch + 1) % val_interval == 0:
            detector.eval()
            val_outputs_all = []
            val_targets_all = []
            val_patch_numel = np.prod(val_patch_size)  # loop-invariant

            with torch.no_grad():
                for val_data in val_loader:
                    # if all val_data_i["image"] smaller than val_patch_size, no need to use inferer
                    # otherwise, need inferer to handle large input images.
                    use_inferer = not all(
                        [val_data_i["image"][0, ...].numel() < val_patch_numel for val_data_i in val_data]
                    )
                    # `pop` removes the image from each sample, so the dicts kept
                    # in `val_targets_all` below no longer carry it.
                    val_inputs = [val_data_i.pop("image").to(device) for val_data_i in val_data]

                    if amp:
                        with torch.cuda.amp.autocast():
                            val_outputs = detector(val_inputs, use_inferer=use_inferer)
                    else:
                        val_outputs = detector(val_inputs, use_inferer=use_inferer)

                    # save outputs for evaluation
                    val_outputs_all += val_outputs
                    val_targets_all += val_data

            # The visualisation and the metrics below need at least one batch.
            if not val_targets_all:
                raise RuntimeError("val_loader yielded no batches; cannot run validation")

            # visualize an inference image and boxes
            # (first sample of the last validation batch seen by the loop above)
            draw_img = visualize_one_xy_slice_in_3d_image(
                gt_boxes=_to_numpy(val_data[0]["box"]),
                image=_to_numpy(val_inputs[0][0, ...]),
                pred_boxes=_to_numpy(val_outputs[0][detector.target_box_key]),
            )
            # mlflow.log_image(draw_img.transpose([2, 1, 0]), "val_img_xy.png")
            mlflow.log_image(draw_img, "val_img_xy.png")

            # compute metrics
            del val_inputs
            torch.cuda.empty_cache()
            results_metric = matching_batch(
                iou_fn=box_utils.box_iou,
                iou_thresholds=coco_metric.iou_thresholds,
                pred_boxes=[_to_numpy(val_out_i[detector.target_box_key]) for val_out_i in val_outputs_all],
                pred_classes=[_to_numpy(val_out_i[detector.target_label_key]) for val_out_i in val_outputs_all],
                pred_scores=[_to_numpy(val_out_i[detector.pred_score_key]) for val_out_i in val_outputs_all],
                gt_boxes=[_to_numpy(val_tgt_i[detector.target_box_key]) for val_tgt_i in val_targets_all],
                gt_classes=[_to_numpy(val_tgt_i[detector.target_label_key]) for val_tgt_i in val_targets_all],
            )
            val_epoch_metric_dict = coco_metric(results_metric)[0]
            print(val_epoch_metric_dict)

            # write metrics: every individual entry, plus their plain average
            # which is the score used for model selection.
            for k in val_epoch_metric_dict.keys():
                mlflow.log_metric("val_" + k, val_epoch_metric_dict[k], epoch + 1)
            val_epoch_metric = val_epoch_metric_dict.values()
            val_epoch_metric = sum(val_epoch_metric) / len(val_epoch_metric)
            mlflow.log_metric("val_metric", val_epoch_metric, epoch + 1)

            # save best trained model
            if val_epoch_metric > best_val_epoch_metric:
                best_val_epoch_metric = val_epoch_metric
                best_val_epoch = epoch + 1
                mlflow.pytorch.log_model(detector.network, "best_model")
            print(
                "current epoch: {} current metric: {:.4f} "
                "best metric: {:.4f} at epoch {}".format(
                    epoch + 1, val_epoch_metric, best_val_epoch_metric, best_val_epoch
                )
            )

    print(f"train completed, best_metric: {best_val_epoch_metric:.4f} " f"at epoch: {best_val_epoch}")

----------
epoch 1/20
1/507, train_loss: 1.2227
2/507, train_loss: 1.7676
3/507, train_loss: 1.1689
4/507, train_loss: 0.7012
5/507, train_loss: 1.3906
6/507, train_loss: 1.1426
7/507, train_loss: 1.4268
8/507, train_loss: 1.0781
9/507, train_loss: 1.2617
10/507, train_loss: 0.3298
11/507, train_loss: 1.8291
12/507, train_loss: 0.8418
13/507, train_loss: 1.2256
14/507, train_loss: 1.0635
15/507, train_loss: 1.1865
16/507, train_loss: 0.7432
17/507, train_loss: 0.8750
18/507, train_loss: 0.8848
19/507, train_loss: 1.3926
20/507, train_loss: 0.9287
21/507, train_loss: 1.1523
22/507, train_loss: 0.9502
23/507, train_loss: 1.1104
24/507, train_loss: 1.2412
25/507, train_loss: 1.4229
26/507, train_loss: 0.8657
27/507, train_loss: 0.9653
28/507, train_loss: 0.9199
29/507, train_loss: 0.7383
30/507, train_loss: 0.9468
31/507, train_loss: 0.8711
32/507, train_loss: 0.7817
33/507, train_loss: 0.9922
34/507, train_loss: 0.7461
35/507, train_loss: 0.8506
36/507, train_loss: 0.7939
37/507, train_l



----------
epoch 2/20
1/507, train_loss: 0.5347
2/507, train_loss: 0.4849
3/507, train_loss: 0.4937
4/507, train_loss: 0.6538
5/507, train_loss: 0.6011
6/507, train_loss: 0.1361
7/507, train_loss: 0.1171
8/507, train_loss: 0.4771
9/507, train_loss: 0.5176
10/507, train_loss: 0.6094
11/507, train_loss: 0.8423
12/507, train_loss: 0.6372
13/507, train_loss: 0.8003
14/507, train_loss: 0.8179
15/507, train_loss: 0.8818
16/507, train_loss: 0.5820
17/507, train_loss: 0.4546
18/507, train_loss: 0.7256
19/507, train_loss: 0.8271
20/507, train_loss: 0.8892
21/507, train_loss: 0.7222
22/507, train_loss: 0.8242
23/507, train_loss: 0.8164
24/507, train_loss: 0.5264
25/507, train_loss: 0.3496
26/507, train_loss: 0.7529
27/507, train_loss: 0.4292
28/507, train_loss: 0.6758
29/507, train_loss: 0.1277
30/507, train_loss: 0.6704
31/507, train_loss: 0.1042
32/507, train_loss: 0.6719
33/507, train_loss: 0.5967
34/507, train_loss: 0.5469
35/507, train_loss: 0.6807
36/507, train_loss: 0.4939
37/507, train_l



----------
epoch 3/20
1/507, train_loss: 0.5298
2/507, train_loss: 0.7056
3/507, train_loss: 0.9951
4/507, train_loss: 0.5186
5/507, train_loss: 0.3240
6/507, train_loss: 0.5649
7/507, train_loss: 0.5996
8/507, train_loss: 0.5728
9/507, train_loss: 0.3667
10/507, train_loss: 0.3796
11/507, train_loss: 0.3623
12/507, train_loss: 0.0734
13/507, train_loss: 0.1146
14/507, train_loss: 0.4541
15/507, train_loss: 0.7686
16/507, train_loss: 0.2117
17/507, train_loss: 0.3682
18/507, train_loss: 0.3369
19/507, train_loss: 0.8086
20/507, train_loss: 0.2761
21/507, train_loss: 0.0894
22/507, train_loss: 1.0352
23/507, train_loss: 0.5049
24/507, train_loss: 0.0616
25/507, train_loss: 0.7764
26/507, train_loss: 0.5464
27/507, train_loss: 0.6851
28/507, train_loss: 0.6299
29/507, train_loss: 0.4595
30/507, train_loss: 0.5063
31/507, train_loss: 0.5508
32/507, train_loss: 0.2900
33/507, train_loss: 0.5444
34/507, train_loss: 0.9302
35/507, train_loss: 0.4500
36/507, train_loss: 0.9175
37/507, train_l



----------
epoch 4/20
1/507, train_loss: 0.1451
2/507, train_loss: 0.5566
3/507, train_loss: 0.7002
4/507, train_loss: 0.6416
5/507, train_loss: 0.0587
6/507, train_loss: 0.3406
7/507, train_loss: 0.2477
8/507, train_loss: 0.4275
9/507, train_loss: 0.1737
10/507, train_loss: 0.6196
11/507, train_loss: 0.5366
12/507, train_loss: 0.9048
13/507, train_loss: 0.2974
14/507, train_loss: 0.1624
15/507, train_loss: 0.4866
16/507, train_loss: 1.1348
17/507, train_loss: 0.7490
18/507, train_loss: 0.3213
19/507, train_loss: 0.3018
20/507, train_loss: 0.3018
21/507, train_loss: 0.3953
22/507, train_loss: 0.3086
23/507, train_loss: 0.4695
24/507, train_loss: 0.9258
25/507, train_loss: 0.5381
26/507, train_loss: 0.5859
27/507, train_loss: 0.6929
28/507, train_loss: 0.5264
29/507, train_loss: 0.3035
30/507, train_loss: 0.3608
31/507, train_loss: 0.3499
32/507, train_loss: 0.0903
33/507, train_loss: 0.8672
34/507, train_loss: 0.2235
35/507, train_loss: 0.4287
36/507, train_loss: 0.0941
37/507, train_l



----------
epoch 5/20
1/507, train_loss: 0.3181
2/507, train_loss: 0.5352
3/507, train_loss: 0.3672
4/507, train_loss: 0.2090
5/507, train_loss: 0.3328
6/507, train_loss: 0.1938
7/507, train_loss: 0.4438
8/507, train_loss: 0.1399
9/507, train_loss: 0.3718
10/507, train_loss: 0.6318
11/507, train_loss: 0.2402
12/507, train_loss: 0.3250
13/507, train_loss: 0.2910
14/507, train_loss: 0.1628
15/507, train_loss: 0.1222
16/507, train_loss: 0.9346
17/507, train_loss: 0.2695
18/507, train_loss: 0.3015
19/507, train_loss: 0.0441
20/507, train_loss: 0.7207
21/507, train_loss: 0.1754
22/507, train_loss: 0.2373
23/507, train_loss: 0.2432
24/507, train_loss: 0.1501
25/507, train_loss: 0.3381
26/507, train_loss: 0.1321
27/507, train_loss: 0.2256
28/507, train_loss: 0.3848
29/507, train_loss: 0.1749
30/507, train_loss: 0.2249
31/507, train_loss: 0.3098
32/507, train_loss: 0.6870
33/507, train_loss: 0.2671
34/507, train_loss: 0.4897
35/507, train_loss: 0.1495
36/507, train_loss: 0.2417
37/507, train_l



{'mAP_IoU_0.10_0.50_0.05_MaxDet_100': 0.46203613285962963, 'nodule_mAP_IoU_0.10_0.50_0.05_MaxDet_100': 0.46203613285962963, 'AP_IoU_0.10_MaxDet_100': 0.48403505383446666, 'nodule_AP_IoU_0.10_MaxDet_100': 0.48403505383446666, 'mAR_IoU_0.10_0.50_0.05_MaxDet_100': 0.7759562730789185, 'nodule_mAR_IoU_0.10_0.50_0.05_MaxDet_100': 0.7759562730789185, 'AR_IoU_0.10_MaxDet_100': 0.8360655903816223, 'nodule_AR_IoU_0.10_MaxDet_100': 0.8360655903816223}




current epoch: 5 current metric: 0.6395 best metric: 0.6395 at epoch 5
----------
epoch 6/20
1/507, train_loss: 0.1169
2/507, train_loss: 0.9238
3/507, train_loss: 0.3972
4/507, train_loss: 0.3262
5/507, train_loss: 0.1974
6/507, train_loss: 0.2793
7/507, train_loss: 0.2170
8/507, train_loss: 0.1741
9/507, train_loss: 0.0904
10/507, train_loss: 0.3584
11/507, train_loss: 0.1562
12/507, train_loss: 0.0336
13/507, train_loss: 0.4026
14/507, train_loss: 0.7485
15/507, train_loss: 0.1851
16/507, train_loss: 0.2117
17/507, train_loss: 0.1525
18/507, train_loss: 0.3979
19/507, train_loss: 0.3943
20/507, train_loss: 0.2277
21/507, train_loss: 0.0432
22/507, train_loss: 0.2520
23/507, train_loss: 0.1038
24/507, train_loss: 0.5752
25/507, train_loss: 0.1553
26/507, train_loss: 0.3943
27/507, train_loss: 0.5088
28/507, train_loss: 0.1501
29/507, train_loss: 0.1693
30/507, train_loss: 0.3499
31/507, train_loss: 0.0337
32/507, train_loss: 0.2681
33/507, train_loss: 0.3765
34/507, train_loss: 0.185



----------
epoch 7/20
1/507, train_loss: 0.8271
2/507, train_loss: 0.1741
3/507, train_loss: 0.1993
4/507, train_loss: 0.1818
5/507, train_loss: 0.3108
6/507, train_loss: 0.2668
7/507, train_loss: 0.2830
8/507, train_loss: 0.1771
9/507, train_loss: 0.0503
10/507, train_loss: 0.1031
11/507, train_loss: 0.6187
12/507, train_loss: 0.3037
13/507, train_loss: 0.1899
14/507, train_loss: 0.2783
15/507, train_loss: 0.1575
16/507, train_loss: 0.6816
17/507, train_loss: 0.3923
18/507, train_loss: 0.7310
19/507, train_loss: 0.5068
20/507, train_loss: 0.1670
21/507, train_loss: 0.2284
22/507, train_loss: 0.2537
23/507, train_loss: 0.3008
24/507, train_loss: 0.1418
25/507, train_loss: 0.4229
26/507, train_loss: 0.3560
27/507, train_loss: 0.0405
28/507, train_loss: 0.1812
29/507, train_loss: 0.1660
30/507, train_loss: 0.8037
31/507, train_loss: 0.4229
32/507, train_loss: 0.1289
33/507, train_loss: 0.5176
34/507, train_loss: 0.7329
35/507, train_loss: 0.3833
36/507, train_loss: 0.2007
37/507, train_l



----------
epoch 8/20
1/507, train_loss: 0.1582
2/507, train_loss: 0.2922
3/507, train_loss: 0.2695
4/507, train_loss: 0.2007
5/507, train_loss: 0.2227
6/507, train_loss: 0.1338
7/507, train_loss: 0.3833
8/507, train_loss: 0.4414
9/507, train_loss: 0.0651
10/507, train_loss: 0.2158
11/507, train_loss: 0.0998
12/507, train_loss: 0.2429
13/507, train_loss: 0.2966
14/507, train_loss: 0.2747
15/507, train_loss: 0.1943
16/507, train_loss: 0.0303
17/507, train_loss: 0.2269
18/507, train_loss: 0.4619
19/507, train_loss: 0.2383
20/507, train_loss: 0.2147
21/507, train_loss: 0.4675
22/507, train_loss: 0.1237
23/507, train_loss: 0.3015
24/507, train_loss: 0.1412
25/507, train_loss: 0.5913
26/507, train_loss: 0.0277
27/507, train_loss: 0.8774
28/507, train_loss: 0.1780
29/507, train_loss: 0.0735
30/507, train_loss: 0.2480
31/507, train_loss: 0.4143
32/507, train_loss: 0.7466
33/507, train_loss: 0.2524
34/507, train_loss: 0.1185
35/507, train_loss: 0.2471
36/507, train_loss: 0.1927
37/507, train_l



----------
epoch 9/20
1/507, train_loss: 0.1887
2/507, train_loss: 0.4707
3/507, train_loss: 0.2815
4/507, train_loss: 0.5269
5/507, train_loss: 0.1871
6/507, train_loss: 0.2939
7/507, train_loss: 0.1510
8/507, train_loss: 0.5708
9/507, train_loss: 0.7051
10/507, train_loss: 0.1073
11/507, train_loss: 0.4731
12/507, train_loss: 0.4299
13/507, train_loss: 0.1542
14/507, train_loss: 0.2051
15/507, train_loss: 0.4504
16/507, train_loss: 0.9619
17/507, train_loss: 0.0283
18/507, train_loss: 0.1659
19/507, train_loss: 0.2043
20/507, train_loss: 0.3733
21/507, train_loss: 0.2118
22/507, train_loss: 0.1415
23/507, train_loss: 0.1934
24/507, train_loss: 0.5962
25/507, train_loss: 0.1553
26/507, train_loss: 0.1190
27/507, train_loss: 0.1741
28/507, train_loss: 0.0994
29/507, train_loss: 0.2341
30/507, train_loss: 0.1860
31/507, train_loss: 0.5938
32/507, train_loss: 0.2712
33/507, train_loss: 0.1522
34/507, train_loss: 0.6240
35/507, train_loss: 0.1071
36/507, train_loss: 0.1414
37/507, train_l