In [1]:
import os
from getpass import getpass

# MLflow reads its HTTP basic-auth credentials from these environment variables.
# They must never be committed with the notebook: take them from the environment
# the kernel was started in and only prompt (without echo) when one is missing.
for _credential_var in ("MLFLOW_TRACKING_USERNAME", "MLFLOW_TRACKING_PASSWORD"):
    if not os.environ.get(_credential_var):
        os.environ[_credential_var] = getpass(f"{_credential_var}: ")
del _credential_var

# Experiment under which the runs started by this notebook are grouped.
os.environ['MLFLOW_EXPERIMENT_NAME'] = 'LUNA16'
# Free-text description attached to the MLflow run started in the training cell.
description = "LUNA16 dataset training without lung segmentation"

In [2]:
import torch
import mlflow
import gc

from visualize_image import visualize_one_xy_slice_in_3d_image
from loading_dataset import load_data
from model import load_model
import numpy as np
from monai.data import box_utils
from monai.apps.detection.metrics.matching import matching_batch
from warmup_scheduler import GradualWarmupScheduler
from monai.apps.detection.metrics.coco import COCOMetric

In [3]:
# Send every subsequent MLflow call (runs, params, metrics, artifacts) to the
# remote tracking server instead of the default local store.
mlflow.set_tracking_uri("http://mlflow-server-remote:5001")

In [4]:
# ---- data loading (forwarded to load_data) ----
gt_box_mode = 'cccwhd'
batch_size = 10
patch_size = [96, 96, 40]
data_list_file_path = '/data/output/unique_data/luna_train_val.json'
# Base directory is left empty here; directories used for other datasets were
# '/data/HC_Images_resample/', '/data/MSD_Images_resample/' and
# '/data/LUNA16_Images_resample/'.
data_base_dir = ''
amp = True

# ---- detector construction (forwarded to load_model) ----
returned_layers = [1, 2]
base_anchor_shapes = [[6, 8, 4], [8, 6, 5], [10, 10, 6]]
conv1_t_stride = [2, 2, 1]
n_input_channels = 1
spatial_dims = 3
fg_labels = [0]
verbose = False
balanced_sampler_pos_fraction = 0.3
score_thresh = 0.02
nms_thresh = 0.22
val_patch_size = [128, 128, 52]

# ---- optimisation and validation schedule ----
lr = 1e-2
val_interval = 5
coco_metric = COCOMetric(classes=["nodule"], iou_list=[0.1], max_detection=[100])
# Best validation score seen so far and the epoch it occurred at; -1 means
# "nothing recorded yet" (a previous run was seeded with 0.7623).
best_val_epoch_metric = -1
best_val_epoch = -1
max_epochs = 100
w_cls = 1.0  # weight of the classification loss in the total training loss

# ---- intensity windowing (forwarded to load_data) ----
a_min = -1000.0  # previously tried: -1024
a_max = 400.0  # previously tried: 300.0

# First epoch index of the training loop.
resume_epoch = 0

# Half precision when mixed precision is enabled, full precision otherwise.
compute_dtype = torch.float16 if amp else torch.float32

torch.backends.cudnn.benchmark = True

In [5]:
# import monai

# monai.config.print_config()

### Loading dataset

In [6]:
# Build the data pipeline from the configuration cell. The three results are
# unpacked as the training loader, the validation loader and the training-set
# length (the latter is used to derive the number of iterations per epoch).
train_loader, val_loader, len_train_ds = load_data(
    gt_box_mode,
    patch_size,
    batch_size,
    amp,
    data_list_file_path,
    data_base_dir,
    a_min,
    a_max,
)

monai.transforms.io.dictionary LoadImaged.__init__:image_only: Current default value of argument `image_only=False` has been deprecated since version 1.1. It will be changed to `image_only=True` in version 1.3.


### Loading model

In [7]:
# Construct the detector from the configuration cell. load_model also returns
# the device that batches are moved to in the training and validation loops.
detector, device = load_model(
    returned_layers,
    base_anchor_shapes,
    conv1_t_stride,
    n_input_channels,
    spatial_dims,
    fg_labels,
    verbose,
    balanced_sampler_pos_fraction,
    score_thresh,
    nms_thresh,
    val_patch_size,
)

### Initialize training

In [8]:
# SGD with Nesterov momentum over all parameters of the detector network.
optimizer = torch.optim.SGD(
    detector.network.parameters(), lr, momentum=0.9, weight_decay=3e-5, nesterov=True
)

# Learning-rate schedule: a warm-up scheduler (total_epoch=10) wrapping a StepLR
# that multiplies the rate by 0.1 every 150 scheduler steps.
after_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.1)
scheduler_warmup = GradualWarmupScheduler(
    optimizer,
    multiplier=1,
    total_epoch=10,
    after_scheduler=after_scheduler,
)

# Gradient scaling is only needed for mixed-precision training.
if amp:
    scaler = torch.cuda.amp.GradScaler()
else:
    scaler = None

# One optimizer step with cleared gradients before training starts.
# NOTE(review): presumably this avoids PyTorch's warning about calling the
# scheduler before the first optimizer step, since the training loop steps the
# scheduler at the top of every epoch -- confirm.
optimizer.zero_grad()
optimizer.step()

### Train

In [9]:
# Nominal number of iterations per epoch; used for the progress printout and to
# derive a global step index for the per-iteration MLflow metric.
# NOTE(review): if the loader yields more batches than this (e.g. a final partial
# batch), step indices of consecutive epochs overlap -- confirm against load_data.
epoch_len = len_train_ds // train_loader.batch_size


def _field_as_numpy(records, key):
    """Return ``record[key]`` moved to CPU as a NumPy array, for every record."""
    return [record[key].cpu().detach().numpy() for record in records]


# To continue logging into an existing MLflow run, additionally pass
# run_id="<existing run id>" to start_run.
with mlflow.start_run(description=description) as run:

    # Record the configuration of this run in one batched call.
    mlflow.log_params(
        {
            "gt_box_mode": gt_box_mode,
            "batch_size": batch_size,
            "patch_size": patch_size,
            "data_list_file_path": data_list_file_path,
            "data_base_dir": data_base_dir,
            "amp": amp,
            "n_input_channels": n_input_channels,
            "spatial_dims": spatial_dims,
            "balanced_sampler_pos_fraction": balanced_sampler_pos_fraction,
            "score_thresh": score_thresh,
            "nms_thresh": nms_thresh,
            "initial_lr": lr,
            "val_interval": val_interval,
            "max_epochs": max_epochs,
            "w_cls": w_cls,
            "a_min": a_min,
            "a_max": a_max,
        }
    )

    for epoch in range(resume_epoch, max_epochs):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{max_epochs}")
        detector.train()
        epoch_loss = 0
        epoch_cls_loss = 0
        epoch_box_reg_loss = 0
        step = 0
        scheduler_warmup.step()

        for batch_data in train_loader:
            step += 1
            # Each loader item is a list of lists of samples; flatten it into one
            # list of images and one list of {label, box} targets on the device.
            inputs = [
                batch_data_ii["image"].to(device) for batch_data_i in batch_data for batch_data_ii in batch_data_i
            ]
            targets = [
                dict(
                    label=batch_data_ii["label"].to(device),
                    box=batch_data_ii["box"].to(device)
                )
                for batch_data_i in batch_data
                for batch_data_ii in batch_data_i
            ]

            # The optimizer owns exactly detector.network.parameters(), so this
            # resets the same gradients as clearing them one by one.
            optimizer.zero_grad(set_to_none=True)

            if amp and (scaler is not None):
                with torch.cuda.amp.autocast():
                    outputs = detector(inputs, targets)
                    loss = w_cls * outputs[detector.cls_key] + outputs[detector.box_reg_key]
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = detector(inputs, targets)
                loss = w_cls * outputs[detector.cls_key] + outputs[detector.box_reg_key]
                loss.backward()
                optimizer.step()

            # saving into mlflow; read the scalar loss from the device only once
            step_loss = loss.detach().item()
            epoch_loss += step_loss
            epoch_cls_loss += outputs[detector.cls_key].detach().item()
            epoch_box_reg_loss += outputs[detector.box_reg_key].detach().item()
            print(f"{step}/{epoch_len}, train_loss: {step_loss:.4f}")
            mlflow.log_metric("train_loss", step_loss, epoch_len * epoch + step)

        if step == 0:
            # Fail with a clear message instead of a NameError / ZeroDivisionError below.
            raise RuntimeError("train_loader yielded no batches; check the dataset configuration")

        # Drop every reference to the last batch (including targets, outputs and
        # loss) so that empty_cache can actually release that GPU memory.
        del inputs, targets, outputs, loss, batch_data
        torch.cuda.empty_cache()
        gc.collect()

        epoch_loss /= step
        epoch_cls_loss /= step
        epoch_box_reg_loss /= step
        print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
        mlflow.log_metric("avg_train_loss", epoch_loss, epoch + 1)
        mlflow.log_metric("avg_train_cls_loss", epoch_cls_loss, epoch + 1)
        mlflow.log_metric("avg_train_box_reg_loss", epoch_box_reg_loss, epoch + 1)
        mlflow.log_metric("train_lr", optimizer.param_groups[0]["lr"], epoch + 1)

        # saving last trained model
        mlflow.pytorch.log_model(detector.network, "model")

        # validation for model selection
        if (epoch + 1) % val_interval == 0:
            detector.eval()
            val_outputs_all = []
            val_targets_all = []

            with torch.no_grad():
                for val_data in val_loader:
                    # if all val_data_i["image"] smaller than val_patch_size, no need to use inferer
                    # otherwise, need inferer to handle large input images.
                    use_inferer = not all(
                        val_data_i["image"][0, ...].numel() < np.prod(val_patch_size) for val_data_i in val_data
                    )
                    # pop() removes the image from the record, so only the
                    # remaining entries are kept in val_targets_all.
                    val_inputs = [val_data_i.pop("image").to(device) for val_data_i in val_data]

                    # autocast is a no-op when amp is False
                    with torch.cuda.amp.autocast(enabled=amp):
                        val_outputs = detector(val_inputs, use_inferer=use_inferer)

                    # save outputs for evaluation
                    val_outputs_all += val_outputs
                    val_targets_all += val_data

            # visualize an inference image and boxes (first item of the last validation batch)
            draw_img = visualize_one_xy_slice_in_3d_image(
                gt_boxes=val_data[0]["box"].cpu().detach().numpy(),
                image=val_inputs[0][0, ...].cpu().detach().numpy(),
                pred_boxes=val_outputs[0][detector.target_box_key].cpu().detach().numpy(),
            )
            mlflow.log_image(draw_img, str(epoch + 1) + "_val_img_xy.png")

            # compute metrics
            del val_inputs
            torch.cuda.empty_cache()
            results_metric = matching_batch(
                iou_fn=box_utils.box_iou,
                iou_thresholds=coco_metric.iou_thresholds,
                pred_boxes=_field_as_numpy(val_outputs_all, detector.target_box_key),
                pred_classes=_field_as_numpy(val_outputs_all, detector.target_label_key),
                pred_scores=_field_as_numpy(val_outputs_all, detector.pred_score_key),
                gt_boxes=_field_as_numpy(val_targets_all, detector.target_box_key),
                gt_classes=_field_as_numpy(val_targets_all, detector.target_label_key),
            )
            val_epoch_metric_dict = coco_metric(results_metric)[0]
            print(val_epoch_metric_dict)

            # write metrics; the model-selection score is the mean of all reported values
            for metric_name, metric_value in val_epoch_metric_dict.items():
                mlflow.log_metric("val_" + metric_name, metric_value, epoch + 1)
            val_epoch_metric = sum(val_epoch_metric_dict.values()) / len(val_epoch_metric_dict)
            mlflow.log_metric("val_metric", val_epoch_metric, epoch + 1)

            # save best trained model
            if val_epoch_metric > best_val_epoch_metric:
                best_val_epoch_metric = val_epoch_metric
                best_val_epoch = epoch + 1
                mlflow.pytorch.log_model(detector.network, "best_model")
                # Log with the epoch as step so that successive improvements form a
                # history instead of all being written to step 0.
                mlflow.log_metric("best_metric", best_val_epoch_metric, epoch + 1)
                mlflow.log_metric("best_epoch", best_val_epoch, epoch + 1)
            print(
                "current epoch: {} current metric: {:.4f} "
                "best metric: {:.4f} at epoch {}".format(
                    epoch + 1, val_epoch_metric, best_val_epoch_metric, best_val_epoch
                )
            )

    print(f"train completed, best_metric: {best_val_epoch_metric:.4f} " f"at epoch: {best_val_epoch}")

----------
epoch 1/100


Applied workaround for CuDNN issue, install nvrtc.so (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:80.)


1/182, train_loss: 1.2979
2/182, train_loss: 1.5957
3/182, train_loss: 1.1133
4/182, train_loss: 1.1016
5/182, train_loss: 1.3047
6/182, train_loss: 1.1719
7/182, train_loss: 1.3955
8/182, train_loss: 1.1680
9/182, train_loss: 1.0674
10/182, train_loss: 1.2363
11/182, train_loss: 0.7578
12/182, train_loss: 0.9717
13/182, train_loss: 0.8340
14/182, train_loss: 1.0898
15/182, train_loss: 0.8145
16/182, train_loss: 0.6943
17/182, train_loss: 0.7393
18/182, train_loss: 0.7388
19/182, train_loss: 0.8604
20/182, train_loss: 0.7822
21/182, train_loss: 0.6890
22/182, train_loss: 0.8633
23/182, train_loss: 0.9121
24/182, train_loss: 0.7432
25/182, train_loss: 0.9297
26/182, train_loss: 0.6685
27/182, train_loss: 0.7949
28/182, train_loss: 0.7065
29/182, train_loss: 0.7402
30/182, train_loss: 0.7148
31/182, train_loss: 0.6685
32/182, train_loss: 0.5034
33/182, train_loss: 0.6597
34/182, train_loss: 0.6660
35/182, train_loss: 0.8916
36/182, train_loss: 0.7676
37/182, train_loss: 0.6753
38/182, tr



----------
epoch 2/100
1/182, train_loss: 0.6118
2/182, train_loss: 0.7007
3/182, train_loss: 0.7163
4/182, train_loss: 0.7607
5/182, train_loss: 0.4600
6/182, train_loss: 0.8545
7/182, train_loss: 0.8325
8/182, train_loss: 0.7114
9/182, train_loss: 0.7437
10/182, train_loss: 0.6934
11/182, train_loss: 0.6245
12/182, train_loss: 0.8462
13/182, train_loss: 0.6465
14/182, train_loss: 0.6367
15/182, train_loss: 0.7803
16/182, train_loss: 0.6255
17/182, train_loss: 0.5825
18/182, train_loss: 0.6499
19/182, train_loss: 0.6064
20/182, train_loss: 0.7124
21/182, train_loss: 0.5938
22/182, train_loss: 0.6572
23/182, train_loss: 0.5586
24/182, train_loss: 0.6953
25/182, train_loss: 0.5371
26/182, train_loss: 0.5088
27/182, train_loss: 0.8237
28/182, train_loss: 0.6182
29/182, train_loss: 0.5562
30/182, train_loss: 0.7314
31/182, train_loss: 0.7344
32/182, train_loss: 0.6011
33/182, train_loss: 0.6436
34/182, train_loss: 0.6323
35/182, train_loss: 0.6792
36/182, train_loss: 0.6694
37/182, train_

KeyboardInterrupt: 