In [1]:
# Turn off the jedi backend of IPython's tab completer for this kernel session.
%config IPCompleter.use_jedi = False

In [2]:
import wholeslidedata
from wholeslidedata.iterators import create_batch_iterator
import os

In [3]:
import detectron2
import torch
from detectron2.structures import (
    BoxMode,
    Instances,
    Boxes
)
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
from detectron2 import model_zoo

In [4]:
from wholeslidedata.iterators import BatchIterator
import numpy as np
class Detectron22DataLoader(BatchIterator):
    """Batch iterator that repackages each batch as a list of per-sample dicts.

    Every call to ``next()`` pulls one ``(x_batch, y_batch)`` pair from the
    parent ``BatchIterator`` and returns a list with one dict per sample,
    holding the keys ``'image'`` and ``'instances'``.
    """

    def __next__(self):
        """Return the next batch as a list of ``{'image', 'instances'}`` dicts.

        Returns:
            list[dict]: one entry per sample in the batch, where
                * ``'image'`` is a float32 ``torch.Tensor`` built from the
                  sample after scaling by 1/255 and moving the last axis
                  to the front, and
                * ``'instances'`` is the ``Instances`` object produced by
                  ``_get_gt_boxes`` for that sample.
        """
        x_batch, y_batch = super().__next__()
        # NOTE(review): detectron2's default cfg.MODEL.PIXEL_MEAN / PIXEL_STD are
        # expressed on a 0-255 scale; confirm that feeding [0, 1]-scaled pixels
        # is intended (or adjust the cfg accordingly).
        x_batch = x_batch / 255.0

        batch_dicts = []
        for idx, x_sample in enumerate(x_batch):
            # presumably x_sample is (H, W, C) -- TODO confirm against the iterator config
            target_gt_boxes = self._get_gt_boxes(y_batch[idx], x_sample.shape[:2])
            # Fix: the original read the not-yet-assigned local ``image`` here,
            # which raises UnboundLocalError; the sample itself must be transposed.
            image = x_sample.transpose(2, 0, 1).astype("float32")
            batch_dicts.append(
                {
                    'instances': target_gt_boxes,
                    'image': torch.as_tensor(image),
                }
            )
        return batch_dicts

    def _get_gt_boxes(self, y_sample, image_size):
        """Build the ground-truth ``Instances`` for a single sample.

        Args:
            y_sample: numpy array of annotation rows; rows consisting only of
                zeros are dropped. The first four columns are used as box
                coordinates and the second-to-last column as the class id
                (presumably the layout emitted by the wholeslidedata
                detection callback -- verify against the data config).
            image_size: value forwarded to ``Instances``; the caller passes
                the first two entries of the sample's shape.

        Returns:
            Instances: with ``gt_boxes`` (``Boxes``) and ``gt_classes``
            (int64 tensor) set.
        """
        # Drop all-zero rows.
        y_boxes = y_sample[~np.all(y_sample == 0, axis=-1)]
        # Source and target mode are both XYXY_ABS, so this conversion leaves
        # the coordinates unchanged.
        # NOTE(review): confirm the annotations really are absolute XYXY.
        boxes = [
            BoxMode.convert(obj[:4], BoxMode.XYXY_ABS, BoxMode.XYXY_ABS)
            for obj in y_boxes
        ]
        target = Instances(image_size)
        target.gt_boxes = Boxes(boxes)
        classes = [int(obj[-2]) for obj in y_boxes]
        target.gt_classes = torch.tensor(classes, dtype=torch.int64)
        return target


In [5]:
class WholeSlideDataDetectionTrainer(DefaultTrainer):
    """``DefaultTrainer`` subclass that takes its training batches from a
    wholeslidedata batch iterator built with ``Detectron22DataLoader``."""

    @classmethod
    def build_train_loader(cls, cfg):
        """Create and return the training batch iterator.

        ``cfg`` is accepted to match the ``DefaultTrainer`` hook signature but
        is not read here; the iterator is configured from the YAML file
        ``./configs/detection_config.yml`` in ``'training'`` mode with one CPU.
        """
        return create_batch_iterator(
            user_config='./configs/detection_config.yml',
            mode='training',
            cpus=1,
            iterator_class=Detectron22DataLoader,
        )

In [9]:
# Stand-alone training iterator for interactive inspection; it is built with the
# same arguments that WholeSlideDataDetectionTrainer.build_train_loader uses.
mode = 'training'
cpus = 1
user_config = './configs/detection_config.yml'

training_batch_generator = create_batch_iterator(
    user_config=user_config,
    mode=mode,
    cpus=cpus,
    iterator_class=Detectron22DataLoader,
)

In [10]:
def train(output_dir='/home/user/output/', max_iter=200000, resume=False):
    """Configure and run Faster R-CNN training on the wholeslidedata iterator.

    The configuration starts from the model-zoo file
    ``COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml`` and overrides the
    solver, ROI-head and output settings below. Batches come from
    ``WholeSlideDataDetectionTrainer.build_train_loader``, which does not read
    ``cfg``.

    Args:
        output_dir: directory assigned to ``cfg.OUTPUT_DIR``; created if
            missing. Defaults to the path originally hard-coded here.
        max_iter: value assigned to ``cfg.SOLVER.MAX_ITER``.
        resume: forwarded to ``trainer.resume_or_load``.

    Calling ``train()`` with no arguments behaves exactly as before.
    """
    cfg = get_cfg()
    cfg.merge_from_file(
        model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")
    )
    # NOTE(review): this dataset name is only stored in the config; the custom
    # trainer's build_train_loader ignores cfg, so it is presumably never
    # looked up in the dataset catalog -- confirm.
    cfg.DATASETS.TRAIN = ("detection_dataset2",)
    cfg.DATASETS.TEST = ()
    cfg.DATALOADER.NUM_WORKERS = 1
    # cfg.MODEL.WEIGHTS is intentionally left at whatever the merged model-zoo
    # config specifies (no override here).
    cfg.SOLVER.IMS_PER_BATCH = 8
    cfg.SOLVER.BASE_LR = 0.00001
    cfg.SOLVER.MAX_ITER = max_iter
    # Smaller than the detectron2 default of 512 to speed up each iteration.
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 64
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
    cfg.OUTPUT_DIR = output_dir
    # Learning-rate schedule: scale the LR by GAMMA at each listed iteration,
    # after a 100-iteration warm-up.
    cfg.SOLVER.STEPS = (1000, 10000, 20000, 50000, 100000)
    cfg.SOLVER.WARMUP_ITERS = 100
    cfg.SOLVER.GAMMA = 0.5
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    trainer = WholeSlideDataDetectionTrainer(cfg)
    trainer.resume_or_load(resume=resume)
    trainer.train()



In [None]:
# Start training with the settings defined in train(); the log below is its output.
train()

[32m[09/21 19:42:26 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

[34mbackbone.fpn_lateral2.{bias, weight}[0m
[34mbackbone.fpn_lateral3.{bias, weight}[0m
[34mbackbone.fpn_lateral4.{bias, weight}[0m
[34mbackbone.fpn_lateral5.{bias, weight}[0m
[34mbackbone.fpn_output2.{bias, weight}[0m
[34mbackbone.fpn_output3.{bias, weight}[0m
[34mbackbone.fpn_output4.{bias, weight}[0m
[34mbackbone.fpn_output5.{bias, weight}[0m
[34mproposal_generator.rpn_head.anchor_deltas.{bias, weight}[0m
[34mproposal_generator.rpn_head.conv.{bias, weight}[0m
[34mproposal_generator.rpn_head.objectness_logits.{bias, weight}[0m
[34mroi_heads.box_head.fc1.{bias, weight}[0m
[34mroi_heads.box_head.fc2.{bias, weight}[0m
[34mroi_heads.box_predictor.bbox_pred.{bias, weight}[0m
[34mroi_heads.box_predictor.cls_score.{bias, weight}[0m
  [35mlinear.{bias, weight}[0m


[32m[09/21 19:42:27 d2.engine.train_loop]: [0mStarting training from iteration 0
[32m[09/21 19:42:33 d2.utils.events]: [0m eta: 16:24:32  iter: 19  total_loss: 1.761  loss_cls: 0.5831  loss_box_reg: 0  loss_rpn_cls: 0.694  loss_rpn_loc: 0.4809  time: 0.2927  data_time: 0.0132  lr: 1.9081e-06  max_mem: 2825M
[32m[09/21 19:42:39 d2.utils.events]: [0m eta: 16:32:08  iter: 39  total_loss: 1.524  loss_cls: 0.3682  loss_box_reg: 0  loss_rpn_cls: 0.6924  loss_rpn_loc: 0.4507  time: 0.2967  data_time: 0.0028  lr: 3.9061e-06  max_mem: 2825M
[32m[09/21 19:42:45 d2.utils.events]: [0m eta: 16:32:42  iter: 59  total_loss: 1.287  loss_cls: 0.1605  loss_box_reg: 0  loss_rpn_cls: 0.6908  loss_rpn_loc: 0.4608  time: 0.2968  data_time: 0.0021  lr: 5.9041e-06  max_mem: 2825M
[32m[09/21 19:42:51 d2.utils.events]: [0m eta: 16:32:36  iter: 79  total_loss: 1.174  loss_cls: 0.0749  loss_box_reg: 0  loss_rpn_cls: 0.6864  loss_rpn_loc: 0.4164  time: 0.2978  data_time: 0.0033  lr: 7.9021e-06  max_mem: 

In [8]:
# NOTE(review): this cell carries execution count 8, lower than the cells above
# it, and its saved output ends in a traceback -- it looks like a leftover from
# an earlier run; confirm whether it should be removed.
train()

[32m[09/21 19:39:58 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Some model parameters or buffers are not found in the checkpoint:
[34mbackbone.fpn_lateral2.{bias, weight}[0m
[34mbackbone.fpn_lateral3.{bias, weight}[0m
[34mbackbone.fpn_lateral4.{bias, weight}[0m
[34mbackbone.fpn_lateral5.{bias, weight}[0m
[34mbackbone.fpn_output2.{bias, weight}[0m
[34mbackbone.fpn_output3.{bias, weight}[0m
[34mbackbone.fpn_output4.{bias, weight}[0m
[34mbackbone.fpn_output5.{bias, weight}[0m
[34mproposal_generator.rpn_head.anchor_deltas.{bias, weight}[0m
[34mproposal_generator.rpn_head.conv.{bias, weight}[0m
[34mproposal_generator.rpn_head.objectness_logits.{bias, weight}[0m
[34mroi_heads.box_head.fc1.{bias, weight}[0m
[34mroi_heads.box_head.fc2.{bias, weight}[0m
[34mroi_heads.box_predictor.bbox_pred.{bias, weight}[0m
[34mroi_heads.box_predictor.cls_score.{bias, weight}[0m
The checkpoint state_dict contains keys that are not used by the model:
  [35mlinear.{bias, weight}[0m


[32m[09/21 19:39:59 d2.engine.train_loop]: [0mStarting training from iteration 0


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1631630815121/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


[32m[09/21 19:40:05 d2.utils.events]: [0m eta: 16:18:50  iter: 19  total_loss: 1.734  loss_cls: 0.5857  loss_box_reg: 0  loss_rpn_cls: 0.6819  loss_rpn_loc: 0.4717  time: 0.2925  data_time: 0.0129  lr: 1.9081e-06  max_mem: 1595M
[32m[09/21 19:40:11 d2.utils.events]: [0m eta: 16:22:18  iter: 39  total_loss: 1.543  loss_cls: 0.4114  loss_box_reg: 0  loss_rpn_cls: 0.6803  loss_rpn_loc: 0.441  time: 0.2947  data_time: 0.0022  lr: 3.9061e-06  max_mem: 1595M
[32m[09/21 19:40:17 d2.utils.events]: [0m eta: 16:34:49  iter: 59  total_loss: 1.311  loss_cls: 0.2067  loss_box_reg: 0  loss_rpn_cls: 0.6782  loss_rpn_loc: 0.4511  time: 0.2968  data_time: 0.0020  lr: 5.9041e-06  max_mem: 1595M
[32m[09/21 19:40:23 d2.utils.events]: [0m eta: 16:28:58  iter: 79  total_loss: 1.177  loss_cls: 0.1015  loss_box_reg: 0  loss_rpn_cls: 0.6742  loss_rpn_loc: 0.4077  time: 0.2958  data_time: 0.0023  lr: 7.9021e-06  max_mem: 1595M
[32m[09/21 19:40:29 d2.utils.events]: [0m eta: 16:32:39  iter: 99  total_lo

Process ProducerForkProcess-11:
Traceback (most recent call last):
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Process CommanderForkProcess-10:
  File "/home/user/.local/lib/python3.8/site-packages/concurrentbuffer/producer.py", line 59, in run
    for message in iter(self._message_queue.get, STOP_MESSAGE):
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/multiprocessing/queues.py", line 97, in get
    res = self._recv_bytes()
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
Traceback (most recent call last):
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/us

[32m[09/21 19:42:12 d2.engine.hooks]: [0mOverall training speed: 445 iterations in 0:02:11 (0.2950 s / it)
[32m[09/21 19:42:12 d2.engine.hooks]: [0mTotal training time: 0:02:11 (0:00:00 on hooks)
[32m[09/21 19:42:12 d2.utils.events]: [0m eta: 16:26:09  iter: 447  total_loss: 0.6645  loss_cls: 0.001431  loss_box_reg: 0  loss_rpn_cls: 0.5586  loss_rpn_loc: 0.102  time: 0.2944  data_time: 0.0021  lr: 1e-05  max_mem: 1595M


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-2da0ffaf5447>", line 1, in <module>
    train()
  File "<ipython-input-7-71237222ffa3>", line 27, in train
    trainer.train()
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/site-packages/detectron2/engine/defaults.py", line 487, in train
    super().train(self.start_iter, self.max_iter)
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/site-packages/detectron2/engine/train_loop.py", line 149, in train
    self.run_step()
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/site-packages/detectron2/engine/defaults.py", line 497, in run_step
    self._trainer.run_step()
  File "/home/user/miniconda3/envs/detectron2/lib/python3.8/site-packages/detectron2/engine/train_loop.py", line 285, in run_step
    losses.backwar

TypeError: object of type 'NoneType' has no len()