# Install Detectron2

In [None]:
# Delete any existing local `detectron2/` directory in the working directory
# (presumably left over from an earlier run -- confirm it is still needed).
!rm -rf detectron2/
# Install Detectron2 from the upstream GitHub repository using the kernel's pip.
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

# Download Pretrained Model

In [None]:
# Fetch the checkpoint and store it as `model.pth` in the working directory.
# NOTE(review): the share link uses `dl=0`; confirm that wget receives the
# checkpoint itself and not an HTML preview page (`dl=1` requests a direct download).
!wget 'https://www.dropbox.com/scl/fi/hrhdo9ev957hbllbnzctt/model_final.pth?rlkey=lfn9ju29g1xtpcxz3pl2pj8do&dl=0' -O model.pth

# Set Some Constants

In [None]:
import os

# `!export ...` runs in a throw-away subshell, so the variable never reaches
# the kernel process. Set it on os.environ so libraries loaded by this kernel
# can actually see it.
# NOTE(review): presumably this caps a PyTorch-internal LRU cache to limit
# memory growth -- confirm against the PyTorch version in use.
os.environ["LRU_CACHE_CAPACITY"] = "1"

# inference constants
ACCEPTANCE_THRESHOLD = 0.45  # minimum prediction score kept by get_masks
BATCH = 2  # test-loader batch size

# Make all necessary imports

In [None]:
from datetime import datetime
import logging
from pathlib import Path
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import LazyConfig, instantiate
from detectron2.engine import (
    AMPTrainer,
    SimpleTrainer,
    default_argument_parser,
    default_setup,
    default_writers,
    hooks,
    launch
)
from detectron2.engine.defaults import _try_get_key
from detectron2.engine.defaults import create_ddp_model
from detectron2.evaluation import inference_on_dataset, print_csv_format
from detectron2.utils import comm

from functools import partial
from detectron2.utils.file_io import PathManager
from omegaconf import OmegaConf
import torch.nn as nn
from fvcore.common.param_scheduler import MultiStepParamScheduler

from detectron2 import model_zoo
from detectron2.config import LazyCall as L
from detectron2.solver import WarmupParamScheduler
from detectron2.modeling import MViT
from detectron2.layers import ShapeSpec
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.roi_heads import (
    FastRCNNOutputLayers,
    FastRCNNConvFCHead,
    CascadeROIHeads,
)
from detectron2.utils.env import seed_all_rng
from detectron2.data.datasets import register_coco_instances
import detectron2.data.transforms as T
from detectron2 import model_zoo
from detectron2.config import LazyCall as L
import detectron2.data.transforms as T
from detectron2.config import LazyCall as L
from detectron2.data import (
    DatasetMapper,
    build_detection_test_loader,
    build_detection_train_loader,
    get_detection_dataset_dicts,
)
from detectron2.evaluation import COCOEvaluator
from detectron2.utils.logger import setup_logger
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import LazyConfig, instantiate
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
import matplotlib.pyplot as plt
import time

import atexit
import bisect
from copy import copy
import multiprocessing as mp
from collections import deque
import cv2
import torch

import detectron2.data.transforms as T
from detectron2.data import MetadataCatalog
from detectron2.structures import Instances
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
import numpy as np
from torchvision import transforms

# Register Datasets

In [None]:
# Register the train and test splits as COCO-format datasets: each entry is
# (dataset name, annotation JSON, image directory). No extra metadata is given.
for _split_name, _annotation_json, _image_dir in (
    (
        "badlad_train",
        '/kaggle/input/dlsprint2/badlad/labels/coco_format/train/badlad-train-coco.json',
        '/kaggle/input/dlsprint2/badlad/images/train',
    ),
    (
        "badlad_test",
        '/kaggle/input/dlsprint2/badlad/badlad-test-metadata.json',
        '/kaggle/input/dlsprint2/badlad/images/test',
    ),
):
    register_coco_instances(_split_name, {}, _annotation_json, _image_dir)

# Configure Model, Trainer and Dataloader

In [None]:
def setup():
    """Build the lazy config for the model, data loaders, trainer and optimizer.

    Starts from the model-zoo ``mask_rcnn_fpn`` config, swaps the bottom-up
    backbone for an ``MViT`` and replaces the ROI heads with a three-stage
    ``CascadeROIHeads``. Train/test loaders are defined over the
    ``badlad_train`` / ``badlad_test`` datasets; the test batch size is read
    from the module-level ``BATCH`` constant.

    Returns:
        An OmegaConf config with the entries ``model``, ``dataloader``
        (``train``, ``test``, ``evaluator``), ``train``, ``optimizer`` and
        ``lr_multiplier``.
    """
    model = model_zoo.get_config("common/models/mask_rcnn_fpn.py").model
    constants = model_zoo.get_config("common/data/constants.py").constants
    model.pixel_mean = constants.imagenet_rgb256_mean
    model.pixel_std = constants.imagenet_rgb256_std
    model.input_format = "RGB"
    model.backbone.bottom_up = L(MViT)(
        embed_dim=96,
        depth=24,
        num_heads=1,
        last_block_indexes=(1, 4, 20, 23),
        residual_pooling=True,
        drop_path_rate=0.4,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        out_features=("scale2", "scale3", "scale4", "scale5"),
    )
    # Interpolation: the FPN consumes exactly the features the MViT emits.
    model.backbone.in_features = "${.bottom_up.out_features}"
    model.backbone.square_pad = 1024

    # New heads and LN
    model.backbone.norm = "LN"  # Use LN in FPN
    # NOTE: box_head is removed just below, so only the mask_head setting
    # survives into the final config.
    model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN"

    # 2conv in RPN:
    model.proposal_generator.head.conv_dims = [-1, -1]

    # arguments that don't exist for Cascade R-CNN
    for key in ("box_head", "box_predictor", "proposal_matcher"):
        model.roi_heads.pop(key)
    # One box head / predictor / matcher per cascade stage (three stages).
    model.roi_heads.update(
        _target_=CascadeROIHeads,
        box_heads=[
            L(FastRCNNConvFCHead)(
                input_shape=ShapeSpec(channels=256, height=7, width=7),
                conv_dims=[256, 256, 256, 256],
                fc_dims=[1024],
                conv_norm="LN",
            )
            for _ in range(3)
        ],
        box_predictors=[
            L(FastRCNNOutputLayers)(
                input_shape=ShapeSpec(channels=1024),
                test_score_thresh=0.05,
                box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
                cls_agnostic_bbox_reg=True,
                num_classes="${...num_classes}",
                test_topk_per_image=1000
            )
            for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
        ],
        proposal_matchers=[
            L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
            for th in [0.5, 0.6, 0.7]
        ],
    )
    model.roi_heads.num_classes = 4
    model.roi_heads.batch_size_per_image = 512

    dataloader = OmegaConf.create()

    image_size = 1024
    dataloader.train = L(build_detection_train_loader)(
        dataset=L(get_detection_dataset_dicts)(names="badlad_train"),
        mapper=L(DatasetMapper)(
            is_train=True,
            augmentations=[
                L(T.RandomBrightness)(intensity_min=0.8,intensity_max=1.2),
                L(T.RandomContrast)(intensity_min=0.5,intensity_max=1.5),
                L(T.RandomSaturation)(intensity_min=0.5,intensity_max=1.0),
                L(T.RandomRotation)(angle=[-5, 5], sample_style="range"),
                L(T.ResizeScale)(
                    min_scale=0.1, max_scale=2.0, target_width=image_size, target_height=image_size
                ),
                L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False),
            ],
            image_format="RGB",
            use_instance_mask=True,
        ),
        total_batch_size=16,
        # Previously created with 4 workers and immediately overridden to 2;
        # the effective value is set directly here.
        num_workers=2,
    )

    dataloader.test = L(build_detection_test_loader)(
        dataset=L(get_detection_dataset_dicts)(names="badlad_test", filter_empty=False),
        mapper=L(DatasetMapper)(
            is_train=False,
            augmentations=[
                L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size),
            ],
            image_format="RGB",
        ),
        batch_size=BATCH,
        num_workers=2,
    )

    dataloader.evaluator = L(COCOEvaluator)(
        dataset_name="${..test.dataset.names}",
    )

    # recompute boxes due to cropping
    dataloader.train.mapper.recompute_boxes = True

    # Initialization and trainer settings
    train = model_zoo.get_config("common/train.py").train
    train.amp.enabled = True
    train.ddp.fp16_compression = True
    train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth"
    train.output_dir = "./output/mvit2b"
    # Schedule
    # 36 epochs = 20365 / 16 * 36 ~= 45821 iterations
    # (20365 is presumably the number of training images -- confirm)
    train.max_iter = 45821
    train.eval_period = 1000
    train.log_period = 20
    train.checkpointer.period = 2000
    train.device = "cuda"

    # Step schedule: multiplier 1.0 -> 0.1 -> 0.01 at the two milestones,
    # preceded by a 50-iteration warmup expressed as a fraction of max_iter.
    lr_multiplier = L(WarmupParamScheduler)(
        scheduler=L(MultiStepParamScheduler)(
            values=[1.0, 0.1, 0.01],
            milestones=[40730, 44447],
            num_updates=train.max_iter,
        ),
        warmup_length=50 / train.max_iter,
        warmup_factor=0.001,
    )

    optimizer = model_zoo.get_config("common/optim.py").AdamW
    # No weight decay on positional embeddings.
    optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}}
    optimizer.lr = 0.00008

    dataloader.evaluator.output_dir = train.output_dir

    cfg = OmegaConf.create()
    cfg.model = model
    cfg.dataloader = dataloader
    cfg.train = train
    cfg.optimizer = optimizer
    cfg.lr_multiplier = lr_multiplier

    return cfg

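Cap the block size that the CUDA caching allocator is allowed to split (`max_split_size_mb`) to reduce memory fragmentation and avoid out-of-memory (OOM) errors.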

In [None]:
import os

# Configure the PyTorch CUDA caching allocator through its environment variable.
os.environ.update({'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:512'})

# Instantiate Model and Load Weights

In [None]:
# Set up logging and build the full lazy config.
logger = setup_logger()
cfg = setup()

# Instantiate the model from the config and move it to the configured device.
model = instantiate(cfg.model)
model = model.to(cfg.train.device)

# Load the downloaded weights, then switch to evaluation mode.
checkpointer = DetectionCheckpointer(model)
checkpointer.load('model.pth')
model.eval()

Encode the predicted masks with run-length encoding (RLE), the format used in the submission file.

In [None]:
def rle_encode(mask):
    """Run-length encode a 2-D mask in column-major order.

    Args:
        mask: 2-D NumPy array; non-zero entries are foreground.

    Returns:
        A space-separated string of ``start length`` pairs, where ``start`` is
        the 1-based index of a run in the column-major flattened mask. An
        all-zero or empty mask yields the empty string.
    """
    # Transpose + row-major flatten == column-major traversal of the mask.
    pixels = mask.T.flatten()
    # Always surround the pixels with zero sentinels. Every run then has both
    # a start and an end transition, no special case is needed for masks that
    # touch the first/last pixel, and empty input no longer raises IndexError.
    padded = np.pad(pixels, 1)
    # A change between padded[j] and padded[j + 1] happens at the 1-based
    # position j + 1 of the unpadded pixels.
    rle = np.where(padded[1:] != padded[:-1])[0] + 1
    # Turn each (start, end) pair into (start, length).
    rle[1::2] = rle[1::2] - rle[:-1:2]
    return ' '.join(str(x) for x in rle)

In [None]:
from detectron2.utils.memory import retry_if_cuda_oom

thing_classes = ['paragraph', 'text_box', 'image', 'table']


@retry_if_cuda_oom
def get_masks(prediction):
    """Build one RLE string per category from a single image's predictions.

    Args:
        prediction: object exposing ``scores``, ``pred_masks`` and
            ``pred_classes`` tensors (presumably a detectron2 ``Instances``
            -- confirm against the caller).

    Returns:
        A list with one RLE string per entry of ``thing_classes``, in index
        order. All kept masks of a category are merged with a logical OR.
    """
    # Keep only detections whose score reaches the acceptance threshold.
    keep = prediction.scores >= ACCEPTANCE_THRESHOLD
    kept_masks = prediction.pred_masks[keep] != 0
    kept_classes = prediction.pred_classes[keep]

    def encode_category(cat):
        # Union of every kept instance mask predicted for this category.
        merged = torch.any(kept_masks[kept_classes == cat], dim=0)
        return rle_encode(merged.short().to("cpu").numpy())

    return [encode_category(cat) for cat in range(len(thing_classes))]

# Run Inference on Test Data

In [None]:
def run_inference(data):
    """Run the model on one batch and format the predictions as CSV rows.

    Relies on the module-level ``model``, ``get_masks`` and ``thing_classes``.

    Args:
        data: one batch from the test loader; a sequence of per-image dicts,
            each providing an ``'image_id'`` entry.

    Returns:
        A list of strings ``"<image_id>_<category index>,<rle>\\n"``, one per
        image and category, in batch order.
    """
    results = []
    with torch.no_grad():
        outputs = model(data)
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        # Pair every input with its output directly. The former trailing
        # `del outputs, output` raised UnboundLocalError when the model
        # returned no outputs, and freed nothing the return would not free.
        for sample, output in zip(data, outputs):
            rles = get_masks(output["instances"])
            results.extend(
                f"{sample['image_id']}_{cat},{rles[cat]}\n"
                for cat in range(len(thing_classes))
            )

    return results

In [None]:
import gc
from tqdm import tqdm

print("#### RUNNING INFERENCE ON TEST DATA ####")
# Release cached GPU memory and collectable Python objects before starting.
torch.cuda.empty_cache()
gc.collect()

# Flush buffered rows to disk roughly every 500 images; never let the
# interval reach 0 (which would make the modulo below divide by zero).
flush_every = max(1, 500 // BATCH)

results: list[str] = []
# The context manager guarantees the file is closed even if inference fails
# part-way, so rows already written are not lost in an unflushed handle.
with open("submission.csv", "w") as submission_file:
    submission_file.write("Id,Predicted\n")

    test_loader = instantiate(cfg.dataloader.test)
    for i, data in enumerate(tqdm(test_loader)):
        results.extend(run_inference(data))

        if i % flush_every == 0:
            print(f"Inference on batch {i}/{len(test_loader)} done")
            submission_file.writelines(results)
            results = []

    # Write whatever is left after the last periodic flush.
    submission_file.writelines(results)