credit: https://github.com/airsplay/py-bottom-up-attention

In [None]:
%%capture
# The %%capture cell magic above hides all output produced by this cell.
# Clone the py-bottom-up-attention repository and make it the working directory.
!git clone https://github.com/airsplay/py-bottom-up-attention.git
%cd py-bottom-up-attention

# Install python libraries
!pip install -r requirements.txt
!pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'

# Install detectron2
!python setup.py build develop

# or if you are on macOS
# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop

# or, as an alternative to `setup.py`, do
# pip install [--editable] .
# NOTE(review): imagesize is force-reinstalled without a version pin; the
# reason is not visible in this notebook -- confirm it is still required.
!pip install --upgrade --force-reinstall  imagesize

In [None]:
from glob import glob
import os
import io
import json
import detectron2
from tqdm.notebook import tqdm

# import some common detectron2 utilities
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

# import some common libraries
import numpy as np
import cv2
import torch

# Show the image in ipynb
from IPython.display import clear_output, Image, display
import PIL.Image
def showarray(a, fmt='jpeg'):
    """Display an image array inline in the notebook.

    The values of ``a`` are clipped to the range [0, 255] and cast to uint8,
    encoded in memory with PIL using the format named by ``fmt``, and the
    encoded bytes are handed to IPython's ``display``.
    """
    clipped = np.clip(a, 0, 255)
    buffer = io.BytesIO()
    encoded_source = PIL.Image.fromarray(np.uint8(clipped))
    encoded_source.save(buffer, fmt)
    display(Image(data=buffer.getvalue()))

In [None]:
# Move into the repository's demo directory: the relative paths used by later
# cells ('data/genome/...' and '../configs/...') are written relative to it.
%cd demo
!ls

In [None]:
# Load VG Classes
data_path = 'data/genome/1600-400-20'

# One class per line of the vocab file: keep only the text before the first
# comma, lower-cased and stripped of surrounding whitespace.
# (The loop variable is deliberately not called `object`, which would rebind
# the builtin for every later cell of the notebook.)
with open(os.path.join(data_path, 'objects_vocab.txt')) as f:
    vg_classes = [line.split(',')[0].lower().strip() for line in f]

# Register the class names so the Visualizer can label predictions.
MetadataCatalog.get("vg").thing_classes = vg_classes

In [None]:
# Build the predictor from the VG-Detection Faster R-CNN R-101 C4 (caffe) config.
cfg = get_cfg()
cfg.merge_from_file("../configs/VG-Detection/faster_rcnn_R_101_C4_caffe.yaml")

# Test-time overrides: proposals kept after RPN NMS, and the RoI-head NMS and
# score thresholds. The aliases point at the same nested config nodes as `cfg`.
rpn_cfg = cfg.MODEL.RPN
roi_heads_cfg = cfg.MODEL.ROI_HEADS
rpn_cfg.POST_NMS_TOPK_TEST = 300
roi_heads_cfg.NMS_THRESH_TEST = 0.6
roi_heads_cfg.SCORE_THRESH_TEST = 0.2

# VG Weight
# NOTE(review): the checkpoint is a pickle fetched over plain http -- confirm
# the source is trusted.
cfg.MODEL.WEIGHTS = "http://nlp.cs.unc.edu/models/faster_rcnn_from_caffe.pkl"
predictor = DefaultPredictor(cfg)

In [None]:
# This cell repeats the predictor setup. Constructing another DefaultPredictor
# loads the checkpoint a second time, so the model is only built here when no
# `predictor` exists yet in this kernel (e.g. when this cell is run on its own).
if "predictor" not in globals():
    cfg = get_cfg()
    cfg.merge_from_file("../configs/VG-Detection/faster_rcnn_R_101_C4_caffe.yaml")
    cfg.MODEL.RPN.POST_NMS_TOPK_TEST = 300
    cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.6
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2
    # VG Weight
    cfg.MODEL.WEIGHTS = "http://nlp.cs.unc.edu/models/faster_rcnn_from_caffe.pkl"
    predictor = DefaultPredictor(cfg)

In [None]:
# glob() returns paths in arbitrary order; sorting makes the processing order
# (and therefore the key order of the saved JSON and the image shown by the
# visualization cell) the same on every run.
im_paths = sorted(glob('/kaggle/input/simmc-img/data/all_images/*.png'))

In [None]:
# In the visible part of this notebook this constant is only referenced by the
# commented-out verification cell at the end.
NUM_OBJECTS = 10000

from torch import nn

from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs, fast_rcnn_inference_single_image
from detectron2.structures.boxes import Boxes
from detectron2.structures.instances import Instances

def doit(raw_image, raw_boxes):
    """Compute pooled RoI features and class predictions for given boxes.

    Uses the notebook-level ``predictor``: the image is resized with the
    predictor's transform, passed through the model backbone, and the supplied
    boxes (instead of RPN proposals) are fed to the RoI head.

    Args:
        raw_image: image array whose first two dimensions are (height, width)
            and whose last dimension is the channel axis (it is transposed to
            channel-first before being fed to the model). In this notebook it
            is the array returned by ``cv2.imread``.
        raw_boxes: NumPy array with one box per row, in pixel coordinates of
            ``raw_image``. In this notebook rows are ``[x0, y0, x1, y1]``.

    Returns:
        ``(instances, roi_features)`` where ``instances`` is a detectron2
        ``Instances`` holding the unscaled input boxes together with the
        highest-probability class and its score per box, and ``roi_features``
        is the RoI-head output averaged over its two spatial dimensions, one
        row per box, on the model's device.
    """
    # Follow whatever device the model lives on instead of assuming CUDA, so
    # the function also works on CPU-only kernels.
    device = next(predictor.model.parameters()).device

    # Process Boxes
    raw_boxes = Boxes(torch.from_numpy(raw_boxes).to(device))

    with torch.no_grad():
        raw_height, raw_width = raw_image.shape[:2]

        # Preprocessing
        image = predictor.transform_gen.get_transform(raw_image).apply_image(raw_image)

        # Scale the boxes by the same factors the image was resized with.
        new_height, new_width = image.shape[:2]
        scale_x = 1. * new_width / raw_width
        scale_y = 1. * new_height / raw_height
        boxes = raw_boxes.clone()  # scale() works in place; keep raw_boxes intact
        boxes.scale(scale_x=scale_x, scale_y=scale_y)

        # Channel-first float tensor, wrapped in the model's input format.
        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
        inputs = [{"image": image, "height": raw_height, "width": raw_width}]
        images = predictor.model.preprocess_image(inputs)

        # Run Backbone Res1-Res4
        features = predictor.model.backbone(images.tensor)

        # Run RoI head for each proposal (RoI Pooling + Res5)
        proposal_boxes = [boxes]
        features = [features[f] for f in predictor.model.roi_heads.in_features]
        box_features = predictor.model.roi_heads._shared_roi_transform(
            features, proposal_boxes
        )
        feature_pooled = box_features.mean(dim=[2, 3])  # pooled to 1x1

        # Predict classes and boxes for each proposal.
        pred_class_logits, pred_proposal_deltas = predictor.model.roi_heads.box_predictor(feature_pooled)
        pred_class_prob = nn.functional.softmax(pred_class_logits, -1)
        # The last column is excluded before taking the best class per box.
        pred_scores, pred_classes = pred_class_prob[..., :-1].max(-1)

        # Detectron2 Formatting (for visualization only); boxes are reported
        # in the original, unscaled image coordinates.
        instances = Instances(
            image_size=(raw_height, raw_width),
            pred_boxes=raw_boxes,
            scores=pred_scores,
            pred_classes=pred_classes
        )

        return instances, feature_pooled

def read_data(im_path, m=False,
              scene_dir='/kaggle/input/simmc-img/data/simmc2_scene_jsons_dstc10_public/public'):
    """Load an image and the boxes listed in its scene JSON.

    Args:
        im_path: path of the image file. The scene name is the file's base
            name up to its first '.'.
        m: when true, read ``m_<name>_scene.json`` instead of
            ``<name>_scene.json``.
        scene_dir: directory holding the scene JSON files. Defaults to the
            Kaggle dataset location used by this notebook.

    Returns:
        ``None`` when the image cannot be read or the scene JSON cannot be
        opened or parsed. Otherwise ``(im, boxes, indices, name)``: the image
        as returned by ``cv2.imread``, an array of ``[x0, y0, x1, y1]`` rows
        (one per object of the first scene, followed by one box covering the
        whole image), the matching object indices followed by ``'scene'``,
        and the scene name.
    """
    # cv2.imread reports an unreadable file by returning None, so test for
    # that explicitly instead of hiding every possible error.
    try:
        im = cv2.imread(im_path)
    except cv2.error:
        return None
    if im is None:
        return None

    h, w = im.shape[0], im.shape[1]
    name = os.path.basename(im_path).split('.')[0]

    prefix = 'm_' if m else ''
    scene_file = os.path.join(scene_dir, f'{prefix}{name}_scene.json')
    try:
        with open(scene_file) as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file, or content that is not valid JSON.
        return None

    given_boxes = []
    indices = []
    for obj in data['scenes'][0]['objects']:
        bbox = obj['bbox']
        # bbox[3] is added to x and bbox[2] to y, i.e. the layout is treated
        # as [x, y, height, width] -- verify against the dataset spec.
        x0, y0 = bbox[0], bbox[1]
        given_boxes.append([x0, y0, x0 + bbox[3], y0 + bbox[2]])
        indices.append(obj['index'])

    # Final box spans the full image.
    given_boxes.append([0, 0, w, h])
    indices.append('scene')
    return im, np.array(given_boxes), indices, name

# instances, features = doit(im, given_boxes)

In [None]:
# Extract one pooled feature vector per annotated box (plus the whole-image
# 'scene' box) for every image, once against the plain scene JSON files and
# once against the 'm_'-prefixed ones. Images for which read_data() returns
# None are skipped.
without_m = {}
with_m = {}
for feature_store, use_m in ((without_m, False), (with_m, True)):
    for im_path in tqdm(im_paths):
        data = read_data(im_path, m=use_m)
        if data is None:
            continue
        # `im` and `instances` are intentionally left at notebook scope: the
        # visualization cell further down displays the last processed image.
        im, given_boxes, indices, name = data
        instances, features = doit(im, given_boxes)
        features = features.cpu().tolist()
        # Rows of `features` are in the same order as `indices`.
        feature_store[name] = dict(zip(indices, features))

In [None]:
# Number of scenes for which features were stored in each dictionary.
len(without_m), len(with_m)

In [None]:
%cd /kaggle/working
!rm -r *

In [None]:
# List the current directory (presumably to confirm it was emptied by the previous cell).
!ls

In [None]:
# Write each feature dictionary to its own JSON file in the current directory.
for out_name, payload in (('without_m.json', without_m), ('with_m.json', with_m)):
    with open(out_name, 'w') as f:
        json.dump(payload, f)

In [None]:
# Show the boxes, labels, and features
# `instances` and `im` are the values left behind by the last iteration of the
# feature-extraction loop, so this renders the last image that was processed.
cpu_instances = instances.to('cpu')
visualizer = Visualizer(im[:, :, :], MetadataCatalog.get("vg"), scale=1.2)
rendered = visualizer.draw_instance_predictions(cpu_instances)
# Reverse the channel axis of the rendered image before displaying it.
showarray(rendered.get_image()[:, :, ::-1])
for heading, value in (('instances:\n', instances), ('boxes:\n', instances.pred_boxes)):
    print(heading, value)
    print()
# print('Shape of features:\n', features.shape)

In [None]:
# # Verify the correspondence of RoI features
# pred_class_logits, pred_proposal_deltas = predictor.model.roi_heads.box_predictor(features)
# pred_class_probs = torch.nn.functional.softmax(pred_class_logits, -1)[:, :-1]
# max_probs, max_classes = pred_class_probs.max(-1)
# print("%d objects are different, it is because the classes-aware NMS process" % (NUM_OBJECTS - torch.eq(instances.pred_classes, max_classes).sum().item()))
# print("The total difference of score is %0.4f" % (instances.scores - max_probs).abs().sum().item())