In [2]:
from ultralytics import YOLOWorld
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch

import warnings

warnings.filterwarnings(action="ignore")
warnings.simplefilter(action="ignore")

In [3]:
# Load the YOLO-World checkpoint and move it to the GPU when one is available,
# otherwise keep it on the CPU.
yolo = YOLOWorld(model='../models/yolov8s-world.pt').to(
    torch.device(device="cuda" if torch.cuda.is_available() else "cpu")
)

# Position of each layer of interest inside `yolo.model.model`.
key_layer_idx = {
    # Backbone: modules 2-9 are the same as in plain YOLOv8 (per the original note).
    "backbone_c2f1": 2, "backbone_c2f2": 4, "backbone_c2f3": 6, "backbone_c2f4": 8,
    "backbone_sppf": 9,
    # Neck and head: modules that differ from plain YOLOv8 (per the original note).
    "neck_c2f1": 15, "neck_c2f2": 19, "neck_c2f3": 22,
    "detect_head": 23,
}

# Resolve every index to the actual module object, keyed by the same names.
layers = {name: yolo.model.model[position] for name, position in key_layer_idx.items()}
detect_head = layers['detect_head']

提取输出

In [4]:
import os
from PIL import Image
from DyFilterAttack.analyzer.utils import SaveFeatures
# Attach feature-recording hooks to the detect head (the verbose log lists one
# hook per sub-module), run a single prediction on the test image, then fetch
# whatever the hooks recorded during that forward pass.
img_path = '../testset/bus.jpg'
save_feats = SaveFeatures()
save_feats.register_hooks(module=detect_head, parent_path='detect_head', verbose=True)
results = yolo.predict(img_path)
detect_head_raw_feats = save_feats.get_features()

# Head geometry read from the module itself:
#   nl      - number of feature levels (used below as range(nl))
#   nc      - number of class channels
#   reg_max - bins per box side, so a box occupies 4 * reg_max channels
#   no      - channels per anchor = box channels + class channels
nl = detect_head.nl
nc = detect_head.nc
reg_max = detect_head.reg_max
no = 4 * reg_max + nc
assert detect_head.no == no

Registering Hook: detect_head.cv2
Registering Hook: detect_head.cv2.0
Registering Hook: detect_head.cv2.0.0
Registering Hook: detect_head.cv2.0.0.conv
Registering Hook: detect_head.cv2.0.0.bn
Registering Hook: detect_head.cv2.0.0.act
Registering Hook: detect_head.cv2.0.1
Registering Hook: detect_head.cv2.0.1.conv
Registering Hook: detect_head.cv2.0.1.bn
Registering Hook: detect_head.cv2.0.1.act
Registering Hook: detect_head.cv2.0.2
Registering Hook: detect_head.cv2.1
Registering Hook: detect_head.cv2.1.0
Registering Hook: detect_head.cv2.1.0.conv
Registering Hook: detect_head.cv2.1.0.bn
Registering Hook: detect_head.cv2.1.0.act
Registering Hook: detect_head.cv2.1.1
Registering Hook: detect_head.cv2.1.1.conv
Registering Hook: detect_head.cv2.1.1.bn
Registering Hook: detect_head.cv2.1.1.act
Registering Hook: detect_head.cv2.1.2
Registering Hook: detect_head.cv2.2
Registering Hook: detect_head.cv2.2.0
Registering Hook: detect_head.cv2.2.0.conv
Registering Hook: detect_head.cv2.2.0.bn
Regi

In [5]:
# Print and plot the detections of the first image in the batch (B=0).
result = results[0]
for det in result.boxes:
    # Each `det` wraps exactly one box; convert the one-element tensors to
    # plain Python numbers so the log is free of tensor/device reprs.
    xmin, ymin, xmax, ymax = det.xyxy[0].tolist()
    conf = det.conf.item()      # confidence as a float
    cls = int(det.cls.item())   # class id as an int, a valid key of result.names
    class_name = result.names[cls]
    print(f"bbox: {xmin}, {ymin}, {xmax}, {ymax}, conf: {conf:.4f}, class: {class_name}")

# result.plot() is reversed along the channel axis before being handed to PIL
# (presumably BGR -> RGB - confirm against the ultralytics docs).
image = Image.fromarray(result.plot()[:, :, ::-1])
image.show()
# Create the output folder first: PIL does not create missing directories and
# would otherwise raise FileNotFoundError on a fresh checkout.
os.makedirs('result', exist_ok=True)
image.save('result/bus_result.jpg')

bbox: 50.064640045166016, 400.375, 248.49916076660156, 902.4866333007812, conf: tensor([0.9363], device='cuda:0'), class: person
bbox: 222.50120544433594, 405.6936950683594, 344.2733154296875, 858.9097900390625, conf: tensor([0.9270], device='cuda:0'), class: person
bbox: 669.1835327148438, 389.4619140625, 810.0, 875.4700317382812, conf: tensor([0.9181], device='cuda:0'), class: person
bbox: 5.60137939453125, 229.0391387939453, 805.223388671875, 738.57177734375, conf: tensor([0.9020], device='cuda:0'), class: bus
bbox: 0.0, 253.29833984375, 33.03541946411133, 324.5682373046875, conf: tensor([0.6973], device='cuda:0'), class: stop sign
bbox: 0.0, 447.8066101074219, 75.65015411376953, 1044.215087890625, conf: tensor([0.6439], device='cuda:0'), class: person
bbox: 0.0, 490.36016845703125, 77.55752563476562, 876.3672485351562, conf: tensor([0.4763], device='cuda:0'), class: person


In [6]:
# Stage 1 - collect the per-level outputs of the two head branches.
# Shapes below follow the author's notes; the cv2 / cv4 ones are confirmed by
# the sizes printed by this cell.
#   text embeddings             -> (B, nc, embed_dim)
#   image features              -> (B, embed_dim, H, W)
#   cv4: contrast(image, text)  -> (B, nc, H, W)
#   cv2(image)                  -> (B, reg_max * 4, H, W)
#   cat(cv2, cv4)               -> (B, reg_max * 4 + nc, H, W) == (B, no, H, W)
#   x[i] is that concatenation for level i, i = 0 .. nl - 1

cv2_raw_feats = [detect_head_raw_feats[f'detect_head.cv2.{level}'] for level in range(nl)]
cv4_raw_feats = [detect_head_raw_feats[f'detect_head.cv4.{level}'] for level in range(nl)]

for branch_name, branch_feats in (('cv2_raw_feats', cv2_raw_feats), ('cv4_raw_feats', cv4_raw_feats)):
    for level in (0, 1, 2):
        print(f'{branch_name} {level}: {branch_feats[level].size()}')

# Stage 2 - replay the head's inference-time post-processing by hand.
#   flatten each x[i]   -> (B, no, Hi * Wi)
#   concatenate levels  -> (B, no, N) with N = H0 * W0 + H1 * W1 + H2 * W2
#   split channels      -> box (B, 4 * reg_max, N) and class logits (B, nc, N)
#   decode(box)         -> dbox
#   sigmoid(logits)     -> class probabilities
#   y = cat(dbox, probabilities)

cat_raw_feats = [
    torch.cat((box_feat, cls_feat), 1)
    for box_feat, cls_feat in zip(cv2_raw_feats, cv4_raw_feats)
]
flatten_raw_feats = torch.cat(
    [level_feat.view(cat_raw_feats[0].shape[0], no, -1) for level_feat in cat_raw_feats],
    2,
)
# Box channels come first, class channels second (same order as the cat above).
raw_box, raw_cls = flatten_raw_feats.split((reg_max * 4, nc), 1)

# The hooked DFL output is decoded against the head's anchors and scaled by
# its strides to obtain the boxes.
dfl_feats = detect_head_raw_feats['detect_head.dfl']
dbox = detect_head.decode_bboxes(dfl_feats, detect_head.anchors.unsqueeze(0)) * detect_head.strides
# ! Attention: the class *logits* (pre-sigmoid) are what is used as y_det later.
logit_cls = raw_cls
sigmoid_cls = torch.sigmoid(logit_cls)

for label, tensor in (
    ('cat_raw_feat', cat_raw_feats[0]),         # (B, no, H0, W0)
    ('flatten_raw_feats', flatten_raw_feats),   # (B, no, N)
    ('raw_box', raw_box),                       # (B, 4 * reg_max, N)
    ('raw_cls', raw_cls),                       # (B, nc, N)
    ('dbox', dbox),                             # (B, 4, N)
    ('logit_cls', logit_cls),                   # (B, nc, N)
    ('sigmoid_cls', sigmoid_cls),               # (B, nc, N)
):
    print(f'{label}: {tensor.size()}')

cv2_raw_feats 0: torch.Size([1, 64, 80, 60])
cv2_raw_feats 1: torch.Size([1, 64, 40, 30])
cv2_raw_feats 2: torch.Size([1, 64, 20, 15])
cv4_raw_feats 0: torch.Size([1, 80, 80, 60])
cv4_raw_feats 1: torch.Size([1, 80, 40, 30])
cv4_raw_feats 2: torch.Size([1, 80, 20, 15])
cat_raw_feat: torch.Size([1, 144, 80, 60])
flatten_raw_feats: torch.Size([1, 144, 6300])
raw_box: torch.Size([1, 64, 6300])
raw_cls: torch.Size([1, 80, 6300])
dbox: torch.Size([1, 4, 6300])
logit_cls: torch.Size([1, 80, 6300])
sigmoid_cls: torch.Size([1, 80, 6300])


In [77]:
# process3 (construct y_det_orig and y_det_target)
# Re-run non-max suppression on the reconstructed predictions, asking it to
# also return the indices of the anchors it keeps, so the matching class
# logits can be looked up afterwards.
from ultralytics.utils import ops

predictor = yolo.predictor
preds = torch.cat([dbox, sigmoid_cls], 1)  # (B, 4+nc, N)

detections, keep_idxs = ops.non_max_suppression(
    preds,
    predictor.args.conf,
    predictor.args.iou,
    predictor.args.classes,
    predictor.args.agnostic_nms,
    # Must be a keyword: the 6th positional parameter of non_max_suppression
    # is `multi_label`, so a positional max_det would silently toggle
    # multi-label mode and leave max_det at its default.
    max_det=predictor.args.max_det,
    nc=0 if predictor.args.task == "detect" else len(predictor.model.names),
    end2end=getattr(predictor.model, "end2end", False),
    rotated=predictor.args.task == "obb",
    return_idxs=True,
)

# Number of detections NMS kept for each image, and the largest such count.
num_nms_output = [idx.numel() for idx in keep_idxs]
max_out = max(num_nms_output)

# y_det[b, :, j] holds the class logits of the j-th kept anchor of image b.
# Images with fewer than max_out detections keep zeros in the trailing
# columns; use num_nms_output to mask them before consuming the results.
y_det = raw_cls.new_zeros(raw_cls.shape[0], raw_cls.shape[1], max_out)
for b, idx in enumerate(keep_idxs):
    num_kept = idx.numel()
    if num_kept > 0:
        # Index image b only; slicing the whole batch here works solely for B == 1.
        kept_logits = raw_cls[b][:, idx.reshape(-1)]  # (nc, num_kept)
        y_det[b, :, :num_kept] = kept_logits
        # Sanity check: the best class probability of every kept anchor must
        # match the confidence NMS reported for the same detection (column 4).
        assert torch.allclose(kept_logits.sigmoid().amax(dim=0), detections[b][:, 4], atol=1e-4)

# Highest-scoring class per detection and its logit.
# gather() picks per (image, detection) and is correct for any batch size.
first_max_cls_idx = torch.argmax(y_det, dim=1)  # (B, max_out)
y_det_orig = y_det.gather(1, first_max_cls_idx.unsqueeze(1)).squeeze(1)  # (B, max_out)

# Runner-up class per detection and its logit.
_, topk_indices = torch.topk(y_det, 2, dim=1)
second_max_cls_idx = topk_indices[:, 1]  # (B, max_out)
y_det_target = y_det.gather(1, second_max_cls_idx.unsqueeze(1)).squeeze(1)  # (B, max_out)

print(f'y_det_orig_idx(shape)  : {first_max_cls_idx.shape}')
print(f'y_det_orig(shape)      : {y_det_orig.shape}')
print(f'y_det_target_idx(shape): {second_max_cls_idx.shape}')
print(f'y_det_target(shape)    : {y_det_target.shape}')
print(f'y_det_orig_idx  : {first_max_cls_idx}')
print(f'y_det_orig      : {y_det_orig}')
print(f'y_det_target_idx: {second_max_cls_idx}')
print(f'y_det_target    : {y_det_target}')

y_det_orig_idx(shape)  : torch.Size([1, 7])
y_det_orig(shape)      : torch.Size([1, 7])
y_det_target_idx(shape): torch.Size([1, 7])
y_det_target(shape)    : torch.Size([1, 7])
y_det_orig_idx  : tensor([[ 0,  0,  0,  5, 11,  0,  0]], device='cuda:0')
y_det_orig      : tensor([[ 2.6885,  2.5418,  2.4168,  2.2194,  0.8343,  0.5923, -0.0947]], device='cuda:0')
y_det_target_idx: tensor([[77, 77, 16,  7, 25, 77, 77]], device='cuda:0')
y_det_target    : tensor([[-8.1884, -9.1895, -9.1035, -3.3859, -4.7500, -6.7822, -4.1971]], device='cuda:0')
