In [18]:
from ultralytics import YOLOWorld
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch

import warnings

warnings.filterwarnings(action="ignore")
warnings.simplefilter(action="ignore")

In [19]:
# Load the YOLO-World checkpoint and move it to the GPU when one is available,
# otherwise keep it on the CPU.
target_device = torch.device(device="cuda" if torch.cuda.is_available() else "cpu")
yolo = YOLOWorld(model='../models/yolov8s-world.pt')
yolo = yolo.to(target_device)

# Positions of the modules of interest inside `yolo.model.model`.
key_layer_idx = dict(
    # Modules 2-9: same layout as plain YOLOv8 (original author's note).
    backbone_c2f1=2,
    backbone_c2f2=4,
    backbone_c2f3=6,
    backbone_c2f4=8,
    backbone_sppf=9,
    # Modules whose index changed relative to YOLOv8 (original author's note).
    neck_c2f1=15,
    neck_c2f2=19,
    neck_c2f3=22,
    detect_head=23,
)

# Resolve every index to the actual module object, keyed by the same name.
layers = {}
for layer_name, layer_idx in key_layer_idx.items():
    layers[layer_name] = yolo.model.model[layer_idx]

# The detection head is the module inspected in the following cells.
detect_head = layers['detect_head']

提取输出 (Extract outputs)

In [20]:
import os
from PIL import Image
from DyFilterAttack.analyzer.utils import SaveFeatures
# Image used for the single forward pass that is analysed below.
img_path = '../testset/bus.jpg'

# Attach feature-saving hooks to the detection head. With verbose=True every
# hooked submodule is printed under the 'detect_head' prefix.
save_feats = SaveFeatures()
save_feats.register_hooks(
    module=detect_head,
    parent_path='detect_head',
    verbose=True,
)

# Run inference once, then collect what the hooks recorded. Later cells index
# the returned mapping with keys such as 'detect_head.cv2.0'.
results = yolo.predict(img_path)
detect_head_raw_feats = save_feats.get_features()

# Head dimensions read from the module: nl is used below as the number of
# per-scale outputs, nc as the class-channel count, and reg_max * 4 as the
# box-channel count.
nl, nc, reg_max = detect_head.nl, detect_head.nc, detect_head.reg_max
no = nc + 4 * reg_max
# Sanity check: our channel arithmetic must agree with the head's own value.
assert no == detect_head.no

Registering Hook: detect_head.cv2
Registering Hook: detect_head.cv2.0
Registering Hook: detect_head.cv2.0.0
Registering Hook: detect_head.cv2.0.0.conv
Registering Hook: detect_head.cv2.0.0.bn
Registering Hook: detect_head.cv2.0.0.act
Registering Hook: detect_head.cv2.0.1
Registering Hook: detect_head.cv2.0.1.conv
Registering Hook: detect_head.cv2.0.1.bn
Registering Hook: detect_head.cv2.0.1.act
Registering Hook: detect_head.cv2.0.2
Registering Hook: detect_head.cv2.1
Registering Hook: detect_head.cv2.1.0
Registering Hook: detect_head.cv2.1.0.conv
Registering Hook: detect_head.cv2.1.0.bn
Registering Hook: detect_head.cv2.1.0.act
Registering Hook: detect_head.cv2.1.1
Registering Hook: detect_head.cv2.1.1.conv
Registering Hook: detect_head.cv2.1.1.bn
Registering Hook: detect_head.cv2.1.1.act
Registering Hook: detect_head.cv2.1.2
Registering Hook: detect_head.cv2.2
Registering Hook: detect_head.cv2.2.0
Registering Hook: detect_head.cv2.2.0.conv
Registering Hook: detect_head.cv2.2.0.bn
Regi

In [25]:
# Print and plot the detections of the first image in the batch (B = 0).
result = results[0]
for det in result.boxes:
    # Each `det` holds one box; xyxy[0] is its (xmin, ymin, xmax, ymax) row.
    xmin, ymin, xmax, ymax = det.xyxy[0]
    conf = det.conf  # Confidence (printed as a 1-element tensor)
    cls = det.cls  # Class ID
    # `.item()` may return a float; cast to int so the lookup in
    # `result.names` does not depend on float/int key equality.
    class_name = result.names[int(cls[0].item())]
    print(f"bbox: {xmin}, {ymin}, {xmax}, {ymax}, conf: {conf}, class: {class_name}")

# Reverse the channel axis of the plotted array before handing it to PIL
# (presumably BGR -> RGB -- confirm against the ultralytics plot() docs).
image = Image.fromarray(result.plot()[:, :, ::-1])
image.show()
# Image.save does not create missing directories, so make sure the output
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('result', exist_ok=True)
image.save('result/bus_result.jpg')

bbox: 49.767730712890625, 400.7535400390625, 248.6864013671875, 903.1759033203125, conf: tensor([0.9302], device='cuda:0'), class: person
bbox: 222.7787322998047, 405.39019775390625, 344.40283203125, 858.6336059570312, conf: tensor([0.9239], device='cuda:0'), class: person
bbox: 669.021240234375, 388.7566833496094, 810.0, 875.3314819335938, conf: tensor([0.9147], device='cuda:0'), class: person
bbox: 2.825357437133789, 228.2903289794922, 806.0357055664062, 738.6715087890625, conf: tensor([0.8791], device='cuda:0'), class: bus
bbox: 0.0, 440.74639892578125, 75.3588638305664, 1035.9183349609375, conf: tensor([0.6437], device='cuda:0'), class: person
bbox: 0.0, 251.178466796875, 32.303497314453125, 325.3988952636719, conf: tensor([0.4908], device='cuda:0'), class: stop sign
bbox: 0.0, 479.6157531738281, 77.19075775146484, 877.2371826171875, conf: tensor([0.4771], device='cuda:0'), class: person


In [22]:
# Stage 1: what the detection head produces for every scale i in range(nl)
#   text                      -> (B, nc, embed_dim)
#   image                     -> (B, embed_dim, H, W)
#   cv4 contrast(image, text) -> (B, nc, H, W)
#   cv2(image)                -> (B, reg_max * 4, H, W)
#   cat(cv2, cv4)             -> (B, reg_max * 4 + nc, H, W) = (B, no, H, W)
#   x[i] = cat result of scale i

# Hooked outputs of the box branch (cv2) and the class branch (cv4), one
# tensor per scale.
cv2_raw_feats = [detect_head_raw_feats[f'detect_head.cv2.{i}'] for i in range(nl)]
cv4_raw_feats = [detect_head_raw_feats[f'detect_head.cv4.{i}'] for i in range(nl)]

# Report the shape of every scale instead of hard-coding scales 0..2.
for name, feats in (('cv2_raw_feats', cv2_raw_feats), ('cv4_raw_feats', cv4_raw_feats)):
    for i, feat in enumerate(feats):
        print(f'{name} {i}: {feat.size()}')

# Stage 2: reproduce the head's inference post-processing by hand
#   flatten x[i]  -> (B, no, H * W)
#   cat over i    -> (B, no, H0 * W0 + H1 * W1 + H2 * W2)
#   split         -> box (B, 4 * reg_max, N) and cls logits (B, nc, N)
#   decode(box)   -> dbox
#   sigmoid(cls)  -> class probabilities
#   y             -> cat(dbox, sigmoid(cls))

# Box channels first, class channels second, matching the slicing below.
cat_raw_feats = [torch.cat((cv2_raw_feats[i], cv4_raw_feats[i]), 1) for i in range(nl)]
batch_size = cat_raw_feats[0].shape[0]
flatten_raw_feats = torch.cat([feat.view(batch_size, no, -1) for feat in cat_raw_feats], 2)
raw_box = flatten_raw_feats[:, : reg_max * 4]
raw_cls = flatten_raw_feats[:, reg_max * 4 :]

# The DFL output was captured by the hooks; decode it with the anchors and
# strides the head currently holds, then scale to pixel units.
dfl_feats = detect_head_raw_feats['detect_head.dfl']
dbox = detect_head.decode_bboxes(dfl_feats, detect_head.anchors.unsqueeze(0)) * detect_head.strides
# ! Attention: we need cls(logit) as y_det
logit_cls = raw_cls
sigmoid_cls = logit_cls.sigmoid()

# N = H0 * W0 + H1 * W1 + H2 * W2 in the shape notes below.
for name, tensor in (
    ('cat_raw_feat', cat_raw_feats[0]),        # (B, no, H0, W0) -- still 4-D
    ('flatten_raw_feats', flatten_raw_feats),  # (B, no, N)
    ('raw_box', raw_box),                      # (B, 4 * reg_max, N)
    ('raw_cls', raw_cls),                      # (B, nc, N)
    ('dbox', dbox),                            # (B, 4, N)
    ('logit_cls', logit_cls),                  # (B, nc, N)
    ('sigmoid_cls', sigmoid_cls),              # (B, nc, N)
):
    print(f'{name}: {tensor.size()}')

cv2_raw_feats 0: torch.Size([1, 64, 80, 60])
cv2_raw_feats 1: torch.Size([1, 64, 40, 30])
cv2_raw_feats 2: torch.Size([1, 64, 20, 15])
cv4_raw_feats 0: torch.Size([1, 80, 80, 60])
cv4_raw_feats 1: torch.Size([1, 80, 40, 30])
cv4_raw_feats 2: torch.Size([1, 80, 20, 15])
cat_raw_feat: torch.Size([1, 144, 80, 60])
flatten_raw_feats: torch.Size([1, 144, 6300])
raw_box: torch.Size([1, 64, 6300])
raw_cls: torch.Size([1, 80, 6300])
dbox: torch.Size([1, 4, 6300])
logit_cls: torch.Size([1, 80, 6300])
sigmoid_cls: torch.Size([1, 80, 6300])


In [23]:
from ultralytics.utils import ops

# Re-run the predictor's NMS step on the hand-built predictions so that the
# anchor index of every surviving box can be recovered.
predictor = yolo.predictor
preds = torch.cat([dbox, sigmoid_cls], 1)  # (B, 4+nc, N)

detections = ops.non_max_suppression(
    preds,
    predictor.args.conf,
    predictor.args.iou,
    predictor.args.classes,
    predictor.args.agnostic_nms,
    # Must be passed by keyword: the 6th positional parameter of
    # non_max_suppression is `multi_label`, not `max_det`.
    max_det=predictor.args.max_det,
    nc=0 if predictor.args.task == "detect" else len(predictor.model.names),
    end2end=getattr(predictor.model, "end2end", False),
    rotated=predictor.args.task == "obb",
    # A plain boolean flag; previously the SaveFeatures object was passed here
    # and only worked because it is truthy.
    return_idxs=True,
)

# With return_idxs=True the result is a pair: (per-image detections,
# per-image indices of the kept predictions along the N axis).
print(detections)

([tensor([[ 2.9668e+01,  2.3726e+02,  1.4726e+02,  5.3481e+02,  9.3635e-01,  0.0000e+00],
        [ 1.3185e+02,  2.4041e+02,  2.0401e+02,  5.0898e+02,  9.2702e-01,  0.0000e+00],
        [ 3.9655e+02,  2.3079e+02,  4.8004e+02,  5.1880e+02,  9.1810e-01,  0.0000e+00],
        [ 3.3193e+00,  1.3573e+02,  4.7717e+02,  4.3767e+02,  9.0198e-01,  5.0000e+00],
        [-1.8368e-02,  1.5010e+02,  1.9577e+01,  1.9234e+02,  6.9726e-01,  1.1000e+01],
        [-1.9777e-01,  2.6537e+02,  4.4830e+01,  6.1879e+02,  6.4389e-01,  0.0000e+00],
        [-9.2316e-03,  2.9058e+02,  4.5960e+01,  5.1933e+02,  4.7633e-01,  0.0000e+00]], device='cuda:0')], [tensor([6197, 6185, 6223, 6126, 5100, 6165, 5610], device='cuda:0')])
