In [11]:
from ultralytics import YOLOWorld
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch

import warnings

warnings.filterwarnings(action="ignore")
warnings.simplefilter(action="ignore")

In [None]:
yolo = YOLOWorld(model='../models/yolov8s-world.pt')
yolo = yolo.to(torch.device(device="cuda" if torch.cuda.is_available() else "cpu"))
key_layer_idx = {
    # module 2-9 same as yolov8 
    "backbone_c2f1": 2,
    "backbone_c2f2": 4,
    "backbone_c2f3": 6,
    "backbone_c2f4": 8, 
    "backbone_sppf": 9,
    # module changed
    "neck_c2f1": 15,
    "neck_c2f2": 19,
    "neck_c2f3": 22,
    "detect_head": 23
}
layers = {layer: yolo.model.model[idx] for layer, idx in key_layer_idx.items()}
detect_head = layers['detect_head']

WorldDetect(
  (cv2): ModuleList(
    (0): Sequential(
      (0): Conv(
        (conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
    )
    (1): Sequential(
      (0): Conv(
        (conv): Conv2d(256, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)


提取输出

In [None]:
from DyFilterAttack.analyzer.utils import SaveFeatures
save_feats = SaveFeatures()
save_feats.register_hooks(module=backbone_c2f1, parent_path='backbone_c2f1', verbose=True)
img_path = '../testset/bus.jpg'
yolo.predict(img_path)
detect_head_raw_feats = save_feats.get_features()
# print(detect_head_raw_feats['detect_head.cv4.1'].size())

nl = detect_head.nl
nc, reg_max =  detect_head.nc, detect_head.reg_max
no = nc + 4 * reg_max
assert no == detect_head.no

Registering Hook: backbone_c2f1.cv1
Registering Hook: backbone_c2f1.cv1.conv
Registering Hook: backbone_c2f1.cv1.bn
Registering Hook: backbone_c2f1.cv1.act
Registering Hook: backbone_c2f1.cv2
Registering Hook: backbone_c2f1.cv2.conv
Registering Hook: backbone_c2f1.cv2.bn
Registering Hook: backbone_c2f1.cv2.act
Registering Hook: backbone_c2f1.m
Registering Hook: backbone_c2f1.m.0
Registering Hook: backbone_c2f1.m.0.cv1
Registering Hook: backbone_c2f1.m.0.cv1.conv
Registering Hook: backbone_c2f1.m.0.cv1.bn
Registering Hook: backbone_c2f1.m.0.cv1.act
Registering Hook: backbone_c2f1.m.0.cv2
Registering Hook: backbone_c2f1.m.0.cv2.conv
Registering Hook: backbone_c2f1.m.0.cv2.bn
Registering Hook: backbone_c2f1.m.0.cv2.act

image 1/1 e:\bmx\DyFilterAttack\DyFilterAttack\analyzer\..\testset\bus.jpg: 640x480 5 persons, 1 bus, 1 stop sign, 20.7ms
Speed: 2.0ms preprocess, 20.7ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 480)


In [14]:
# process1 
# text -> (B, nc, embed_dim)
# image -> (B, embed_dim, H, W)
# cv4 contrast(iamge, text) -> (B, nc, H, W)
# cv2(image) -> (B, reg_max * 4, H, W)
# cat_result -> (B, nc + reg_max * 4, H ,W) -> (B, no, H ,W)
# x[i] -> cat_result[i] (i = 1, 2, nl)

cv2_raw_feats = [detect_head_raw_feats[f'detect_head.cv2.{i}'] for i in range(nl)]
cv4_raw_feats = [detect_head_raw_feats[f'detect_head.cv4.{i}'] for i in range(nl)]

print(f'cv2_raw_feats {0}: {cv2_raw_feats[0].size()}')
print(f'cv2_raw_feats {1}: {cv2_raw_feats[1].size()}')
print(f'cv2_raw_feats {2}: {cv2_raw_feats[2].size()}')
print(f'cv4_raw_feats {0}: {cv4_raw_feats[0].size()}')
print(f'cv4_raw_feats {1}: {cv4_raw_feats[1].size()}')
print(f'cv4_raw_feats {2}: {cv4_raw_feats[2].size()}')

# process2 (_inference)
# flat(x[i]) -> (B, no, H * W)
# cat(x) -> (B, C, H0 * W0 + H1 * W1 + H2 * W2)
# split(x) -> bbox(B, 4 * reg_max, H0 * W0 + H1 * W1 + H2 * W2), cls(logit)(B, nc, H0 * W0 + H1 * W1 + H2 * W2)
# docode(bbox) -> dbox
# logit(cls) -> sigmoid(cls)
# y -> cat(dbox, cls)

cat_raw_feats = [torch.cat((cv2_raw_feats[i], cv4_raw_feats[i]), 1) for i in range(nl)]
flatten_raw_feats = torch.cat([cat_raw_feat.view(cat_raw_feats[0].shape[0], no, -1) for cat_raw_feat in cat_raw_feats], 2)
raw_box = flatten_raw_feats[:, : reg_max * 4]
raw_cls = flatten_raw_feats[:, reg_max * 4 :]

dfl_feats = detect_head_raw_feats['detect_head.dfl']
dbox = detect_head.decode_bboxes(dfl_feats, detect_head.anchors.unsqueeze(0)) * detect_head.strides
# ! Attention: we need cls(logit) as y_det
logit_cls = raw_cls
sigmoid_cls = logit_cls.sigmoid()

print(f'cat_raw_feat: {cat_raw_feats[0].size()}')           # (B, C, H * W)
print(f'flatten_raw_feats: {flatten_raw_feats.size()}')     # (B, C, H0 * W0 + H1 * W1 + H2 * W2)
print(f'raw_box: {raw_box.size()}')                         # (B, 4 * reg_max, H0 * W0 + H1 * W1 + H2 * W2)
print(f'raw_cls: {raw_cls.size()}')                         # (B, nc, H0 * W0 + H1 * W1 + H2 * W2)
print(f'dbox: {dbox.size()}')                               # (B, 4,  H0 * W0 + H1 * W1 + H2 * W2)
print(f'logit_cls: {logit_cls.size()}')                     # (B, nc, H0 * W0 + H1 * W1 + H2 * W2)
print(f'sigmoid_cls: {sigmoid_cls.size()}')                 # (B, nc, H0 * W0 + H1 * W1 + H2 * W2)

KeyError: 'detect_head.cv2.0'

In [None]:
from ultralytics.utils import ops
print(yolo.predictor.args)

preds = ops.non_max_suppression(
    flatten_raw_feats,
    self.args.conf,
    self.args.iou,
    self.args.classes,
    self.args.agnostic_nms,
    max_det=self.args.max_det,
    nc=0 if self.args.task == "detect" else len(self.model.names),
    end2end=getattr(self.model, "end2end", False),
    rotated=self.args.task == "obb",
    return_idxs=save_feats,
)

if not isinstance(orig_imgs, list):  # input images are a torch.Tensor, not a list
    orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)

if save_feats:
    obj_feats = self.get_obj_feats(self._feats, preds[1])
    preds = preds[0]

results = self.construct_results(preds, img, orig_imgs, **kwargs)

if save_feats:
    for r, f in zip(results, obj_feats):
        r.feats = f  # add object features to results

return results

task=detect
mode=predict
model=../models/yolov8s-world.pt
data=None
epochs=100
time=None
patience=100
batch=1
imgsz=640
save=False
save_period=-1
cache=False
device=cuda:0
workers=8
project=None
name=None
exist_ok=False
pretrained=True
optimizer=auto
verbose=True
seed=0
deterministic=True
single_cls=False
rect=True
cos_lr=False
close_mosaic=10
resume=False
amp=True
fraction=1.0
profile=False
freeze=None
multi_scale=False
overlap_mask=True
mask_ratio=4
dropout=0.0
val=True
split=val
save_json=False
conf=0.25
iou=0.7
max_det=300
half=False
dnn=False
plots=True
source=None
vid_stride=1
stream_buffer=False
visualize=False
augment=False
agnostic_nms=False
classes=None
retina_masks=False
embed=None
show=False
save_frames=False
save_txt=False
save_conf=False
save_crop=False
show_labels=True
show_conf=True
show_boxes=True
line_width=None
format=torchscript
keras=False
optimize=False
int8=False
dynamic=False
simplify=True
opset=None
workspace=None
nms=False
lr0=0.01
lrf=0.01
momentum=0.937
weig

NameError: name 'self' is not defined