In [None]:
from ultralytics import YOLO
import torch
import cv2

# Definition of "net": pick the compute device and pull the raw detection
# network out of the Ultralytics wrapper.
device = torch.device(device="cuda" if torch.cuda.is_available() else "cpu")
yolo = YOLO(model='../models/yolov8n.pt')
yolo_model = yolo.model
yolo_model = yolo_model.to(device=device) # type: ignore

# The YOLO wrapper object does pre-/post-processing automatically; the raw
# network is called directly here, so the input tensor is built by hand.
img = cv2.imread(filename="../images/puppies.jpg")
if img is None:
    # cv2.imread signals a missing/unreadable file by returning None instead of raising.
    raise FileNotFoundError("Could not read image: ../images/puppies.jpg")
img = cv2.resize(src=img, dsize=(640, 640))  # input size must match the size used for training
# OpenCV loads images as BGR; convert to RGB before feeding the network.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# HWC uint8 -> 1xCxHxW float32 scaled to [0, 1]
img_tensor = torch.from_numpy(img.transpose(2, 0, 1)/255).unsqueeze(dim=0).to(device=device, dtype=torch.float32)

yolo_model.eval()
yolo_output = yolo_model(img_tensor)[0]
# torch.Size([1, 84, 8400])
# 8400: number of candidate boxes = 80*80 + 40*40 + 20*20 = 6400 + 1600 + 400
#   84: (x, y, w, h, class_scores[80]) = 4 + 80 -- there is no separate objectness channel
print(yolo_output.size())

# Split the output into box coordinates and per-class scores.
batch_size, channels, num_boxes = yolo_output.shape
num_classes = channels - 4             # everything after the 4 box coordinates is a class score
num_anchors = num_boxes                # one prediction per candidate location (name kept for later cells)
boxes = yolo_output[:, :4, :]          # [1, 4, 8400] -> box coordinates
cls_probs = yolo_output[:, 4:, :]      # [1, num_classes, 8400] -> class scores
# With no objectness channel, a box's confidence is its best class score.
conf = cls_probs.max(dim=1, keepdim=True).values  # [1, 1, 8400]

torch.Size([1, 84, 8400])


In [None]:
def get_target_box_output(yolo_output, conf_threshold=0.5):
    """Return the prediction vector of the single most confident box.

    :param yolo_output: tensor laid out as ``[batch, 4 + num_classes, num_boxes]``
        (the ``[1, 84, 8400]`` tensor produced in the first cell), or the same
        without the batch dimension. Only the first image of a batch is used.
    :param conf_threshold: the best class score of the selected box must be
        strictly greater than this value.
    :return: 1-D tensor of length ``4 + num_classes`` (box coordinates followed
        by class scores) for the highest-scoring box, still attached to the
        autograd graph of ``yolo_output``; ``None`` if no box passes the
        threshold or the input has no boxes / no class channels.

    NOTE(review): the class scores are compared against the threshold as-is,
    i.e. they are assumed to already be probabilities in the eval-mode head
    output -- confirm against the Ultralytics version in use.
    """
    # Work on a single image: [4 + num_classes, num_boxes].
    preds = yolo_output[0] if yolo_output.dim() == 3 else yolo_output
    if preds.dim() != 2 or preds.shape[0] <= 4 or preds.shape[1] == 0:
        return None

    # Rows 0-3 are box coordinates; the remaining rows are per-class scores.
    class_scores = preds[4:, :]
    best_scores, _ = class_scores.max(dim=0)  # best class score per box

    # Pick the global maximum over all boxes rather than the first hit.
    top_idx = best_scores.argmax()
    if best_scores[top_idx] <= conf_threshold:
        return None  # no valid target box

    # Indexing (not copying to numpy) keeps the result usable for gradient computation.
    return preds[:, top_idx]

# Run the extraction on the raw model output computed in the first cell; as the
# last expression of the cell, the returned value is shown as the cell output.
get_target_box_output(yolo_output=yolo_output)

In [None]:
import numpy as np
from torch.autograd import Variable

import copy
from torch.autograd.gradcheck import zero_gradients


def deepfool(image, net, num_classes=10, overshoot=0.02, max_iter=50):
    """
    Compute a DeepFool adversarial perturbation for a single image.

    :param image: 3-D image tensor without batch dimension (a batch axis is added
        internally; in this notebook it is the CxHxW output of ``ToTensor``).
    :param net: network (input: images, output: values of activation **BEFORE** softmax).
    :param num_classes: number of top-scoring classes to test against (default = 10);
        clamped to the number of outputs the network actually produces.
    :param overshoot: used as a termination criterion to prevent vanishing updates (default = 0.02).
    :param max_iter: maximum number of iterations for deepfool (default = 50)
    :return: tuple ``(r_tot, loop_i, label, k_i, pert_image)``: the accumulated
        perturbation (numpy array, scaled by ``1 + overshoot``), the number of
        iterations performed, the originally predicted label, the label predicted
        for the perturbed image, and the perturbed image tensor.
    """
    is_cuda = torch.cuda.is_available()

    if is_cuda:
        print("Using GPU")
        image = image.cuda()
        net = net.cuda()
    else:
        print("Using CPU")

    # Initial forward pass: only the ranking of the outputs is needed, so no graph is built.
    with torch.no_grad():
        f_image = net(image[None, :, :, :]).cpu().numpy().flatten()
    I = f_image.argsort()[::-1]

    # Never test against more classes than the network outputs; otherwise
    # indexing I[k] below would run past the end.
    num_classes = min(num_classes, I.size)
    I = I[0:num_classes]
    label = I[0]

    input_shape = image.cpu().numpy().shape
    pert_image = copy.deepcopy(image)
    w = np.zeros(input_shape)
    r_tot = np.zeros(input_shape)

    loop_i = 0

    # Leaf tensor with a batch axis so that input gradients land in x.grad.
    x = pert_image[None, :].clone().detach().requires_grad_(True)
    fs = net(x)
    k_i = label

    while k_i == label and loop_i < max_iter:

        pert = np.inf
        fs[0, I[0]].backward(retain_graph=True)
        grad_orig = x.grad.detach().cpu().numpy().copy()

        for k in range(1, num_classes):
            # Reset the accumulated input gradient in place; this replaces the
            # `zero_gradients` helper, which newer PyTorch releases no longer ship.
            x.grad.zero_()

            fs[0, I[k]].backward(retain_graph=True)
            cur_grad = x.grad.detach().cpu().numpy().copy()

            # set new w_k and new f_k
            w_k = cur_grad - grad_orig
            f_k = (fs[0, I[k]] - fs[0, I[0]]).item()

            w_k_norm = np.linalg.norm(w_k.flatten())
            if w_k_norm == 0:
                # Identical gradients give no direction towards class k; skip
                # instead of dividing by zero.
                continue
            pert_k = abs(f_k) / w_k_norm

            # determine which w_k to use (closest decision boundary)
            if pert_k < pert:
                pert = pert_k
                w = w_k

        if not np.isfinite(pert):
            # No usable direction was found (e.g. only one class to test against).
            break

        # compute r_i and r_tot
        # Added 1e-4 for numerical stability
        r_i = (pert + 1e-4) * w / np.linalg.norm(w)
        r_tot = np.float32(r_tot + r_i)

        # r_tot carries the batch axis of the gradients, so pert_image is 4-D from here on.
        pert_image = image + (1 + overshoot) * torch.from_numpy(r_tot).to(image.device)

        x = pert_image.clone().detach().requires_grad_(True)
        fs = net(x)
        k_i = np.argmax(fs.detach().cpu().numpy().flatten())

        loop_i += 1

    r_tot = (1 + overshoot) * r_tot

    return r_tot, loop_i, label, k_i, pert_image


In [None]:
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
import torchvision.models as models
from PIL import Image
import os

# Load the ResNet-34 classifier with pretrained weights; this is the network to attack.
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True` -- confirm against the installed version.
net = models.resnet34(pretrained=True)

# Switch to evaluation mode
net.eval()

# Force three channels so the 3-channel Normalize below also works for
# grayscale or RGBA files.
im_orig = Image.open('test_im2.jpg').convert('RGB')

mean = [ 0.485, 0.456, 0.406 ]
std = [ 0.229, 0.224, 0.225 ]


# Resize, crop, convert to a tensor and normalize with the mean/std above.
# `Resize` is the current name of the transform formerly called `Scale`.
im = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean = mean,
                         std = std)])(im_orig)

r, loop_i, label_orig, label_pert, pert_image = deepfool(im, net)

# Read the label list, one entry per line, closing the file afterwards.
with open(os.path.join('synset_words.txt'), 'r') as labels_file:
    labels = labels_file.read().split('\n')

# The labels come back as numpy integers; the built-in int() replaces the
# removed `np.int` alias.
str_label_orig = labels[int(label_orig)].split(',')[0]
str_label_pert = labels[int(label_pert)].split(',')[0]

print("Original label = ", str_label_orig)
print("Perturbed label = ", str_label_pert)

def clip_tensor(A, minv, maxv):
    """Clamp every element of tensor ``A`` into the range ``[minv, maxv]``.

    :param A: input tensor (any shape, any device).
    :param minv: lower bound applied element-wise.
    :param maxv: upper bound applied element-wise.
    :return: a new tensor of the same shape as ``A`` with values limited to the range.
    """
    # torch.clamp works on the tensor's own device and avoids allocating
    # two full-size helper tensors of ones.
    return torch.clamp(A, min=minv, max=maxv)

# After the normalization is undone the image is a float tensor; ToPILImage
# scales float tensors by 255, so the valid range to clip to is [0, 1].
clip = lambda x: clip_tensor(x, 0, 1)

# Inverse of the preprocessing: first undo the division by std, then undo the
# mean subtraction, clip, and convert back to a PIL image. The mean/std
# arguments are real lists because `map` objects are one-shot iterators in
# Python 3, not sequences.
tf = transforms.Compose([transforms.Normalize(mean=[0, 0, 0], std=[1 / s for s in std]),
                        transforms.Normalize(mean=[-m for m in mean], std=[1, 1, 1]),
                        transforms.Lambda(clip),
                        transforms.ToPILImage(),
                        transforms.CenterCrop(224)])

# Show the adversarial image with its (wrong) predicted label as the title.
plt.figure()
plt.imshow(tf(pert_image.cpu()[0]))
plt.title(str_label_pert)
plt.show()