In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4" 

# 设置 HTTP 和 HTTPS 代理
os.environ["http_proxy"] = "***"
os.environ["https_proxy"] = "***"

# predict

## free prompt
你这段代码是基于 YOLOE 做 PE-free（Prompt Embedding Free）推理 的流程，它背后的目的就是：

利用一个 vocab 构建固定类别的 embedding，使得后续模型推理时无需每次动态计算文本 embedding（也就无需 text prompt），提高推理效率。

In [4]:
import argparse
import os
from PIL import Image
import supervision as sv
from ultralytics import YOLOE
import numpy as np
from ultralytics.models.yolo.yoloe.predict_vp import YOLOEVPSegPredictor
import matplotlib.pyplot as plt
from ultralytics import YOLOE
from ultralytics.models.yolo.yoloe.val_pe_free import YOLOEPEFreeDetectValidator

加载未融合模型并提取 vocab（文本嵌入字典），这一步你加载了一个“未融合模型”，目的是提取 prompt embedding（VPE）。

In [13]:
unfused_model = YOLOE("yoloe-v8l.yaml")
unfused_model.load("pretrain/yoloe-v8l-seg.pt")
unfused_model.eval()
unfused_model.cuda()

这里从 ram_tag_list.txt 文件中读取类名，然后通过 get_vocab() 得到这些类别对应的 embedding。

In [6]:
with open('tools/ram_tag_list.txt', 'r') as f:
    names = [x.strip() for x in f.readlines()]
names = ['person', 'dog']
vocab = unfused_model.get_vocab(names)

Build text model mobileclip:blt
YOLOe-v8l summary (fused): 311 layers, 49,168,166 parameters, 49,166,608 gradients, 164.8 GFLOPs


In [7]:
len(names), vocab

(2,
 ModuleList(
   (0-2): 3 x Conv2d(256, 2, kernel_size=(1, 1), stride=(1, 1))
 ))

In [8]:
# 检查是否包含 "person" 和 "bus"
print("person" in names)  # True or False
print("bus" in names)     # True or False

True
False


加载主模型，并设置 vocab（让模型支持这些类别）

In [9]:
model = YOLOE("pretrain/yoloe-v8l-seg-pf.pt").cuda()
model.set_vocab(vocab, names=names)

推理设置（非必要但重要）

In [10]:
model.model.model[-1].is_fused = True
model.model.model[-1].conf = 0.001
model.model.model[-1].max_det = 1000

In [11]:
# 效果有问题 ？？？？？？？？？？？？？？？？？？
# target_image_path = "/root/dataset/glass_data_20250317/images/train/Image_20250210134520105.bmp"
target_image_path = "./ultralytics/assets/zidane.jpg"
target_image = Image.open(target_image_path).convert("RGB")

output = "/root/project/research/Yolo/yoloe/test_data_out/"
os.makedirs(output, exist_ok=True)

results = model.predict(target_image, save=False, conf=0.25)
# for r in results:
#     print(r.boxes.data)  # print detection bounding boxes


0: 640x640 10 persons, 27 dogs, 31.1ms
Speed: 4.6ms preprocess, 31.1ms inference, 38.2ms postprocess per image at shape (1, 3, 640, 640)


In [12]:
detections = sv.Detections.from_ultralytics(results[0])
# print(f"detections: {detections}")

resolution_wh = target_image.size

thickness = sv.calculate_optimal_line_thickness(resolution_wh=resolution_wh)
text_scale = sv.calculate_optimal_text_scale(resolution_wh=resolution_wh)

labels = [
    f"{class_name} {confidence:.2f}"
    for class_name, confidence in zip(detections["class_name"], detections.confidence)
]

annotated_image = target_image.copy()

annotated_image = sv.MaskAnnotator(
    color_lookup=sv.ColorLookup.INDEX,
    opacity=0.4
).annotate(scene=annotated_image, detections=detections)

annotated_image = sv.BoxAnnotator(
    color_lookup=sv.ColorLookup.INDEX,
    thickness=thickness
).annotate(scene=annotated_image, detections=detections)

annotated_image = sv.LabelAnnotator(
    color_lookup=sv.ColorLookup.INDEX,
    text_scale=text_scale,
    smart_position=True
).annotate(scene=annotated_image, detections=detections, labels=labels)

print(f"annotated_image: {annotated_image.size}")
# fig, ax = plt.subplots(figsize=(8, 8))
# ax.imshow(annotated_image, cmap="gray")

base, ext = os.path.splitext(target_image_path)
path, filename = os.path.split(target_image_path)
# print(f"base: {path}, ext: {filename}")
output_name = os.path.join(output, "re_" + filename)
# print(f"output_name: {output_name}")
annotated_image.save(output_name)

annotated_image: (1280, 720)
