In [1]:
import numpy as np
from ultralytics import YOLOE
from ultralytics.models.yolo.yoloe import YOLOEVPSegPredictor
import supervision as sv
import cv2

## Тест детекции с текстовым промптом

In [66]:
# Text-prompt detection: describe the target object in words, run YOLOE on a
# single image, then save a copy annotated with boxes, masks and confidences.

# --- Inputs ------------------------------------------------------------------
model = YOLOE("yoloe-11l-seg.pt")
image = 'sample_substraction/19176.jpg'
image_size = (640, 640)

# Free-form description used as the one and only class prompt.
# Assembled line by line so the exact text stays explicit: it starts and ends
# with a newline, and the first line keeps a trailing space after the colon.
license_plate_prompt = "\n".join([
    "",
    "Russian car license plate: ",
    "- white rectangular-shape metal plate",
    "- black cyrillic characters",
    "- format: 1 letter, 3 digits, 2 letters, regional code",
    "- example: A 123 BC 77 RUS",
    "- mounted with visible bolts/screws",
    "",
])
classes = [license_plate_prompt]

# --- Inference ---------------------------------------------------------------
# The class prompts are registered together with their text embeddings.
text_embeddings = model.get_text_pe(classes)
model.set_classes(classes, text_embeddings)

results = model.predict(image, imgsz=image_size)
detections = sv.Detections.from_ultralytics(results[0])

# --- Annotation --------------------------------------------------------------
annotator = sv.BoxAnnotator()
# Mask overlay: compared with plain box detection, the only extra step in this
# pipeline is drawing the object's segmentation mask.
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()

# One label per detection: its confidence with two decimals.
labels = [format(score, '.2f') for score in detections.confidence]

# Draw in three passes (boxes, then masks, then labels), each pass taking the
# previous pass's result as its scene.
image_for_labeling = cv2.imread(image)
image_with_bounding_boxes = annotator.annotate(
    scene=image_for_labeling,
    detections=detections,
)
segmented_image_with_bounding_boxes = mask_annotator.annotate(
    scene=image_with_bounding_boxes,
    detections=detections,
)
labeled_image = label_annotator.annotate(
    scene=segmented_image_with_bounding_boxes,
    detections=detections,
    labels=labels,
)

# Last expression of the cell: the write status is shown as the cell output.
cv2.imwrite('labeled_by_text_prompt.jpg', labeled_image)

Ultralytics 8.3.139  Python-3.11.9 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 4060 Laptop GPU, 8188MiB)
YOLOe-11l-seg summary (fused): 227 layers, 35,117,862 parameters, 2,254,374 gradients, 144.1 GFLOPs

image 1/1 d:\my_cv_projecs\yolo\sample_substraction\19176.jpg: 480x640 1 
Russian car license plate: 
- white rectangular-shape metal plate
- black cyrillic characters
- format: 1 letter, 3 digits, 2 letters, regional code
- example: A 123 BC 77 RUS
- mounted with visible bolts/screws
, 34.6ms
Speed: 1.8ms preprocess, 34.6ms inference, 4.4ms postprocess per image at shape (1, 3, 480, 640)


True

## Тест детекции с визуальным промптом

In [67]:
# Visual-prompt detection: mark the target object with a box on a reference
# image, run YOLOE on the target image, then save a copy annotated with boxes,
# masks and confidences.

# --- Inputs ------------------------------------------------------------------
model = YOLOE("yoloe-11l-seg.pt")
image_size = (640, 640)
target_image = 'sample_substraction/19176.jpg'
refer_image = 'sample_substraction/10515.jpg'

# Box of the example object, in XYXY order (x1, y1, x2, y2).
# NOTE(review): presumably these are pixel coordinates on `refer_image` — confirm.
xyxy_boxes_coords = np.array([[358.7, 513.11, 594.8, 580.85]])

# Visual prompt for the model: the coordinates of the target object plus the
# class id assigned to it. `bboxes` must be supplied in XYXY format.
# TODO: study in detail how to build universal prompts that work for the
# different ways the object can be positioned.
visual_prompts = {
    'bboxes': xyxy_boxes_coords,
    'cls': np.array([0]),
}

# --- Inference ---------------------------------------------------------------
# Run the prediction with the visual prompt.
results = model.predict(
    source=target_image,
    refer_image=refer_image,
    visual_prompts=visual_prompts,
    imgsz=image_size,
    predictor=YOLOEVPSegPredictor,
)
detections = sv.Detections.from_ultralytics(results[0])

# --- Annotation --------------------------------------------------------------
annotator = sv.BoxAnnotator()
# Mask overlay: compared with plain box detection, the only extra step in this
# pipeline is drawing the object's segmentation mask.
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()

# One label per detection: its confidence with two decimals.
labels = [format(score, '.2f') for score in detections.confidence]

# Draw in three passes (boxes, then masks, then labels), each pass taking the
# previous pass's result as its scene.
image_for_labeling = cv2.imread(target_image)
image_with_bounding_boxes = annotator.annotate(
    scene=image_for_labeling,
    detections=detections,
)
segmented_image_with_bounding_boxes = mask_annotator.annotate(
    scene=image_with_bounding_boxes,
    detections=detections,
)
labeled_image = label_annotator.annotate(
    scene=segmented_image_with_bounding_boxes,
    detections=detections,
    labels=labels,
)

# Last expression of the cell: the write status is shown as the cell output.
cv2.imwrite('labeled_by_visual_prompt.jpg', labeled_image)

Ultralytics 8.3.139  Python-3.11.9 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 4060 Laptop GPU, 8188MiB)
YOLOe-11l-seg summary (fused): 227 layers, 35,117,862 parameters, 2,254,374 gradients

image 1/1 d:\my_cv_projecs\yolo\sample_substraction\19176.jpg: 480x640 1 object0, 26.1ms
Speed: 1.4ms preprocess, 26.1ms inference, 2.4ms postprocess per image at shape (1, 3, 480, 640)


True