In [None]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

# Locations of the exported models and of the training images, relative to the
# notebook's working directory.
MODELS_DIR = Path("../infra/models")
IMAGES_DATA_TRAIN = Path("../infra/images/coco-2017/train/data")
assert IMAGES_DATA_TRAIN.exists(), f"image directory not found: {IMAGES_DATA_TRAIN}"

# Sanity check: every dotted entry in the directory must carry the .jpg suffix,
# so the "*.jpg" listing below does not silently leave files out.
extensions = {filename.suffix for filename in IMAGES_DATA_TRAIN.glob("*.*")}
assert extensions == {".jpg"}, f"unexpected file extensions: {sorted(extensions)}"

# glob() yields entries in arbitrary, filesystem-dependent order; sorting keeps
# the list (and therefore any seeded random pick from it) stable between runs.
IMAGES = sorted(IMAGES_DATA_TRAIN.glob("*.jpg"))

In [None]:
# Run DINOv2 once on a randomly chosen training image.
# Use the GPU when one is present, otherwise fall back to the CPU instead of failing.
device = "cuda" if torch.cuda.is_available() else "cpu"

image = Image.open(random.choice(IMAGES))

processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)

inputs = processor(images=image, return_tensors="pt")
# Inference only: no_grad avoids recording an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs.to(device))
last_hidden_states = outputs.last_hidden_state

In [None]:
import torch

dummy_input = torch.randn(1, 3, 224, 224)  # Adjust dimensions as needed
model_path = MODELS_DIR / "dinov2/1/model.onnx"

# Make sure the target directory exists before the exporter tries to write into it.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Trace on whatever device the model's weights currently live on.
model_device = next(model.parameters()).device

# Export the model to ONNX
torch.onnx.export(
    model,
    dummy_input.to(model_device),
    str(model_path),
    input_names=['input'],
    output_names=['last_hidden_state', 'pooler_output'],  # Names given to the two exported outputs
)

In [None]:
import onnxruntime as ort
import torch
import torchvision

def preprocess_image(image_path, size=(224, 224), mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Load an image file and turn it into a normalised channel-first array.

    Args:
        image_path: Location of the image file; passed straight to ``Image.open``.
        size: ``(width, height)`` the image is resized to before conversion.
        mean: Per-channel mean subtracted by ``Normalize``.
        std: Per-channel standard deviation ``Normalize`` divides by.

    Returns:
        NumPy array of shape ``(3, height, width)`` (no batch axis).
    """
    # The context manager closes the file as soon as the resized RGB copy
    # exists, instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as source:
        image = source.convert("RGB").resize(size)

    # ToTensor produces a channel-first tensor; Normalize then applies
    # (value - mean) / std per channel.
    image_tensor = torchvision.transforms.ToTensor()(image)
    image_tensor = torchvision.transforms.Normalize(mean, std)(image_tensor)

    return image_tensor.numpy()

# Hand the path over as a plain string (as the export call does); older
# onnxruntime releases only accept str or bytes here.
session = ort.InferenceSession(str(model_path))

# Read the input name from the loaded graph rather than repeating the literal
# used at export time. [None, ...] adds the batch axis.
input_name = session.get_inputs()[0].name
result = session.run(None, {input_name: preprocess_image(random.choice(IMAGES))[None, ...]})
result[0].shape

In [None]:
# Undo the mean/std normalisation so the tensor that was fed to the model can be shown.
# NOTE(review): assumes the processor normalised with exactly these values — confirm
# against the processor's own image_mean / image_std.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
img = inputs['pixel_values'].to("cpu").squeeze().numpy().transpose(1, 2, 0)  # CHW -> HWC
# Floating-point round-off can leave values marginally outside [0, 1]; clip them so
# imshow receives a valid float image instead of warning about clipping.
img = (img * std + mean).clip(0.0, 1.0)
plt.imshow(img);

In [None]:
from ultralytics import YOLO
import shutil

# Export the YOLOv8x checkpoint to ONNX and store it under the name the rest of
# the notebook loads.
yolo_dir = MODELS_DIR / "yolo/1"
model_path = yolo_dir / "model.onnx"

model = YOLO(yolo_dir / "yolov8x.pt")
model.export(imgsz=640, opset=12, dynamic=True, format="onnx")

# The export is expected as "yolov8x.onnx" in the checkpoint's directory;
# move it to its final file name.
shutil.move(yolo_dir / "yolov8x.onnx", model_path)

In [None]:
import onnxruntime as ort
import torch
from torchvision import transforms
import numpy as np

# Plain-string path: older onnxruntime releases only accept str or bytes here.
session = ort.InferenceSession(str(model_path))
# Close the file right after decoding rather than leaving the handle to the garbage collector.
with Image.open(random.choice(IMAGES)) as source_image:
    image = source_image.convert('RGB')

def cross_iou_matrix(x1: np.ndarray, y1: np.ndarray, x2: np.ndarray, y2: np.ndarray) -> np.ndarray:
    """Pairwise intersection-over-union of a set of axis-aligned boxes.

    Args:
        x1, y1: 1-D arrays with the minimum-corner coordinates of each box.
        x2, y2: 1-D arrays with the maximum-corner coordinates of each box.

    Returns:
        Square float array where entry ``[i, j]`` is the IoU of box ``i`` and
        box ``j``. Pairs whose union has no area (for example a zero-area box
        compared with itself) yield ``0.0`` rather than NaN.
    """
    # Broadcasting a column against a row produces every pairwise combination.
    inter_w = np.maximum(0.0, np.minimum(x2[..., None], x2[None, ...]) - np.maximum(x1[..., None], x1[None, ...]))
    inter_h = np.maximum(0.0, np.minimum(y2[..., None], y2[None, ...]) - np.maximum(y1[..., None], y1[None, ...]))
    intersection = inter_w * inter_h

    area = (x2 - x1) * (y2 - y1)
    union = area[..., None] + area[None, ...] - intersection

    # Divide only where the union is positive; everywhere else keep the 0.0
    # the output was initialised with, avoiding 0/0 warnings and NaN entries.
    iou = np.zeros_like(union)
    np.divide(intersection, union, out=iou, where=union > 0)
    return iou

def nms(x1: np.ndarray,
        y1: np.ndarray,
        x2: np.ndarray,
        y2: np.ndarray,
        scores: np.ndarray,
        class_ids: np.ndarray,
        nms_threshold: float) -> list[int]:
    """Greedy, per-class non-maximum suppression.

    Boxes are visited from the highest score to the lowest. Each box that has
    not been suppressed yet is kept, and every box of the same class whose IoU
    with it is at least ``nms_threshold`` is suppressed.

    Args:
        x1, y1: 1-D arrays with the minimum-corner coordinates of each box.
        x2, y2: 1-D arrays with the maximum-corner coordinates of each box.
        scores: 1-D array with one confidence per box.
        class_ids: 1-D array with one class label per box.
        nms_threshold: IoU at or above which a same-class box is suppressed.

    Returns:
        Indices of the kept boxes, ordered by descending score. Every index
        appears at most once.
    """
    scores = np.asarray(scores)
    class_ids = np.asarray(class_ids)
    cross_iou = cross_iou_matrix(x1, y1, x2, y2)

    suppressed = np.zeros(scores.shape[0], dtype=bool)
    kept = []
    # A stable sort keeps the lower index first among equal scores.
    for idx in np.argsort(-scores, kind="stable"):
        if suppressed[idx]:
            continue
        # The box is kept before the overlap mask is applied, so it survives
        # even when its own self-IoU entry is not >= the threshold.
        kept.append(int(idx))
        suppressed |= (class_ids == class_ids[idx]) & (cross_iou[idx] >= nms_threshold)

    return kept

class ImageProcessor:
    """Pads an image to the model's aspect ratio and maps detections back.

    ``preprocess`` centres the image on a canvas filled with ``_MEAN``, resizes
    the canvas to ``(model_h, model_w)`` and normalises it with ``_MEAN`` /
    ``_STD``. ``postprocess`` converts centre-format boxes from model pixels
    back to pixels of the original image and runs ``nms`` on them.

    NOTE(review): whether the detector really expects mean/std-normalised
    input depends on how it was trained/exported — confirm for the model in use.
    """

    # Per-channel statistics used both as padding colour and for Normalize.
    _MEAN = np.array([0.485, 0.456, 0.406])
    _STD = np.array([0.229, 0.224, 0.225])

    def __init__(self, model_w, model_h, orig_w, orig_h, nms_iou = 0.75):
        """Store the geometry and build the resize/normalise transform.

        Args:
            model_w, model_h: Spatial size of the model input.
            orig_w, orig_h: Size of the image that will be processed.
            nms_iou: IoU threshold handed to ``nms`` in ``postprocess``.
        """
        self.model_w = model_w
        self.model_h = model_h
        self.orig_w = orig_w
        self.orig_h = orig_h
        self.transform = transforms.Compose([transforms.Resize((self.model_h, self.model_w)),
                                             transforms.Normalize(mean=self._MEAN, std=self._STD)])
        # Honour the caller's threshold (the argument used to be ignored in
        # favour of a fixed 0.6).
        self.nms_iou = nms_iou
        # All inputs are known here, so derive the padding geometry up front;
        # postprocess() then works even if preprocess() was never called.
        self.estimate_dims()

    def estimate_dims(self) -> None:
        """Compute the padded canvas size and the padding offsets.

        Sets ``new_w``/``new_h`` (smallest canvas with the model's aspect ratio
        that contains the original image) and ``dw``/``dh`` (left/top padding,
        in original-image pixels).
        """
        self.model_aspect_ratio = self.model_w / self.model_h
        self.orig_aspect_ratio = self.orig_w / self.orig_h
        if self.model_aspect_ratio > self.orig_aspect_ratio:
            # Image is narrower than the model input: widen the canvas.
            self.new_w = int(round(self.orig_h * self.model_aspect_ratio))
            self.new_h = self.orig_h
        else:
            # Image is wider (or equal): heighten the canvas.
            self.new_w = self.orig_w
            self.new_h = int(round(self.orig_w / self.model_aspect_ratio))

        self.dw = (self.new_w - self.orig_w) // 2
        self.dh = (self.new_h - self.orig_h) // 2

    def preprocess(self, image: "Image.Image") -> torch.Tensor:
        """Pad, resize and normalise ``image``.

        Args:
            image: Image whose size equals ``(orig_w, orig_h)``.

        Returns:
            Tensor of shape ``(3, model_h, model_w)`` without a batch axis.
        """
        self.estimate_dims()
        # Canvas pre-filled with the channel means, so the border becomes 0 after Normalize.
        output = torch.from_numpy(self._MEAN[..., None, None] * np.ones((3, self.new_h, self.new_w)))
        output[:, self.dh:self.dh + self.orig_h, self.dw:self.dw + self.orig_w] = transforms.ToTensor()(image)
        output = self.transform(output)
        return output

    def _to_original_coords(self, x, y, w, h):
        """Map centre-format boxes from model pixels to original-image pixels."""
        # The padded canvas (new_w x new_h) was resized to (model_w x model_h):
        # first undo that resize, then remove the padding offset. dw/dh are in
        # canvas pixels, so they must be subtracted after the rescale.
        scale_x = self.new_w / self.model_w
        scale_y = self.new_h / self.model_h
        return x * scale_x - self.dw, y * scale_y - self.dh, w * scale_x, h * scale_y

    def postprocess(self, predictions: np.ndarray) -> np.ndarray:
        """Convert raw predictions to original-image boxes and apply NMS.

        Args:
            predictions: Array whose first four rows are read as ``x, y, w, h``
                (box centre and size in model pixels) and whose remaining rows
                are per-class scores; one column per candidate.

        Returns:
            Array with one row per kept candidate: ``x, y, w, h`` in
            original-image pixels followed by the per-class scores, in the
            order returned by ``nms``.
        """
        x, y, w, h = self._to_original_coords(*predictions[:4, ...])
        scores = predictions[4:, ...].T

        nms_filtered_ids = nms(x - w/2, y - h/2, x + w/2, y + h/2, scores.max(-1), scores.argmax(-1), self.nms_iou)
        # Explicit integer dtype keeps indexing valid when nothing was kept.
        nms_filtered_ids = np.asarray(nms_filtered_ids, dtype=np.intp)

        # np.concatenate is available in every NumPy release (np.concat only since 2.0).
        return np.concatenate((x[nms_filtered_ids, None],
                               y[nms_filtered_ids, None],
                               w[nms_filtered_ids, None],
                               h[nms_filtered_ids, None],
                               scores[nms_filtered_ids, ...]), axis=-1)

# Run the detector on the selected image.
image_w, image_h = image.size
processor = ImageProcessor(640, 640, image_w, image_h)
# Named model_input so the builtin input() is not shadowed; unsqueeze adds the batch axis.
model_input = processor.preprocess(image).unsqueeze(0)
result = session.run(None, {"images": model_input.numpy().astype(np.float32)})
output = processor.postprocess(result[0].squeeze())

# Split the rows into box geometry and per-class scores
boxes = output[:, :4]  # centre x, centre y, width, height
scores = output[:, 4:]  # one score per class

# Filter out low confidence predictions
threshold = 0.1
filtered_mask = scores.max(axis=1) > threshold
filtered_boxes = boxes[filtered_mask, ...]
filtered_scores = scores[filtered_mask]

# Convert the image to a numpy array for plotting
image_np = np.array(image)

# Plot the image
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(image_np)

# Draw bounding boxes
for box, box_scores in zip(filtered_boxes, filtered_scores):
    # Separate names so the image width/height above are not overwritten.
    xc, yc, box_w, box_h = box
    left, top = xc - box_w / 2, yc - box_h / 2

    class_id = int(np.argmax(box_scores))
    score = box_scores[class_id]
    color = np.random.rand(3)  # Random color for each box

    # Draw rectangle
    ax.add_patch(plt.Rectangle((left, top), box_w, box_h,
                               fill=False, edgecolor=color, linewidth=2))

    # Label with "<class index>:<score>"; the class index is an integer, so it
    # is printed without decimals.
    ax.text(left, top, f"{class_id}:{score:.2f}", color=color, fontsize=12, backgroundcolor="white")

# Show the plot
ax.axis('off')
plt.show()