In [1]:
from ultralytics import YOLO
import torch
import cv2
import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor, Normalize



In [43]:
# Load YOLOv8 model
model = YOLO("yolov8n.pt")

# Perform inference on an image
img_path = "./120364.jpg"
results = model(img_path)
result = results[0]

# Get the original image
img = result.orig_img
# Print and visualize the results
# Loop over each result (for each image processed during inference)
for box in result.boxes.xyxy:
    x1, y1, x2, y2 = map(int, box[:4])
    img = cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)

# Show the image with bounding boxes
cv2.imshow("Detection Results", img)
cv2.waitKey(0)
cv2.destroyAllWindows()



image 1/1 c:\Users\USER\Desktop\food-detection\120364.jpg: 384x640 3 bowls, 1 dining table, 20.9ms
Speed: 1.0ms preprocess, 20.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)


In [41]:
# Get bounding boxes
boxes = results[0].boxes.xyxy.cpu().numpy()

# Load MiDaS model
model_type = "DPT_Large"  # MiDaS v3 - Large
midas = torch.hub.load("intel-isl/MiDaS", model_type)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# Load the transforms to resize and normalize the image
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = midas_transforms.dpt_transform if model_type == "DPT_Large" else midas_transforms.small_transform

# Read and transform the input image
img_cv2 = cv2.imread(img_path)
img_rgb = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2RGB)
input_img = transform(img_rgb).unsqueeze(0).to(device)
if input_img.shape[1] == 1:  # Assuming the second dimension is redundant
    input_img = input_img.squeeze(1)  # Remove the second dimension
    print("Squeezed input_img shape:", input_img.shape)
# Perform depth estimation
with torch.no_grad():
    print(input_img)
    prediction = midas(input_img)

    # Resize the prediction to original image size
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=img_rgb.shape[:2],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

# Convert prediction to numpy array
depth_map = prediction.cpu().numpy()

# Iterate over detected bounding boxes
for box in boxes:
    x1, y1, x2, y2 = map(int, box[:4])

    # Extract the region of interest (ROI) from the depth map
    food_region = depth_map[y1:y2, x1:x2]
    print("food_region", food_region)

    # Calculate the average depth of the food region
    avg_depth = np.mean(food_region)

    # Estimate the area of the food region (in pixels)
    area = (x2 - x1) * (y2 - y1)

    # Estimate the volume (simplified approach)
    volume_estimate = area * avg_depth

    # Annotate the image with the volume estimate
    label = f"Volume: {volume_estimate:.2f} cm3"
    cv2.putText(img_cv2, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)


    print(f"Bounding box: ({x1}, {y1}, {x2}, {y2})")
    print(f"Estimated average depth: {avg_depth}")
    print(f"Estimated volume: {volume_estimate}\n")


Using cache found in C:\Users\USER/.cache\torch\hub\intel-isl_MiDaS_master
Using cache found in C:\Users\USER/.cache\torch\hub\intel-isl_MiDaS_master


Squeezed input_img shape: torch.Size([1, 3, 384, 672])
tensor([[[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],

         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],

         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]], device='cuda:0')
food_region [[     1.2671      1.2717      1.2785 ...      5.7169      5.7217      5.7299]
 [     1.2886      1.2932    

In [44]:
cv2.imshow('Detected Objects and Volume', img_cv2)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Export the YOLOv8 model to ONNX format
# path = model.export(format="onnx")