In [3]:
from ultralytics import YOLO
import torch
import cv2
import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor, Normalize



In [7]:
# Load the YOLOv8 model from the "yolov8n.pt" weights file
model = YOLO("yolov8n.pt")

# Perform inference on an image. The call returns a sequence of results,
# which is indexed with [0] below because a single image is passed in.
img_path = "./120364.jpg"
results = model(img_path)

# Print and (optionally) visualize the results
print(results)
# results[0].show()

# Get bounding boxes as rows of (x1, y1, x2, y2).
# .cpu() is required before .numpy(): when inference ran on a GPU the box
# tensor lives in CUDA memory and .numpy() alone raises a TypeError.
# On a CPU tensor .cpu() is a no-op, so this is safe in both cases.
boxes = results[0].boxes.xyxy.cpu().numpy()

# Load MiDaS model
model_type = "DPT_Large"  # MiDaS v3 - Large
midas = torch.hub.load("intel-isl/MiDaS", model_type)

# Move model to GPU if available and switch to inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# Load the transforms to resize and normalize the image.
# Both DPT variants share dpt_transform; any other model type falls back to
# small_transform. Checking for "DPT_Large" only would silently pick the
# wrong preprocessing if model_type were switched to "DPT_Hybrid".
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
if model_type in ("DPT_Large", "DPT_Hybrid"):
    transform = midas_transforms.dpt_transform
else:
    transform = midas_transforms.small_transform

# Read the input image; cv2.imread returns None (it does not raise) when the
# file is missing or unreadable, so fail early with a clear error.
img_cv2 = cv2.imread(img_path)
if img_cv2 is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

# OpenCV loads images as BGR; convert to RGB before applying the transform
img_rgb = cv2.cvtColor(img_cv2, cv2.COLOR_BGR2RGB)

# The MiDaS hub transforms already add the batch dimension, so an extra
# unsqueeze(0) would produce a 5-D tensor that the model cannot consume.
# Only add the batch axis if the transform returned an unbatched 3-D tensor.
input_img = transform(img_rgb)
if input_img.dim() == 3:
    input_img = input_img.unsqueeze(0)
input_img = input_img.to(device)

# Perform depth estimation without tracking gradients
with torch.no_grad():
    prediction = midas(input_img)

    # Resize the prediction to the original image size (height, width).
    # unsqueeze(1) inserts the channel axis that interpolate expects, and
    # squeeze() drops the singleton axes again afterwards.
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=img_rgb.shape[:2],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

# Convert prediction to a numpy array (moved to host memory first)
depth_map = prediction.cpu().numpy()

# Iterate over detected bounding boxes and derive a rough per-box volume proxy.
# NOTE(review): MiDaS predictions are presumably relative (inverse) depth, not
# metric distances, so the values below are unitless scores — confirm before
# interpreting them as physical volumes.
depth_h, depth_w = depth_map.shape[:2]

for box in boxes:
    x1, y1, x2, y2 = map(int, box[:4])

    # Clamp the box to the depth map: negative coordinates would otherwise
    # wrap around in numpy slicing and select the wrong pixels.
    x1, x2 = max(0, min(x1, depth_w)), max(0, min(x2, depth_w))
    y1, y2 = max(0, min(y1, depth_h)), max(0, min(y2, depth_h))

    # A degenerate box has no pixels; np.mean on an empty array would emit a
    # RuntimeWarning and yield NaN, so skip it explicitly.
    if x2 <= x1 or y2 <= y1:
        print(f"Bounding box: ({x1}, {y1}, {x2}, {y2}) is empty after clipping; skipped\n")
        continue

    # Extract the region of interest (ROI) from the depth map
    food_region = depth_map[y1:y2, x1:x2]

    # Calculate the average depth of the food region
    avg_depth = np.mean(food_region)

    # Estimate the area of the food region (in pixels)
    area = (x2 - x1) * (y2 - y1)

    # Estimate the volume (simplified approach: pixel area times mean depth)
    volume_estimate = area * avg_depth

    print(f"Bounding box: ({x1}, {y1}, {x2}, {y2})")
    print(f"Estimated average depth: {avg_depth}")
    print(f"Estimated volume: {volume_estimate}\n")



image 1/1 c:\Users\andys\Desktop\\\food-detection\120364.jpg: 384x640 3 bowls, 1 dining table, 106.7ms
Speed: 2.0ms preprocess, 106.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)
[ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'sp

Using cache found in C:\Users\andys/.cache\torch\hub\intel-isl_MiDaS_master
  from .autonotebook import tqdm as notebook_tqdm
Downloading: "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt" to C:\Users\andys/.cache\torch\hub\checkpoints\dpt_large_384.pt
 11%|█         | 140M/1.28G [00:30<04:17, 4.78MB/s] 


KeyboardInterrupt: 