In [1]:
class Preprocessor:
    """Resize and normalize batches of images and depth maps to a fixed size.

    The methods use ``np`` (NumPy) and ``cv2`` (OpenCV) as module-level names,
    so both must be imported before any method is called.
    """

    def __init__(self, target_height, target_width):
        """Remember the spatial size every resized item should have.

        Args:
            target_height: Output height passed to ``cv2.resize``.
            target_width: Output width passed to ``cv2.resize``.
        """
        self.target_height = target_height
        self.target_width = target_width

    def process_images(self, images):
        """Resize every image, then divide the pixel values by 255.

        Args:
            images: Iterable of images accepted by ``cv2.resize``.

        Returns:
            Array of the resized images divided by 255.0.
        """
        # Resize
        images = self._resize_images(images)
        # Normalize
        images = self._normalize_images(images)
        return images

    def process_depths(self, depths):
        """Resize every depth map, scale by the batch maximum, add a last axis.

        Args:
            depths: Iterable of depth maps accepted by ``cv2.resize``.

        Returns:
            Array of normalized depth maps with one extra trailing dimension.
        """
        # Resize
        depths = self._resize_images(depths)
        # Normalize
        depths = self._normalize_depths(depths)
        # Expand dimensions
        depths = np.expand_dims(depths, -1)  # Add an extra dimension for the 'channels'
        return depths

    def _resize_images(self, images):
        """Resize each item individually and stack the results into one array."""
        # cv2.resize takes the destination size as (width, height).
        return np.array([cv2.resize(img, (self.target_width, self.target_height)) for img in images])

    def _normalize_images(self, images):
        """Divide by 255.0 (maps 8-bit pixel values onto the [0, 1] interval)."""
        return images / 255.0

    def _normalize_depths(self, depths):
        """Divide the depths by their overall maximum.

        The maximum is taken over the whole array, so for non-negative input
        the result lies in [0, 1]. Degenerate inputs are handled explicitly:

        * an empty array is returned as an empty float array instead of
          raising ``ValueError`` from ``np.max``;
        * an array whose maximum is 0 yields zeros instead of NaN from 0/0.
        """
        depths = np.asarray(depths)
        if depths.size == 0:
            # Nothing to scale; mirrors how `images / 255.0` treats an empty batch.
            return depths.astype(float)

        max_depth = np.max(depths)
        if max_depth == 0:
            # Dividing by zero would fill the map with NaN and poison later means.
            return np.zeros_like(depths, dtype=float)

        return depths / max_depth

In [2]:
import numpy as np
import cv2

import tensorflow as tf
from tensorflow.keras import layers

# --- Configuration -----------------------------------------------------------
INPUT_SIZE = 256             # side length (px) used for both YOLOv3 and the U-Net input
YOLO_SCALE = 0.00392         # ~1/255: scales 8-bit pixel values for the YOLO blob
CONFIDENCE_THRESHOLD = 0.7   # you can adjust this threshold

preprocessor = Preprocessor(INPUT_SIZE, INPUT_SIZE)

# Load YOLOv3 model
yolo = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
layer_names = yolo.getLayerNames()
# getUnconnectedOutLayers() reports 1-based layer ids, hence the `i - 1`.
output_layers = [layer_names[i - 1] for i in yolo.getUnconnectedOutLayers()]

# Load names of classes and get random colors
with open('coco.names') as names_file:  # context manager closes the file handle
    classes = names_file.read().strip().split('\n')
np.random.seed(42)
colors = np.random.randint(0, 255, size=(len(classes), 3), dtype='uint8')

# Load U-Net model
unet = tf.keras.models.load_model('depth_model')

# Load image
image = cv2.imread("dog.jpg")
if image is None:
    # cv2.imread signals an unreadable/missing file by returning None, not by raising.
    raise FileNotFoundError("Could not read image 'dog.jpg'")
image = cv2.resize(image, (INPUT_SIZE, INPUT_SIZE))
height, width, channels = image.shape
image_extend = np.expand_dims(image, axis=0)  # add an extra dimension
# NOTE(review): the U-Net receives the image in OpenCV's BGR channel order;
# confirm this matches how 'depth_model' was trained.
image_processed = preprocessor.process_images(image_extend)

# Detect objects with YOLOv3 (the 5th positional argument, True, is swapRB)
blob = cv2.dnn.blobFromImage(image, YOLO_SCALE, (INPUT_SIZE, INPUT_SIZE), (0, 0, 0), True, crop=False)
yolo.setInput(blob)
outs = yolo.forward(output_layers)

# Estimate depth with U-Net
depth_map = unet.predict(image_processed)
# The map is the same for every detection, so slice it once outside the loops.
# assumes depth_map is (batch, height, width, 1) at the image's resolution - TODO confirm
single_depth_map = depth_map[0, :, :, 0]

# Calculate distances
# No non-maximum suppression is applied, so overlapping detections of the same
# object are each reported on their own line.
for out in outs:
    for detection in out:
        scores = detection[5:]
        class_id = np.argmax(scores)
        confidence = scores[class_id]
        if confidence <= CONFIDENCE_THRESHOLD:
            continue

        # Object detected: box centre and size are multiplied by the image size
        center_x = int(detection[0] * width)
        center_y = int(detection[1] * height)
        w = int(detection[2] * width)
        h = int(detection[3] * height)
        # Rectangle coordinates (top-left corner)
        x = int(center_x - w / 2)
        y = int(center_y - h / 2)

        # Clip the box to the image. A negative start index would otherwise be
        # interpreted by NumPy as "count from the end" and select the wrong
        # (or an empty) region of the depth map.
        x_start, y_start = max(x, 0), max(y, 0)
        x_end, y_end = min(x + w, width), min(y + h, height)

        # Calculate average depth within bounding box
        box_depths = single_depth_map[y_start:y_end, x_start:x_end]
        if box_depths.size == 0:
            # Box lies entirely outside the map; the mean would be NaN.
            continue
        depth = np.mean(box_depths)
        print(f"Object {str(classes[class_id])} is at a depth of {depth}")


Object bicycle is at a depth of 0.25919607281684875
Object bicycle is at a depth of 0.25910496711730957
Object bicycle is at a depth of 0.2819024920463562
Object tv is at a depth of 0.2132578194141388
Object truck is at a depth of 0.317163348197937
Object dog is at a depth of 0.2045358568429947
Object dog is at a depth of 0.20351308584213257
Object dog is at a depth of 0.2080395519733429
Object dog is at a depth of 0.1978648155927658


In [26]:
def depth_to_distance(depth):
    """Map a depth value to a distance with a fixed quadratic polynomial.

    Evaluates ``0.0036 * depth**2 - 0.5373 * depth + 21.714``.

    NOTE(review): the coefficients look like an empirical calibration fit;
    their origin and the unit of the result are not shown here - confirm.

    Args:
        depth: Depth value (scalar or NumPy array).

    Returns:
        The polynomial evaluated at ``depth``.
    """
    return 0.0036 * (depth ** 2) - 0.5373 * depth + 21.714


# Hard-coded sample depth; previously repeated inline twice in the formula.
sample_depth = 0.31212911009788513
dist = depth_to_distance(sample_depth)
dist

21.546643757637337