# Install necessary packages

If you run this notebook on Google Colab, you'll have to install timm.

# Import libraries

In [81]:
import cv2
import torch
import time
import os
import numpy as np
from IPython.display import Video

# Load MiDaS model.

We'll load MiDaS thanks to torch hub. Feel free to use different versions of the model !

In [89]:
# Let's load a DPT Hybrid model for depth estimation task.
model_type = "DPT_Hybrid"  # DPT Hybrid model

midas = torch.hub.load("intel-isl/MiDaS", model_type)

# Move the model to GPU if it is available.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
midas.to(device)
print('DPT Hybrid model successfully loaded.')

Using cache found in C:\Users\bhave/.cache\torch\hub\intel-isl_MiDaS_master
  model = create_fn(


DPT Hybrid model successfully loaded.


## Load data and set image preprocessor class:

We need to load, resize and normalize our images so that MiDaS can process them correctly. Fortunately, MiDaS has a ``transforms`` that does the preprocessing for us. You can see the video whoose depth we're going to estimate.

In [90]:
# Use transforms class
transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = transforms.dpt_transform  # If you used a small midas model,
                                      # use transforms.small_transform instead.

# Open the camera (usually the default camera is at index 0)
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not open camera.")
    exit()

Using cache found in C:\Users\bhave/.cache\torch\hub\intel-isl_MiDaS_master


Now we are going to extract all of the images from the video and place them in a folder.

In [91]:

# # Output directory for JPEG images
# output_directory = f"./data/video_into_imagesbw"

# # Ensure that the output directory exists
# os.makedirs(output_directory, exist_ok=True)

# # Open the video file
# cap = cv2.VideoCapture("data\interior_designbw.mp4")

# # Get the resolution of the video
# frame_width = int(cap.get(3))  # Width
# frame_height = int(cap.get(4))  # Height

# # Check if the video was opened successfully
# if not cap.isOpened():
#     print("Unable to open the video. Check the file path.")
# else:
#     frame_count = 0

#     while True:
#         ret, frame = cap.read()

#         # Check if reading the frame was successful
#         if not ret:
#             break

#           # Save the frame as a JPEG image
#         frame_filename = os.path.join(output_directory, f'frame_{frame_count:04d}.jpg')
#         cv2.imwrite(frame_filename, frame)
#         frame_count += 1

#     # Release the video file
#     cap.release()

#     print(f'{frame_count} images have been extracted and saved in {output_directory}')

In [92]:
midas.eval()

total_time = []
depth_video = []

# Define the sharpening kernel
sharpening_kernel = np.array([[0, -1, 0],
                              [-1, 5,-1],
                              [0, -1, 0]])

while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Could not read frame.")
        break

    # Convert the frame to grayscale
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Apply the sharpening filter
    gray_frame = cv2.filter2D(gray_frame, -1, sharpening_kernel)

    # Convert grayscale image to 3-channel image
    gray_frame_3ch = cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2BGR)

    # Apply transforms:
    input_img = transform(gray_frame_3ch).to(device)

    # Prediction and preprocess output:
    with torch.no_grad():
        start = time.time()
        pred = midas(input_img)

        pred = torch.nn.functional.interpolate(pred.unsqueeze(1),
                                               size=gray_frame.shape[:2],
                                               mode="bicubic",
                                               align_corners=False,
                                               ).squeeze()
        depth_img = pred.cpu().numpy()

        # Apply Gaussian blur to smooth the depth map
        depth_img = cv2.GaussianBlur(depth_img, (3, 3), 0)

        # Resizing and coloring the image
        depth_img = cv2.normalize(depth_img, None, 0, 1, norm_type=cv2.NORM_MINMAX,
                                  dtype=cv2.CV_64F)
        depth_img = (depth_img * 255).astype(np.uint8)
        depth_img = cv2.applyColorMap(depth_img, cv2.COLORMAP_MAGMA)

        end = time.time()
        total_time.append(end - start)
        depth_video.append(depth_img)

        # Concatenate the original frame and the depth image side by side
        combined_img = np.hstack((cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2BGR), depth_img))

        # Display the combined frame
        cv2.imshow('Original and Depth Estimation', combined_img)

        # Exit on pressing 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

print(f"Total time: {np.sum(total_time):.2f}s")
print(f"Average time per frame: {np.mean(total_time):.2f}s")

# Release the camera and close all OpenCV windows
cap.release()
cv2.destroyAllWindows()

Total time: 33.31s
Average time per frame: 1.28s


# Run the model

Now that we have decomposed the video into images, we can run the model for each image.

In [93]:
# # Inference mode
# midas.eval()

# total_time = []
# depth_video = []

# for i in range(frame_count):

#   FRAME_PATH = f"./data/video_into_imagesbw/frame_{i:04d}.jpg"
#   img = cv2.imread(FRAME_PATH, cv2.IMREAD_COLOR)

#   # Apply transforms:
#   input_img = transform(img).to(device)

#   # Prediction and preprocess output:
#   with torch.no_grad():
#     start = time.time()
#     pred = midas(input_img)

#     pred = torch.nn.functional.interpolate(pred.unsqueeze(1),
#                                           size = img.shape[:2],
#                                           mode = "bicubic",
#                                           align_corners = False,
#                                           ).squeeze()
#     depth_img = pred.cpu().numpy()

#     #Resizing and coloring the image
#     depth_img = cv2.normalize(depth_img, None, 0, 1, norm_type = cv2.NORM_MINMAX,
#                               dtype = cv2.CV_64F)
#     depth_img = (depth_img*255).astype(np.uint8)
#     depth_img = cv2.applyColorMap(depth_img, cv2.COLORMAP_MAGMA)

#     end = time.time()
#     total_time.append(end - start)
#     depth_video.append(depth_img)

# print(f"Total time: {np.sum(total_time):.2f}s")
# print(f"Average time per image: {np.mean(total_time):.2f}s")

As we can see, the ``DPT_Hybrid`` model needs an everage of 0.15s inference + processing time, which is approximately 7 frames per second in the context of a real time depth estimation.

# Results

Now let's see the video !

In [94]:
# frame_size = (1920, 1080)  # Specify the width and height of your frames
# frame_rate = 30  # Frames per second
# codec = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4 format

# output_file = 'output_videobw.mp4'
# out = cv2.VideoWriter(output_file, codec, frame_rate, frame_size)

# for frame in depth_video:  # 'frames' is your list of RGB frames
#     out.write(frame)

# out.release()

In [95]:
Video(output_file, embed = False)

NameError: name 'output_file' is not defined

As we can see, the results on the video are quite detailed for a monocular camera depth estimation!