In [1]:
import torch
from src.depth_est_dpt import dpt_depth
from src.sem_seg_dpt import dpt_semantic, dpt_get_labels
from src.utils.image_utils import semantic_overlay, load_image, save_image, save_plot, norm_depth
import numpy as np
import cv2

import matplotlib.pyplot as plt
from PIL import Image

Some weights of DPTForDepthEstimation were not initialized from the model checkpoint at Intel/dpt-large and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.convolution2.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DPTForSemanticSegmentation were not initialized from the model checkpoint at Intel/dpt-large-ade and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.batch_norm1.num_batches_tracked', 'neck.fusion_stage.layers.0.residual_layer1.batch_norm1.running_mean', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.batch_norm1.running_var', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.weight', 'neck

In [2]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3080 Laptop GPU'

In [3]:

def get_dual_overlay(image: np.ndarray, seg_image: np.ndarray, depth_image: np.ndarray) -> np.ndarray:
    """Get overlay of segmentation and depth image.

    Args:
        image (np.ndarray): RGB image
        seg_image (np.ndarray): segmentation image
        depth_image (np.ndarray): depth image

    Returns:
        np.ndarray: overlay image
    """
    # normalize depth image
    depth_image_norm = norm_depth(depth_image)
    # interpolate to use the full color range
    depth_int_color = cv2.applyColorMap(depth_image_norm, cv2.COLORMAP_INFERNO)
    seg_street_color = np.zeros_like(depth_int_color)

    seg_street_color[ seg_street, :] = [0, 255, 55]
    seg_street_overlay = cv2.addWeighted(depth_int_color, 1, seg_street_color, 0.5, 0)
    seg_street_overlay[seg_street == False, :] = [0, 0, 0]
    full_overlay = cv2.addWeighted(image, 0.6, seg_street_overlay, 0.9, 0)

    return full_overlay


In [4]:
# Calibration values for depth estimation to distance
PX1 = 230
PX2 = 169
DST1 = 100
DST2 = 200  # cm

def get_interpolation(
    pxvalue1: int, dist1: int, pxvalue2: int, dist2: int, x: int, isDistance: bool = False
) -> int:
    """Get distance for pixel value x.
    Given two points and their relative distance to the camera,
    interpolate the distance for point x.
    Pixel values are are 0 for the closest point and 255 for the farthest point, which is infinity.

    Args:
        pxvalue1 (int): pixel value of first point
        dist1 (int): distance to first point
        pxvalue2 (int): pixel value of second point
        dist2 (int): distance to second point
        x (int): pixel value to interpolate.
        isDistance (bool, optional): If true, the distance is returned. Defaults to False.

    Returns:
        int: distance to pixel value x
    """
    # Avoid division by zero
    if dist1 == dist2:
        return dist1
    # logarithmic interpolation
    if not isDistance:
        result = np.exp(
            np.log(dist1)
            + (np.log(dist2) - np.log(dist1)) * (x - pxvalue1) / (pxvalue2 - pxvalue1)
        )
        return int(result)
    # Inverted interpolation for the distance
    result = pxvalue1 + (pxvalue2 - pxvalue1) * (np.log(x) - np.log(dist1)) / (
        np.log(dist2) - np.log(dist1)
    )
    return int(result)


def get_distance(depth_map: np.ndarray, x: int, y: int) -> int:
    """Get distance from depth map.

    Args:
        depth_map (np.ndarray): normalized depth map
        x (int): x coordinate
        y (int): y coordinate

    Returns:
        int: distance to point (x, y)
    """
    pxvalue = depth_map[x,y]
    return get_interpolation(PX1, DST1, PX2, DST2, pxvalue)

def get_horizon(depth_map: np.ndarray) -> int:
    """Get horizon from depth map.

    Args:
        depth_map (np.ndarray): normalized depth map

    Returns:
        int: horizon
    """
    x_middle = int(depth_map.shape[0] / 2)
    max_mask = depth_map[x_middle-11: x_middle+11, :]
    max_middle = depth_map[max_mask].max()
    y_max = np.argmin(max_middle, 0).min()
    return y_max


def get_pixel(depth_map: np.ndarray, dist: int, horizon: int = None) -> int:
    """Get pixel value from depth map.

    Args:
        depth_map (np.ndarray): normalized depth map
        dist (int): distance
        horizon (int, optional): horizon. Defaults to None.

    Returns:
        int: pixel value
    """
    depth_max = depth_map.max()
    dist_max = get_interpolation(PX1, DST1, PX2, DST2, depth_max)
    if dist < dist_max:
        return get_interpolation(PX1, DST1, PX2, DST2, dist, isDistance=True)
    if horizon is None:
        horizon = get_horizon(depth_map)

    return horizon


# get_pixel(depth_image_norm, 1900)

In [5]:

def get_overlay_figure(image: np.ndarray, overlay: np.ndarray ,depth_map: np.ndarray, seg_street: np.ndarray) -> plt.figure:
    """Get overlay image from depth map.

    Args:
        image (np.ndarray): image
        depth_map (np.ndarray): normalized depth map
        seg_street (np.ndarray): street segmentation

    Returns:
        plt.figure: overlay image
    """
    # Calibration values
    PX1 = 230 # 1m
    PX2 = 169 # 2m
    X_DISTANCE = 300 # cm
    x_distance_halve = int(X_DISTANCE / 2)
    horizon_pixel = 269

    # Set color for each distance
    distance_color = np.zeros_like(image)
    distance_color[depth_image_norm == PX1] = [255, 0, 255]
    distance_color[depth_image_norm == PX2] = [0, 255, 0]

    # plt.imshow(depth_int_color)
    overlay = cv2.addWeighted(full_overlay, 1, distance_color, 0.5, 0)
    fig = plt.figure(figsize=(10, 10))
    fig.add_subplot(111)
    fig.axes[0].imshow(overlay)
    # interpolate the length of the road
    # inverted_seg_street = np.reciprocal(seg_street)
    horizon_distance = get_distance(depth_image_norm, 0, horizon_pixel)
    n_ticks = 5

    y_ticks = np.linspace(image.shape[0], horizon_pixel, n_ticks).astype(int)
    # y_ticks_ = np.linspace(horizon_distance, 0, n_ticks)

    print("ytickz")
    print(y_ticks, horizon_pixel, horizon_distance)

    # yticks = [get_pixel(depth_image_norm, x) for x in range(0, horizon_distance, np.round(horizon_distance/n_ticks).astype(int))]
    y_labels = [get_distance(depth_image_norm, 900, int(1080-y-horizon_pixel)) for y in y_ticks[::-1]]
    print("ylabels")
    print(y_labels)

    fig.axes[0].set_yticks(y_ticks)
    fig.axes[0].set_yticklabels([f"{int(np.power((x/9)-14, 2))} m" for x in np.sort(y_labels)])

    x_ticks = np.linspace(0, image.shape[1], n_ticks)

    x_labels  = np.linspace(-x_distance_halve, x_distance_halve, n_ticks).astype(int)

    fig.axes[0].set_xticks(x_ticks)
    fig.axes[0].set_xticklabels([f"{int(x)} cm" for x in x_labels])

    fig.axes[0].set_xlabel("Distance to camera")
    fig.axes[0].set_ylabel("Distance to horizon")

    fig.axes[0].set_title("Road Segmentation and Distance-Estimation")

    return fig



In [6]:
# Process VIdeo
from tqdm import tqdm

cap = cv2.VideoCapture('data/sections.mov')
i_frame = 0
start_frame = 1697

while(cap.isOpened()):
    ret, frame = cap.read()
    if i_frame < start_frame:
        i_frame += 1
        continue

    depth_image = dpt_depth(frame)
    depth_image_norm = norm_depth(depth_image)
    seg_image = dpt_semantic(frame)
    seg_street = seg_image == 6
    full_overlay = get_dual_overlay(frame, seg_street, depth_image_norm)
    fig = get_overlay_figure(frame, full_overlay, depth_image_norm, seg_street)
    # save the figure
    fig.savefig('res/overlay/overlay_{:04d}.png'.format(i_frame), bbox_inches='tight', pad_inches=0)

    i_frame += 1
    print("Frame", i_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break


ytickz
[1080  877  674  471  269] 269 1289
ylabels
[145, 147, 150, 140, 142]
Frame 1698
ytickz
[1080  877  674  471  269] 269 1289
ylabels
[152, 152, 154, 140, 143]
Frame 1699
ytickz
[1080  877  674  471  269] 269 1289
ylabels
[157, 161, 164, 142, 145]
Frame 1700
ytickz
[1080  877  674  471  269] 269 1260
ylabels
[154, 155, 159, 142, 145]
Frame 1701
ytickz
[1080  877  674  471  269] 269 1289
ylabels
[159, 163, 166, 143, 145]
Frame 1702
ytickz
[1080  877  674  471  269] 269 1260
ylabels
[157, 161, 164, 142, 145]
Frame 1703
ytickz
[1080  877  674  471  269] 269 1274
ylabels
[152, 155, 161, 142, 143]
Frame 1704
ytickz
[1080  877  674  471  269] 269 1274
ylabels
[150, 155, 161, 139, 142]
Frame 1705
ytickz
[1080  877  674  471  269] 269 1274
ylabels
[154, 157, 163, 139, 142]
Frame 1706
ytickz
[1080  877  674  471  269] 269 1246
ylabels
[152, 157, 164, 140, 142]
Frame 1707
ytickz
[1080  877  674  471  269] 269 1274
ylabels
[155, 159, 168, 140, 143]
Frame 1708
ytickz
[1080  877  674  471  269

  fig = plt.figure(figsize=(10, 10))


ytickz
[1080  877  674  471  269] 269 1260
ylabels
[143, 148, 154, 135, 135]
Frame 1718
ytickz
[1080  877  674  471  269] 269 1274
ylabels
[150, 154, 157, 139, 139]
Frame 1719
ytickz
[1080  877  674  471  269] 269 1260
ylabels
[147, 150, 154, 137, 139]
Frame 1720
ytickz
[1080  877  674  471  269] 269 1260
ylabels
[152, 155, 161, 139, 140]
Frame 1721
ytickz
[1080  877  674  471  269] 269 1304
ylabels
[154, 155, 159, 142, 142]
Frame 1722
ytickz
[1080  877  674  471  269] 269 1304
ylabels
[154, 157, 161, 142, 142]
Frame 1723
ytickz
[1080  877  674  471  269] 269 1304
ylabels
[148, 152, 155, 140, 139]
Frame 1724
ytickz
[1080  877  674  471  269] 269 1318
ylabels
[148, 152, 155, 139, 139]
Frame 1725
ytickz
[1080  877  674  471  269] 269 1163
ylabels
[145, 147, 152, 140, 139]
Frame 1726
ytickz
[1080  877  674  471  269] 269 1334
ylabels
[148, 150, 155, 142, 142]
Frame 1727
ytickz
[1080  877  674  471  269] 269 1218
ylabels
[152, 154, 159, 143, 143]
Frame 1728
ytickz
[1080  877  674  471  269

: 

: 