In [1]:
import numpy as np
import argparse
import glob
import os
from functools import partial
import vispy
import scipy.misc as misc
from tqdm import tqdm
import time
import sys
from utils import get_MiDaS_samples, read_MiDaS_depth
import torch
import cv2
from skimage.transform import resize
import imageio
import copy
from MiDaS.monodepth_net import MonoDepthNet
import MiDaS.MiDaS_utils as MiDaS_utils
from bilateral_filtering import sparse_bilateral_filtering
import matplotlib.pyplot as plt

# Config
`config` is a dictionary containing the model parameters (checkpoint paths, input/output folders, and filtering settings), which have been fine-tuned to give optimal results.

In [2]:
# Settings shared by the cells below and by the imported helper functions.
# Keys this notebook does not read itself are passed along inside `config`
# to helpers such as get_MiDaS_samples and sparse_bilateral_filtering.
config = {
    # --- Model checkpoints ---
    "depth_edge_model_ckpt": "checkpoints/edge-model.pth",
    "depth_feat_model_ckpt": "checkpoints/depth-model.pth",
    "rgb_feat_model_ckpt": "checkpoints/color-model.pth",
    "MiDaS_model_ckpt": "MiDaS/model.pt",
    # --- Video / trajectory settings ---
    "fps": 40,
    "num_frames": 240,
    "x_shift_range": [0.00, 0.00, -0.02, -0.02],
    "y_shift_range": [0.00, 0.00, -0.02, -0.00],
    "z_shift_range": [-0.05, -0.05, -0.07, -0.07],
    "traj_types": ["double-straight-line", "double-straight-line", "circle", "circle"],
    "video_postfix": ["dolly-zoom-in", "zoom-in", "circle", "swing"],
    # --- Input selection, sizing and folders ---
    "specific": "",
    "longer_side_len": 960,  # longer side of the working image, in pixels
    "src_folder": "../images",
    "depth_folder": "../depth",
    "mesh_folder": "../mesh",
    "video_folder": "../video",
    "output_folder": "../outputs",
    # --- Run-time switches and file formats ---
    "load_ply": False,
    "save_ply": True,
    "inference_video": True,
    "gpu_ids": 0,
    "offscreen_rendering": False,
    "img_format": ".jpg",
    "depth_format": ".npy",
    "require_midas": True,
    # --- Depth-edge and bilateral-filtering parameters ---
    "depth_threshold": 0.04,
    "ext_edge_threshold": 0.002,
    "sparse_iter": 5,  # number of sparse bilateral filtering iterations
    "filter_size": [7, 7, 5, 5, 5],
    "sigma_s": 4.0,
    "sigma_r": 0.5,
    # --- Remaining parameters (not read in this notebook section) ---
    "redundant_number": 12,
    "background_thickness": 70,
    "context_thickness": 140,
    "background_thickness_2": 70,
    "context_thickness_2": 70,
    "discount_factor": 1.00,
    "log_depth": True,
    "largest_size": 512,
    "depth_edge_dilate": 10,
    "depth_edge_dilate_2": 5,
    "extrapolate_border": True,
    "extrapolation_thickness": 60,
    "repeat_inpaint_edge": True,
    "crop_border": [0.03, 0.03, 0.05, 0.03],
    "anti_flickering": True,
}

# run_depth
`run_depth` takes a list of image paths, predicts the depth at each pixel of every image using the MiDaS model, and saves the resulting depth maps to the output folder.

In [3]:
def run_depth(img_names, input_path, output_path, model_path, Net, utils, target_w=None):
    """Predict a depth map for every image and save it under ``output_path``.

    For each entry of ``img_names`` two outputs are produced, both named after
    the image's basename without its extension: ``<name>.npy`` (written with
    ``np.save``) and whatever ``utils.write_depth`` writes for ``<name>``.

    Args:
        img_names: Sequence of image paths to process.
        input_path: Unused; kept so existing callers keep working.
        output_path: Directory that receives the results. Created if missing.
        model_path: Passed to ``Net`` to construct the model.
        Net: Callable that builds the network (a ``torch.nn.Module``) from
            ``model_path``.
        utils: Object providing ``read_image``, ``resize_image``,
            ``resize_depth`` and ``write_depth``.
        target_w: Unused; kept so existing callers keep working. The output is
            always scaled so that its longer side is 640 pixels.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Inference is pinned to the CPU.
    device = torch.device("cpu")
    model = Net(model_path)
    model.to(device)
    model.eval()
    num_images = len(img_names)
    os.makedirs(output_path, exist_ok=True)

    for ind, img_name in enumerate(img_names):
        print("  processing {} ({}/{})".format(img_name, ind + 1, num_images))
        img = utils.read_image(img_name)

        # Output size: keep the aspect ratio, longer side becomes 640 pixels.
        scale = 640. / max(img.shape[0], img.shape[1])
        target_height = int(round(img.shape[0] * scale))
        target_width = int(round(img.shape[1] * scale))

        img_input = utils.resize_image(img).to(device)

        with torch.no_grad():
            # Call the module itself rather than .forward() so that any
            # registered forward hooks are honoured.
            out = model(img_input)

        depth = utils.resize_depth(out, target_width, target_height)

        filename = os.path.join(
            output_path, os.path.splitext(os.path.basename(img_name))[0]
        )
        np.save(filename + '.npy', depth)
        utils.write_depth(filename, depth, bits=2)

    print("Finished run depth.")


# read_MiDaS_depth
`read_MiDaS_depth` loads the disparity map predicted by MiDaS, smooths it with a 3×3 blur, rescales it, and takes its inverse to obtain depth. As a result, the foreground (objects close to the camera) receives small depth values, while the background, which contains everything else, receives larger ones.

In [4]:
def read_MiDaS_depth(disp_fi, disp_rescale=10., h=None, w=None):
    """Load a disparity map from disk and convert it to a depth map.

    The disparity is shifted so its minimum is zero, smoothed with a 3x3 box
    blur, rescaled so its maximum equals ``disp_rescale``, optionally resized,
    and finally inverted. Disparities below 0.05 are clamped before inversion,
    so the returned depth never exceeds 20.

    Args:
        disp_fi: Path to the disparity file. Files whose extension contains
            ``npy`` are read with ``np.load``; anything else is read as an
            image and converted to float32.
        disp_rescale: Maximum disparity after rescaling.
        h: Target height. Resizing happens only when both ``h`` and ``w`` are
            given.
        w: Target width. See ``h``.

    Returns:
        Array holding ``1 / max(disparity, 0.05)`` per element.
    """
    if 'npy' in os.path.splitext(disp_fi)[-1]:
        disp = np.load(disp_fi)
    else:
        disp = imageio.imread(disp_fi).astype(np.float32)
    disp = disp - disp.min()
    peak = disp.max()
    if peak == 0:
        # A constant map has zero range after the min-subtraction; dividing by
        # that zero range would fill the result with NaNs. Keep the all-zero
        # disparity instead, which the clamp below turns into the far depth.
        if h is not None and w is not None:
            disp = np.zeros((h, w) + disp.shape[2:], dtype=disp.dtype)
    else:
        # Blur on the normalised map, then restore the original scale.
        disp = cv2.blur(disp / peak, ksize=(3, 3)) * peak
        disp = (disp / disp.max()) * disp_rescale
        if h is not None and w is not None:
            # Resize on the normalised map, then restore the scale.
            disp = resize(disp / disp.max(), (h, w), order=1) * disp.max()
    # Clamp tiny disparities so the inversion stays bounded.
    depth = 1. / np.maximum(disp, 0.05)

    return depth

# Depth Computation and Bilateral Filtering
We loop over every image. For each image, the depth map is computed and smoothed. Then a bilateral median filter is applied to sharpen the depth map, and a discontinuity map is computed, separating the foreground from the background. The output images are saved for later use.

In [6]:
# For every input image: predict depth with MiDaS, derive the working
# resolution, run sparse bilateral filtering, and save the filtered image.
sample_list = get_MiDaS_samples(config['src_folder'], config['depth_folder'], config, config['specific'])

# cv2.imwrite does not create missing directories; it only returns False.
os.makedirs(config['output_folder'], exist_ok=True)

for idx, sample in enumerate(tqdm(sample_list)):
    image = imageio.imread(sample['ref_img_fi'])
    print("Running depth extraction on image", sample['ref_img_fi'])
    run_depth([sample['ref_img_fi']], config['src_folder'], config['depth_folder'],
              config['MiDaS_model_ckpt'], MonoDepthNet, MiDaS_utils, target_w=640)

    # Working resolution: the saved depth map scaled so that its longer side
    # equals config['longer_side_len'].
    config['output_h'], config['output_w'] = np.load(sample['depth_fi']).shape[:2]
    frac = config['longer_side_len'] / max(config['output_h'], config['output_w'])
    config['output_h'], config['output_w'] = int(config['output_h'] * frac), int(config['output_w'] * frac)
    config['original_h'], config['original_w'] = config['output_h'], config['output_w']

    # Expand single-channel images to three identical channels.
    if image.ndim == 2:
        image = image[..., None].repeat(3, -1)
    # An image counts as gray when its first three channels are identical.
    config['gray_image'] = bool(
        np.sum(np.abs(image[..., 0] - image[..., 1])) == 0
        and np.sum(np.abs(image[..., 1] - image[..., 2])) == 0
    )

    image = cv2.resize(image, (config['output_w'], config['output_h']), interpolation=cv2.INTER_AREA)
    depth = read_MiDaS_depth(sample['depth_fi'], 3.0, config['output_h'], config['output_w'])
    mean_loc_depth = depth[depth.shape[0]//2, depth.shape[1]//2]  # depth at the image centre
    vis_photos, vis_depths = sparse_bilateral_filtering(depth.copy(), image.copy(), config, num_iter=config['sparse_iter'], spdb=False)
    # Keep the result of the last filtering iteration.
    depth = vis_depths[-1]
    img = vis_photos[-1]

    # The image was read with imageio (RGB); cv2.imwrite expects BGR.
    img_bgr = cv2.cvtColor(np.uint8(img), cv2.COLOR_RGB2BGR)
    output_fi = os.path.join(config["output_folder"], sample["src_pair_name"] + ".jpg")
    # imwrite signals failure through its return value, not an exception.
    if not cv2.imwrite(output_fi, img_bgr):
        raise IOError("Could not write output image " + output_fi)
    print("Written output as " + sample["src_pair_name"] + ".jpg")

  0%|          | 0/3 [00:00<?, ?it/s]

Running depth extraction on image ../image/tiger.jpg
  processing ../image/tiger.jpg (1/1)



  0%|          | 0/5 [00:00<?, ?it/s][A

Finished run depth.
Starting Sparse Bilateral Filtering



 20%|██        | 1/5 [00:05<00:22,  5.55s/it][A
 40%|████      | 2/5 [00:10<00:16,  5.37s/it][A
 60%|██████    | 3/5 [00:14<00:09,  4.96s/it][A
 80%|████████  | 4/5 [00:18<00:04,  4.70s/it][A
100%|██████████| 5/5 [00:22<00:00,  4.49s/it][A
 33%|███▎      | 1/3 [00:24<00:48, 24.50s/it]

Written output as tiger.jpg
Running depth extraction on image ../image/moon.jpg
  processing ../image/moon.jpg (1/1)



  0%|          | 0/5 [00:00<?, ?it/s][A

Finished run depth.
Starting Sparse Bilateral Filtering



 20%|██        | 1/5 [00:05<00:21,  5.45s/it][A
 40%|████      | 2/5 [00:10<00:16,  5.42s/it][A
 60%|██████    | 3/5 [00:15<00:10,  5.22s/it][A
 80%|████████  | 4/5 [00:20<00:05,  5.03s/it][A
100%|██████████| 5/5 [00:24<00:00,  4.94s/it][A
 67%|██████▋   | 2/3 [00:51<00:25, 25.34s/it]

Written output as moon.jpg
Running depth extraction on image ../image/ball.jpg
  processing ../image/ball.jpg (1/1)



  0%|          | 0/5 [00:00<?, ?it/s][A

Finished run depth.
Starting Sparse Bilateral Filtering



 20%|██        | 1/5 [00:03<00:15,  3.98s/it][A
 40%|████      | 2/5 [00:07<00:11,  3.92s/it][A
 60%|██████    | 3/5 [00:10<00:07,  3.69s/it][A
 80%|████████  | 4/5 [00:13<00:03,  3.51s/it][A
100%|██████████| 5/5 [00:17<00:00,  3.41s/it][A
100%|██████████| 3/3 [01:10<00:00, 23.59s/it]

Written output as ball.jpg



