### Code (vitl model)

In [None]:
import sys, os, glob

REPO = "/content/drive/MyDrive/Depthanythingv2"

# Find every folder named depth_anything_v2 below REPO, but keep only the ones
# that actually hold the module imported below (dpt.py). This drops stray or
# empty folders that merely share the name.
dirs = [
    d for d in glob.glob(REPO + "/**/depth_anything_v2", recursive=True)
    if os.path.isfile(os.path.join(d, "dpt.py"))
]

if not dirs:
    raise RuntimeError("Can't find depth_anything_v2 folder anywhere under REPO.")

# glob returns matches in arbitrary order, so dirs[0] used to depend on the
# filesystem. Prefer the shallowest match (ties broken alphabetically) so the
# same package is picked on every run.
# NOTE(review): a checkout may contain a second, nested copy of the package in a
# sub-project folder - confirm the shallowest one is the intended copy.
dirs.sort(key=lambda d: (d.count(os.sep), d))

module_dir = dirs[0]                      # .../depth_anything_v2
repo_root = os.path.dirname(module_dir)   # the folder that CONTAINS depth_anything_v2

# Put the package root first on sys.path so `import depth_anything_v2` resolves to it.
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

print("Using repo_root:", repo_root)
print("sys.path[0]:", sys.path[0])

from depth_anything_v2.dpt import DepthAnythingV2
print("OK import DepthAnythingV2")

Using repo_root: /content/drive/MyDrive/Depthanythingv2/Depth-Anything-V2
sys.path[0]: /content/drive/MyDrive/Depthanythingv2/Depth-Anything-V2




OK import DepthAnythingV2


In [None]:
import os, glob, sys, cv2, numpy as np, torch
from tqdm import tqdm

# Your image folder on Drive
# (the run cell globs *.jpg / *.jpeg / *.png directly inside this folder, not recursively)
IMG_DIR = "/content/drive/MyDrive/Segmentation/dataset"

# Output folders (new)
# OUT_REL receives the 16-bit relative-depth PNGs, OUT_VIZ the colour-mapped JPEG previews.
OUT_REL = "/content/drive/MyDrive/Segmentation/depth-maps/raw_da2_vitl896_tta"
OUT_VIZ = "/content/drive/MyDrive/Segmentation/depth-maps/viz_da2_vitl896_tta"

# DA2 repo path + checkpoint path
REPO = "/content/drive/MyDrive/Depthanythingv2"
CKPT = "/content/drive/MyDrive/Depthanythingv2/checkpoints/depth_anything_v2_vitl.pth"

# Create both output folders up front; exist_ok keeps the cell safe to re-run.
os.makedirs(OUT_REL, exist_ok=True)
os.makedirs(OUT_VIZ, exist_ok=True)

# Make sure we can import the library
# NOTE(review): the discovery cell at the top printed REPO/Depth-Anything-V2 as the
# folder that contains depth_anything_v2, so appending REPO itself presumably does not
# make the package importable - this cell seems to rely on that earlier cell having
# been run first. Confirm.
sys.path.append(REPO)


In [None]:
from depth_anything_v2.dpt import DepthAnythingV2

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
# (The recorded output of this cell shows it ran on 'cpu'.)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Constructor keyword arguments for the ViT-L variant; only this variant is configured.
model_configs = {
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
}

model = DepthAnythingV2(**model_configs['vitl'])
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source. Consider torch.load(..., weights_only=True) if the installed torch supports it.
state = torch.load(CKPT, map_location='cpu')
# strict=True: fail loudly if the checkpoint keys do not match the model exactly.
model.load_state_dict(state, strict=True)
# Move the weights to the chosen device and switch to inference mode.
model = model.to(DEVICE).eval()

print("Loaded vitl on", DEVICE)


Loaded vitl on cpu


In [None]:
def letterbox(img, target=896):
    """Resize `img` so its longer side equals `target`, then pad it to a square.

    The aspect ratio is kept; the remaining space is filled with black pixels,
    split as evenly as possible between the two sides (the extra pixel, if any,
    goes to the bottom / right).

    Returns:
        (padded_image, (top, bottom, left, right), (orig_h, orig_w))
    """
    orig_h, orig_w = img.shape[:2]
    scale = target / max(orig_h, orig_w)
    new_h = int(round(orig_h * scale))
    new_w = int(round(orig_w * scale))
    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    # Total padding per axis, then the per-side split.
    pad_h = target - new_h
    pad_w = target - new_w
    top, left = pad_h // 2, pad_w // 2
    bottom, right = pad_h - top, pad_w - left

    padded = cv2.copyMakeBorder(
        resized, top, bottom, left, right,
        cv2.BORDER_CONSTANT, value=(0, 0, 0),
    )
    return padded, (top, bottom, left, right), (orig_h, orig_w)

def unletterbox(arr, pads, orig_hw):
    """Undo `letterbox`: crop the padding away and resize back to the original size.

    Args:
        arr: padded array (height, width[, ...]).
        pads: (top, bottom, left, right) padding widths.
        orig_hw: (height, width) to resize the cropped array to.
    """
    top, bottom, left, right = pads
    padded_h, padded_w = arr.shape[0], arr.shape[1]
    # Explicit end indices (not negative slices) so a zero pad keeps the full extent.
    cropped = arr[top:padded_h - bottom, left:padded_w - right]
    out_h, out_w = orig_hw[0], orig_hw[1]
    # cv2.resize takes the size as (width, height).
    return cv2.resize(cropped, (out_w, out_h), interpolation=cv2.INTER_LINEAR)

def infer_tta_rel(img_bgr):
    """Flip test-time augmentation: average the prediction for the image and for
    its horizontal mirror (mirrored back before averaging).

    Uses the notebook-level `model`. Returns a float32 array.
    """
    plain = model.infer_image(img_bgr).astype(np.float32)

    mirrored_input = cv2.flip(img_bgr, 1)
    mirrored = model.infer_image(mirrored_input).astype(np.float32)
    # Reverse the column order so the mirrored prediction lines up with `plain`.
    realigned = mirrored[:, ::-1]

    averaged = (plain + realigned) * 0.5
    return averaged.astype(np.float32)

def to_u16_rel(depth):
    """Quantise a relative depth map to uint16 with a robust 1st-99th percentile stretch.

    Steps:
      1. NaN / +inf / -inf are replaced by 0.
      2. If fewer than 10 pixels are strictly positive, an all-zero uint16 array
         is returned (nothing sensible to normalise).
      3. Otherwise the 1st and 99th percentiles of the positive pixels define the
         range; values are clipped to it and rescaled to [0, 1].
      4. The result is scaled to 0..65535 and floored at 1, so a normally
         processed map never contains an exact 0.

    The floor is applied after the integer cast. The previous version clipped the
    normalised float to 1e-6, but 1e-6 * 65535 truncates to 0, so exact zeros were
    still written despite the stated intent.

    Args:
        depth: numpy array of depth values (any float/integer dtype).

    Returns:
        uint16 array with the same shape as `depth`.
    """
    d = depth.astype(np.float32)
    d = np.nan_to_num(d, nan=0.0, posinf=0.0, neginf=0.0)

    positive = d > 0
    if positive.sum() < 10:
        return np.zeros_like(d, dtype=np.uint16)

    p1, p99 = np.percentile(d[positive], (1, 99))
    p1 = np.float32(p1); p99 = np.float32(p99)

    d = np.clip(d, p1, p99)
    # Guard the division for (near-)constant maps where p99 == p1.
    d = (d - p1) / max(np.float32(1e-6), (p99 - p1))
    d = np.clip(d, np.float32(0.0), np.float32(1.0))

    q = (d * np.float32(65535.0)).astype(np.uint16)
    return np.maximum(q, np.uint16(1))   # avoid exact zeros


In [None]:
# Collect the input images (non-recursive). The set removes duplicates in case the
# lower/upper-case patterns match the same file on a case-insensitive filesystem.
paths = sorted({
    p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
    for p in glob.glob(os.path.join(IMG_DIR, ext))
})

n_saved = 0
for p in tqdm(paths):
    img = cv2.imread(p)                    # uint8, or None if the file cannot be decoded
    if img is None:
        # One corrupt file must not abort a run that takes hours; report it and move on.
        tqdm.write(f"[skip] could not read image: {p}")
        continue

    img_p, pads, orig_hw = letterbox(img, 896)
    depth_rel_p = infer_tta_rel(img_p)     # float32
    depth_rel = unletterbox(depth_rel_p, pads, orig_hw).astype(np.float32)

    rel16 = to_u16_rel(depth_rel)
    stem = os.path.splitext(os.path.basename(p))[0]
    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(f"{OUT_REL}/{stem}.png", rel16):
        tqdm.write(f"[warn] could not write depth PNG for: {p}")
        continue

    # 8-bit preview: keep the high byte of the 16-bit map, invert it, colour-map it.
    viz = cv2.applyColorMap(255 - (rel16 // 256).astype(np.uint8), cv2.COLORMAP_INFERNO)
    cv2.imwrite(f"{OUT_VIZ}/{stem}.jpg", viz)
    n_saved += 1

print("Saved", n_saved, "depth maps to", OUT_REL)


100%|██████████| 563/563 [11:22:28<00:00, 72.73s/it]

Saved 563 depth maps to /content/drive/MyDrive/Segmentation/depth-maps/raw_da2_vitl896_tta





In [None]:
# List the files in the raw_da2_vitl896_tta depth folder.
# NOTE(review): this path is under Depthanythingv2/output-vitl/depths, while the run cell
# above wrote OUT_REL under Segmentation/depth-maps - confirm which location is intended.
!ls  /content/drive/MyDrive/Depthanythingv2/output-vitl/depths/raw_da2_vitl896_tta

In [None]:
import os, glob, sys, cv2, numpy as np, torch
from tqdm import tqdm

# ==== paths (same as yours) ====
# IMG_DIR: evaluation images; OUT_REL: 16-bit depth PNGs; OUT_VIZ: colour-mapped JPEG previews.
IMG_DIR = "/content/drive/MyDrive/Depthanythingv2/data/eval"
OUT_REL = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depths/raw_da2_vitl896_1152_ms_tta4_jbf"
OUT_VIZ = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depths/viz_da2_vitl896_1152_ms_tta4_jbf"
REPO    = "/content/drive/MyDrive/Depthanythingv2"
CKPT    = f"{REPO}/checkpoints/depth_anything_v2_vitl.pth"
# Create both output folders; exist_ok keeps the cell safe to re-run.
os.makedirs(OUT_REL, exist_ok=True); os.makedirs(OUT_VIZ, exist_ok=True)
# NOTE(review): the discovery cell at the top printed REPO/Depth-Anything-V2 as the folder
# that contains depth_anything_v2, so appending REPO itself presumably does not make the
# import below work on a fresh kernel - confirm this cell is not relying on that earlier cell.
sys.path.append(REPO)

from depth_anything_v2.dpt import DepthAnythingV2
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256,512,1024,1024])
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
# strict=True fails loudly if the checkpoint keys do not match the model exactly.
state = torch.load(CKPT, map_location='cpu'); model.load_state_dict(state, strict=True)
# Move the weights to the chosen device and switch to inference mode.
model = model.to(DEVICE).eval()
print("Loaded vitl on", DEVICE)

# ---------- helpers ----------
def letterbox_reflect(img, target):
    """Resize `img` so its longer side equals `target`, then pad to a square by
    reflecting the image content (BORDER_REFLECT_101) instead of adding a flat colour.

    Area interpolation is used when shrinking, cubic otherwise. Padding is split
    as evenly as possible; the extra pixel, if any, goes to the bottom / right.

    Returns:
        (padded_image, (top, bottom, left, right), (orig_h, orig_w))
    """
    orig_h, orig_w = img.shape[:2]
    scale = target / max(orig_h, orig_w)
    new_h = int(round(orig_h * scale))
    new_w = int(round(orig_w * scale))

    if scale < 1.0:
        interp = cv2.INTER_AREA
    else:
        interp = cv2.INTER_CUBIC
    resized = cv2.resize(img, (new_w, new_h), interpolation=interp)

    pad_h = target - new_h
    pad_w = target - new_w
    top, left = pad_h // 2, pad_w // 2
    bottom, right = pad_h - top, pad_w - left

    padded = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_REFLECT_101)
    return padded, (top, bottom, left, right), (orig_h, orig_w)

def unletterbox(arr, pads, orig_hw):
    """Strip the letterbox padding from `arr` and resize the rest to `orig_hw`.

    Args:
        arr: padded array (height, width[, ...]).
        pads: (top, bottom, left, right) padding widths to remove.
        orig_hw: (height, width) of the result.
    """
    top, bottom, left, right = pads
    # End indices are computed explicitly so that a pad of 0 keeps the whole axis.
    row_end = arr.shape[0] - bottom
    col_end = arr.shape[1] - right
    inner = arr[top:row_end, left:col_end]
    target_size = (orig_hw[1], orig_hw[0])   # cv2 expects (width, height)
    return cv2.resize(inner, target_size, interpolation=cv2.INTER_LINEAR)

@torch.no_grad()
def infer_one(img_bgr):
    """Run the notebook-level `model` once on `img_bgr`, with autograd disabled.

    Per the original note, `model.infer_image` takes a uint8 BGR image and
    returns a relative depth map; the result is cast to float32 here.
    """
    prediction = model.infer_image(img_bgr)
    return prediction.astype(np.float32)

def infer_tta4(img_bgr):
    """Four-way flip test-time augmentation.

    Predicts depth for the image as-is and for its horizontal, vertical and
    horizontal+vertical mirrors, undoes each mirror on the prediction, and
    returns the plain average of the four maps.
    """
    mirrored_h = cv2.flip(img_bgr, 1)
    mirrored_v = cv2.flip(img_bgr, 0)
    mirrored_hv = cv2.flip(mirrored_h, 0)

    # Reversed slices undo the mirroring so all four maps share one pixel grid.
    pred_plain = infer_one(img_bgr)
    pred_h = infer_one(mirrored_h)[:, ::-1]
    pred_v = infer_one(mirrored_v)[::-1]
    pred_hv = infer_one(mirrored_hv)[::-1, ::-1]

    return (pred_plain + pred_h + pred_v + pred_hv) / 4.0

def infer_ms_tta(img_bgr, sizes=(896, 1152)):
    """Multi-scale inference: run `infer_tta4` at each square size in `sizes`
    and average the results.

    Every per-scale prediction is brought back to the original image size
    before averaging, so the maps line up pixel for pixel. Returns float32.
    """
    def _predict_at(side):
        # Letterbox to side x side, predict, then undo the letterbox.
        padded, pads, orig_hw = letterbox_reflect(img_bgr, side)
        restored = unletterbox(infer_tta4(padded), pads, orig_hw)
        return restored.astype(np.float32)

    per_scale = [_predict_at(side) for side in sizes]
    return np.mean(per_scale, axis=0).astype(np.float32)

def joint_bilateral_depth(depth, guide_bgr, ds=7, dr=0.1, iters=2):
    """Edge-aware smoothing of a depth map, steered by edges of the colour image.

    OpenCV has no true joint bilateral filter in the core module, so this
    approximates one: the depth map is bilaterally filtered, and the filtered
    result is blended with the unfiltered depth using a mask that is ~0 on
    (blurred) Canny edges of the guide and ~1 elsewhere. Pixels on image edges
    therefore keep their value while the rest is smoothed.

    Args:
        depth: relative depth, expected in [0, 1] (float).
        guide_bgr: uint8 guide image with the same height/width as `depth`;
            BGR, or already single-channel.
        ds: bilateral neighbourhood diameter, also used as the spatial sigma.
        dr: range (colour) sigma in depth units, i.e. a fraction of the [0, 1]
            depth range.
        iters: number of filter + blend passes.

    Returns:
        float32 array with the same shape as `depth`.
    """
    d = depth.astype(np.float32)   # astype copies, so the caller's array is untouched

    # The guide never changes, so build the edge-preservation mask once rather
    # than on every pass.
    gray = guide_bgr if guide_bgr.ndim == 2 else cv2.cvtColor(guide_bgr, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150).astype(np.float32) / 255.0
    w = cv2.GaussianBlur(edges, (0, 0), 1.0)
    w = np.clip(1.0 - w, 0.0, 1.0).astype(np.float32)

    for _ in range(iters):
        # sigmaColor is expressed in the units of the filtered image. The depth is
        # float in [0, 1], so `dr` is passed directly. The old `dr * 255.0` (a uint8
        # scale) made the range kernel flat, reducing this to a plain Gaussian blur
        # that smeared depth discontinuities.
        d_blur = cv2.bilateralFilter(d, ds, dr, ds)
        # Keep strong edges from the current depth, smooth elsewhere.
        d = w * d_blur + (1.0 - w) * d
    return d

def to_u16_rel(depth):
    """Quantise a relative depth map to uint16 using a 0.5-99.5 percentile stretch.

    Non-finite values become 0. If fewer than 10 pixels are strictly positive
    an all-zero array is returned. Otherwise the percentiles of the positive
    pixels define the range mapped to 0..65535; values outside are clipped.
    """
    clean = np.nan_to_num(depth.astype(np.float32), nan=0.0, posinf=0.0, neginf=0.0)

    positive = clean > 0
    if np.count_nonzero(positive) < 10:
        return np.zeros_like(clean, dtype=np.uint16)

    lo, hi = (np.float32(v) for v in np.percentile(clean[positive], (0.5, 99.5)))
    # Guard the division for (near-)constant maps.
    span = max(np.float32(1e-6), hi - lo)

    unit = np.clip((clean - lo) / span, 0, 1)
    return (unit * np.float32(65535.0)).astype(np.uint16)

# ---------- run ----------
# Collect the input images (non-recursive). The set removes duplicates in case the
# lower/upper-case patterns match the same file on a case-insensitive filesystem.
paths = sorted({p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
                for p in glob.glob(os.path.join(IMG_DIR, ext))})

n_saved = 0
for p in tqdm(paths):
    img = cv2.imread(p)
    if img is None:
        # cv2.imread returns None for unreadable files; skip instead of crashing mid-run.
        tqdm.write(f"[skip] could not read image: {p}")
        continue

    depth_rel = infer_ms_tta(img, sizes=(896,1152))  # <- multi-scale + TTA(4)

    # edge-preserving refinement guided by RGB to sharpen boundaries / flatten planes
    x = depth_rel.copy()
    # bring to [0,1] before refinement to stabilize bilateral behavior
    # (skipped when there are fewer than 10 positive pixels to estimate the range from)
    if (x>0).sum() >= 10:
        q1, q99 = np.percentile(x[x>0], (1,99))
        x = np.clip((x - q1) / max(1e-6, (q99 - q1)), 0, 1).astype(np.float32)
    x = joint_bilateral_depth(x, img, ds=7, dr=0.08, iters=2)
    depth_rel = x.astype(np.float32)

    stem = os.path.splitext(os.path.basename(p))[0]
    rel16 = to_u16_rel(depth_rel)
    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(f"{OUT_REL}/{stem}.png", rel16):
        tqdm.write(f"[warn] could not write depth PNG for: {p}")
        continue

    # 8-bit preview: keep the high byte of the 16-bit map, invert it, colour-map it.
    viz = cv2.applyColorMap(255 - (rel16 // 256).astype(np.uint8), cv2.COLORMAP_INFERNO)
    cv2.imwrite(f"{OUT_VIZ}/{stem}.jpg", viz)
    n_saved += 1

print("Saved", n_saved, "depth maps to", OUT_REL)


### Evaluation (vitl model)

In [None]:
import os, glob, cv2, numpy as np, pandas as pd
from tqdm import tqdm

# --- paths: point DEP_DIR to your vitl outputs ---
# IMG_DIR:    RGB images to evaluate against.
# DEP_DIR:    16-bit depth PNGs; each one is matched to an image by file stem.
# REPORT_CSV: where the per-image metrics table is written.
IMG_DIR    = "/content/drive/MyDrive/Depthanythingv2/data/eval"
DEP_DIR    = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depths/raw_da2_vitl896_1152_ms_tta4_jbf"
REPORT_CSV = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depth_eval_report_vitl896_1152_ms_tta4_jbf.csv"
# Make sure the report's parent folder exists before anything is computed.
os.makedirs(os.path.dirname(REPORT_CSV), exist_ok=True)

# --- helpers ---
def read_depth_rel(path_png16):
    """Load a 16-bit depth PNG and rescale it to float32 by dividing by 65535.

    Args:
        path_png16: path of the PNG written by the inference cells.

    Returns:
        float32 array of the stored values divided by 65535.

    Raises:
        OSError: if OpenCV cannot read or decode the file. cv2.imread signals
            this by returning None, which previously surfaced as an opaque
            AttributeError on `.astype`.
    """
    raw = cv2.imread(path_png16, cv2.IMREAD_UNCHANGED)   # keep the stored bit depth
    if raw is None:
        raise OSError(f"Could not read depth PNG: {path_png16}")

    d = raw.astype(np.float32) / np.float32(65535.0)
    # NOTE(review): integer PNG data is always finite, so this looks like a no-op and
    # the loaded map would never contain NaN - confirm how invalid pixels are encoded.
    d[~np.isfinite(d)] = np.nan
    return d.astype(np.float32)

def sobel_grad(a):
    """Return the float32 gradient magnitude of `a` from 3x3 Sobel derivatives."""
    field = np.asarray(a, dtype=np.float32)
    # First-order derivative along x (dx=1, dy=0) and along y (dx=0, dy=1).
    d_dx = cv2.Sobel(field, cv2.CV_32F, 1, 0, ksize=3)
    d_dy = cv2.Sobel(field, cv2.CV_32F, 0, 1, ksize=3)
    magnitude = np.sqrt(d_dx*d_dx + d_dy*d_dy)
    return magnitude.astype(np.float32)

def edge_metrics(rgb, depth_rel):
    """Score how well depth edges coincide with image edges.

    Both the (slightly blurred) grayscale image and the percentile-normalised
    depth map are turned into Sobel gradient magnitudes. In each, pixels at or
    above the 88th percentile count as edges. Precision is the share of depth
    edges that are also image edges, recall the share of image edges that are
    also depth edges.

    Returns:
        (precision, recall, f1) as Python floats; (0.0, 0.0, 0.0) when the depth
        map has fewer than 10 positive pixels.
    """
    # Image edge strength.
    gray = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY).astype(np.float32)/255.0
    gray = cv2.GaussianBlur(gray,(3,3),0.8).astype(np.float32)
    grad_img = sobel_grad(gray)

    # Depth: treat NaN as 0, then stretch the 1st-99th percentile range to [0, 1].
    dep = depth_rel.astype(np.float32)
    dep = np.where(np.isnan(dep), np.float32(0.0), dep)
    positive = dep > 0
    if positive.sum() < 10:
        return 0.0, 0.0, 0.0
    lo, hi = np.percentile(dep[positive], (1,99))
    lo = np.float32(lo)
    hi = np.float32(hi)
    span = max(np.float32(1e-6), (hi - lo))
    dep = np.clip((dep - lo) / span, 0, 1).astype(np.float32)
    grad_dep = sobel_grad(dep)

    # Binary edge masks: the strongest gradients of each map.
    is_img_edge = grad_img >= np.float32(np.percentile(grad_img, 88))
    is_dep_edge = grad_dep >= np.float32(np.percentile(grad_dep, 88))

    overlap = np.count_nonzero(is_img_edge & is_dep_edge)
    prec = overlap / (np.count_nonzero(is_dep_edge) + 1e-6)
    rec = overlap / (np.count_nonzero(is_img_edge) + 1e-6)
    f1 = 2*prec*rec / max(1e-6, (prec+rec))
    return float(prec), float(rec), float(f1)

def planarity(depth_rel, seed=0):
    """Relative residual of the best plane found by random 3-point sampling.

    Valid pixels (finite and > 0) are back-projected to 3-D with a pinhole model
    whose focal length is 1.2 * max(W, H) and whose principal point is the image
    centre. 100 planes are built from random point triples; the lowest RMS
    point-to-plane distance over all sampled points, divided by the median
    sampled depth, is returned.

    Both the pixel subsample and the triple selection draw from one generator
    seeded with `seed`, so the same input always gives the same value. The
    subsample previously used the unseeded global NumPy RNG, which made the
    metric change from run to run.

    Args:
        depth_rel: 2-D depth array; NaN / non-positive pixels are ignored.
        seed: seed of the random generator (default 0).

    Returns:
        float residual, or NaN when fewer than 1000 pixels are valid or no
        non-degenerate triple was drawn.
    """
    z = depth_rel.astype(np.float32)
    mask = np.isfinite(z) & (z > 0)
    if mask.sum() < 1000:
        return np.nan

    rng = np.random.default_rng(seed)

    # Work on at most 20000 valid pixels to bound the cost.
    ys, xs = np.where(mask)
    sel = rng.choice(len(xs), size=min(20000, len(xs)), replace=False)
    xs, ys = xs[sel], ys[sel]
    zz = z[ys, xs].astype(np.float32)

    # Pinhole back-projection to 3-D points (X, Y, Z).
    H, W = z.shape
    f = np.float32(1.2 * max(W, H)); cx = np.float32(W / 2); cy = np.float32(H / 2)
    X = (xs.astype(np.float32) - cx) * zz / f
    Y = (ys.astype(np.float32) - cy) * zz / f
    P = np.stack([X, Y, zz], 1).astype(np.float32)

    best = None
    for _ in range(100):
        i = rng.choice(len(P), 3, replace=False)
        p0 = P[i[0]]
        n = np.cross(P[i[1]] - p0, P[i[2]] - p0).astype(np.float32)
        n_norm = np.linalg.norm(n)
        if n_norm < 1e-8:
            continue   # (near-)collinear triple: it does not define a plane
        n /= n_norm
        d0 = -np.dot(n, p0).astype(np.float32)
        dist = np.abs(P @ n + d0).astype(np.float32)
        rmse = float(np.sqrt(np.mean(dist**2)))
        if (best is None) or (rmse < best):
            best = rmse
    if best is None:
        return np.nan

    # Normalise by the median depth so the residual is relative to the depth scale.
    med = float(np.median(zz[zz > 0])) if np.any(zz > 0) else 1.0
    return float(best / max(1e-6, med))

def invalid_ratio(depth_rel):
    """Return the fraction of entries in `depth_rel` that are NaN, as a Python float."""
    nan_mask = np.isnan(depth_rel)
    return float(nan_mask.mean())

# --- compute rows ---
# One row per RGB image that has a depth PNG with the same file stem in DEP_DIR.
rows = []
img_paths = sorted(p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
                   for p in glob.glob(os.path.join(IMG_DIR, ext)))

for p_rgb in tqdm(img_paths):
    stem = os.path.splitext(os.path.basename(p_rgb))[0]
    p_dep = os.path.join(DEP_DIR, f"{stem}.png")
    if not os.path.exists(p_dep):
        continue   # no prediction for this image
    rgb   = cv2.imread(p_rgb)
    if rgb is None:
        # cv2.imread returns None for unreadable files; skip rather than crash in edge_metrics.
        tqdm.write(f"[skip] could not read image: {p_rgb}")
        continue
    depth = read_depth_rel(p_dep)
    prec, rec, f1 = edge_metrics(rgb, depth)
    plan = planarity(depth)
    inv  = invalid_ratio(depth)
    rows.append([stem, prec, rec, f1, plan, inv])

df = pd.DataFrame(rows, columns=[
    "img_id","edge_prec","edge_rec","edge_f1","planarity_residual_rel","invalid_ratio"
])

if df.empty:
    raise RuntimeError(f"No matches found. Check DEP_DIR: {DEP_DIR}")

# --- vectorized overall score ---
# Missing values get the worst-case defaults used below (planarity 0.1 -> plan_ok 0).
f1   = df["edge_f1"].astype(float).fillna(0.0)
plan = df["planarity_residual_rel"].astype(float).fillna(0.1)
inv  = df["invalid_ratio"].astype(float).fillna(0.0)
# plan_ok falls linearly from 1 (residual 0) to 0 (residual >= 0.1);
# inv_ok falls linearly from 1 (no invalid pixels) to 0 (>= 5% invalid).
plan_ok = np.clip(1.0 - (plan/0.1), 0.0, 1.0)
inv_ok  = np.clip(1.0 - (inv /0.05), 0.0, 1.0)
# Weighted sum: 60% edge F1, 25% planarity, 15% validity.
df["overall_score"] = 0.6*f1 + 0.25*plan_ok + 0.15*inv_ok

df.to_csv(REPORT_CSV, index=False)
print(f"Average overall_score = {df['overall_score'].mean():.3f} ± {df['overall_score'].std():.3f}")
# Last expression of the cell: show the ten best-scoring images.
df.sort_values("overall_score", ascending=False).head(10)


In [None]:
import pandas as pd
# Re-load a saved evaluation report and print the mean and standard deviation of its
# overall_score column.
# NOTE(review): this file name (...vitl896_tta.csv) is not the REPORT_CSV written by the
# evaluation cell above (...vitl896_1152_ms_tta4_jbf.csv). Presumably it is the report of
# an earlier run - confirm it exists and is the one intended.
df = pd.read_csv("/content/drive/MyDrive/Depthanythingv2/output-vitl/depth_eval_report_vitl896_tta.csv")
mean = df["overall_score"].mean()
std = df["overall_score"].std()
print(f"Average overall_score = {mean:.3f} ± {std:.3f}")