### Setup

In [None]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
!git clone https://github.com/DepthAnything/Depth-Anything-V2

fatal: destination path 'Depth-Anything-V2' already exists and is not an empty directory.


In [None]:
import sys
sys.path.append('/content/Depth-Anything-V2')

In [None]:
!cd /content/Depth-Anything-V2

/bin/bash: line 1: cd: /content/Depth-Anything-V2: No such file or directory


In [None]:
!ls

drive  sample_data


In [None]:
import numpy as np
import torch

# Print library versions first so a failing repo import below is easier to diagnose.
for label, module in (("NumPy:", np), ("Torch:", torch)):
    print(label, module.__version__)

# Fails with ModuleNotFoundError unless the Depth-Anything-V2 checkout is on sys.path.
from depth_anything_v2.dpt import DepthAnythingV2
print("DepthAnythingV2 import OK")


NumPy: 2.0.2
Torch: 2.9.0+cpu


ModuleNotFoundError: No module named 'depth_anything_v2'




### Code (vits model)

In [None]:
import os, glob, sys, cv2, numpy as np, torch
from tqdm import tqdm

# Paths
IMG_DIR  = "/content/drive/MyDrive/Depthanythingv2/data/eval"
REPO     = "/content/Depth-Anything-V2"
CKPT     = f"{REPO}/checkpoints/depth_anything_v2_vits.pth"  # vits checkpoint
OUT_REL  = "/content/drive/MyDrive/Depthanythingv2/output-vits/depths/raw_da2_vits"
OUT_VIZ  = "/content/drive/MyDrive/Depthanythingv2/output-vits/depths/viz_da2_vits"

# cv2.imwrite returns False instead of raising when the target directory is missing,
# so create the output folders up front (as the vitb/vitl sections do).
os.makedirs(OUT_REL, exist_ok=True)
os.makedirs(OUT_VIZ, exist_ok=True)

In [None]:
import cv2
import torch
import sys

from depth_anything_v2.dpt import DepthAnythingV2

# Pick the best available backend: CUDA first, then Apple MPS, otherwise CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

# Constructor arguments for each encoder size; the key must match the checkpoint that is loaded.
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}

encoder = 'vits' # or 'vitb', 'vitl', 'vitg'

# Weights are read from Drive (not from the CKPT constant defined in the paths cell),
# loaded on CPU first and then moved to DEVICE in inference mode.
model = DepthAnythingV2(**model_configs[encoder])
model.load_state_dict(torch.load(f'/content/drive/MyDrive/Depthanythingv2/checkpoints/depth_anything_v2_{encoder}.pth', map_location='cpu'))
model = model.to(DEVICE).eval()

# The cells below call model.infer_image(bgr_image) and treat the result as an HxW numpy depth map.

In [None]:
def to_u16_rel(depth):
    """Normalize relative depth to a 16-bit image, reserving 0 for invalid pixels.

    Valid pixels (finite and > 0) are clipped to their 1st..99th percentile range,
    rescaled to [0, 1] and quantized into [1, 65535]. NaN/inf and non-positive
    pixels are written as 0.

    Returns an all-zero uint16 array when fewer than 10 pixels are valid.
    """
    d = depth.astype(np.float32)
    d = np.nan_to_num(d, nan=0.0, posinf=0.0, neginf=0.0)
    valid = d > 0
    if valid.sum() < 10:
        return np.zeros_like(d, dtype=np.uint16)
    lo, hi = np.percentile(d[valid], (1, 99))
    d = np.clip(d, lo, hi)
    d = (d - lo) / max(1e-6, (hi - lo))
    # Clamp AFTER quantization: a floor of 1e-6 in [0, 1] space becomes 0.0655 once
    # scaled and truncates to 0, so every pixel at or below the low percentile used
    # to be stored as 0 and later looked invalid to the evaluators.
    q = np.clip(np.rint(d * 65535.0), 1.0, 65535.0).astype(np.uint16)
    q[~valid] = 0
    return q

def letterbox(img, target=768):
    """Resize `img` so its longer side equals `target`, then pad to a square.

    Returns (padded_image, (top, bottom, left, right), (orig_h, orig_w)). The pads
    and original size are what `unletterbox` needs to undo the operation.
    Padding is constant black.
    """
    h, w = img.shape[:2]
    s = target / max(h, w)
    # max(1, ...) keeps extremely elongated images from rounding a side down to
    # 0 px, which cv2.resize rejects.
    nh = max(1, int(round(h * s)))
    nw = max(1, int(round(w * s)))
    img_r = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_CUBIC)
    top = (target - nh) // 2
    bottom = target - nh - top
    left = (target - nw) // 2
    right = target - nw - left
    img_p = cv2.copyMakeBorder(img_r, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    return img_p, (top, bottom, left, right), (h, w)

def unletterbox(arr, pads, orig_hw):
    """Crop the letterbox padding off `arr` and resize it back to `orig_hw` = (h, w)."""
    pad_top, pad_bottom, pad_left, pad_right = pads
    row_end = arr.shape[0] - pad_bottom
    col_end = arr.shape[1] - pad_right
    cropped = arr[pad_top:row_end, pad_left:col_end]
    # cv2.resize takes the size as (width, height).
    out_size = (orig_hw[1], orig_hw[0])
    return cv2.resize(cropped, out_size, interpolation=cv2.INTER_LINEAR)

# --- Collect images ---
paths = sorted(p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
               for p in glob.glob(os.path.join(IMG_DIR, ext)))

# --- Inference loop (prefer 768 input; fallback if not supported) ---
import inspect

# Inspect the real call signature. str() of a bound method is only its repr, not its
# parameter list, so a substring test on it never detects `input_size` and the
# letterbox fallback was always taken.
infer_fn = getattr(model, "infer_image", None)
try:
    supports_size = "input_size" in inspect.signature(infer_fn).parameters
except (TypeError, ValueError):
    supports_size = False

saved = 0
for p in tqdm(paths):
    img = cv2.imread(p)
    if img is None:
        # cv2.imread returns None (no exception) for unreadable or corrupt files.
        print("Skipping unreadable image:", p)
        continue

    if supports_size:
        depth_rel = model.infer_image(img, input_size=768)  # HxW float (relative)
    else:
        # fallback: letterbox to 768, run, then unpad
        img_p, pads, orig_hw = letterbox(img, 768)
        depth_rel_p = model.infer_image(img_p)
        depth_rel   = unletterbox(depth_rel_p, pads, orig_hw).astype(np.float32)

    rel16 = to_u16_rel(depth_rel)
    stem = os.path.splitext(os.path.basename(p))[0]
    cv2.imwrite(f"{OUT_REL}/{stem}.png", rel16)
    # Keep the high byte for an 8-bit preview; inverted before applying the colormap.
    viz = cv2.applyColorMap(255 - (rel16 // 256).astype(np.uint8), cv2.COLORMAP_INFERNO)
    cv2.imwrite(f"{OUT_VIZ}/{stem}.jpg", viz)
    saved += 1

print("Saved:", saved, "relative depth maps to", OUT_REL)

100%|██████████| 18/18 [00:59<00:00,  3.33s/it]

Saved: 18 relative depth maps to /content/drive/MyDrive/Depthanythingv2/output-vits/depths/raw_da2_vits





### Evaluation (vits model)

In [None]:
import os, glob, cv2, numpy as np, pandas as pd
from tqdm import tqdm

IMG_DIR = "/content/drive/MyDrive/Depthanythingv2/data/eval"
# This section evaluates the vits run, so read the depth maps the vits inference cell
# wrote and save the report where the next cell (and the comparison section) read it.
# These paths previously pointed at output-vitl.
DEP_DIR = "/content/drive/MyDrive/Depthanythingv2/output-vits/depths/raw_da2_vits"   # relative depth PNG16
# NOTE(review): instance masks are optional and model-independent; left at the original
# location. Confirm where the *_inst.png files actually live.
SEG_INST_DIR = "/content/drive/MyDrive/Depthanythingv2/output-vitl/seg"        # optional (uint16 instance ids), can be None
REPORT_CSV = "/content/drive/MyDrive/Depthanythingv2/output-vits/depth_eval_report.csv"

os.makedirs(os.path.dirname(REPORT_CSV), exist_ok=True)

def read_depth_rel(path_png16):
    """Load a 16-bit relative-depth PNG and rescale it to float32 in [0, 1].

    Raises OSError when the file cannot be decoded. cv2.imread reports that by
    returning None, which previously surfaced as an AttributeError.
    """
    d16 = cv2.imread(path_png16, cv2.IMREAD_UNCHANGED)
    if d16 is None:
        raise OSError(f"Could not read depth map: {path_png16}")
    d = d16.astype(np.float32) / 65535.0
    # Defensive only: values decoded from an integer PNG are always finite.
    d[~np.isfinite(d)] = np.nan
    return d


def sobel_grad(a):
    """Gradient magnitude of `a` from 3x3 Sobel derivatives, computed in float32."""
    src = np.asarray(a, dtype=np.float32)
    dx = cv2.Sobel(src, cv2.CV_32F, 1, 0, ksize=3)
    dy = cv2.Sobel(src, cv2.CV_32F, 0, 1, ksize=3)
    return np.sqrt(dx * dx + dy * dy)

def edge_alignment_metric(rgb, depth_rel):
    """Score how well depth edges coincide with image edges.

    Both gradient maps are binarized at their own 88th percentile and compared
    pixel-wise. Returns (precision, recall, f1) as floats, or (0.0, 0.0, 0.0)
    when fewer than 10 depth pixels are positive.

    Because both masks keep roughly the same share of pixels, precision, recall
    and F1 come out nearly identical.
    """
    def edge_mask(grad, pct=88):
        return (grad >= np.percentile(grad, pct)).astype(np.uint8)

    # Image edges: grayscale in [0, 1], lightly blurred before differentiation.
    gray = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
    img_grad = sobel_grad(cv2.GaussianBlur(gray, (3, 3), 0.8))

    # Depth edges: stretch the 1..99 percentile range of positive pixels to [0, 1].
    dep = depth_rel.copy().astype(np.float32)
    dep[np.isnan(dep)] = 0
    positive = dep > 0
    if positive.sum() < 10:
        return 0.0, 0.0, 0.0
    p1, p99 = np.percentile(dep[positive], (1, 99))
    span = max(np.float32(1e-6), np.float32(p99 - p1))
    dep = np.clip((dep - np.float32(p1)) / span, 0, 1).astype(np.float32)
    dep_grad = sobel_grad(dep)

    img_edges = edge_mask(img_grad, 88)
    dep_edges = edge_mask(dep_grad, 88)

    overlap = (img_edges & dep_edges).sum()
    prec = overlap / (dep_edges.sum() + 1e-6)
    rec = overlap / (img_edges.sum() + 1e-6)
    f1 = 2 * prec * rec / max(1e-6, (prec + rec))
    return float(prec), float(rec), float(f1)


def load_inst_mask(path):
    """Read an instance-id mask unchanged; return None if the path is unset, missing or unreadable."""
    if path is None or not os.path.exists(path):
        return None
    # cv2.imread already yields None when decoding fails, so its result is returned as-is.
    return cv2.imread(path, -1)

def ordinal_person_vs_bg(depth_rel, inst=None):
    """Foreground-vs-background depth-ordering accuracy from an instance mask.

    The largest non-zero instance in `inst` is treated as the foreground; every
    other pixel with positive depth is background. Equal numbers of foreground
    and background depths are sampled and compared pairwise. The orientation of
    relative depth is unknown, so the better of "fg < bg" and "fg > bg" is
    returned. The score is therefore about 0.5 for unrelated depths and 1.0 for
    a perfect ordering.

    Returns NaN when there is no mask, no instance, or either region has fewer
    than 200 pixels.
    """
    if inst is None:
        return np.nan
    ids, counts = np.unique(inst[inst > 0], return_counts=True)
    if len(ids) == 0:
        return np.nan
    fg = inst == ids[np.argmax(counts)]
    if fg.sum() < 200:  # too tiny, skip
        return np.nan
    bg = (~fg) & (depth_rel > 0)
    if bg.sum() < 200:
        return np.nan

    fg_vals = depth_rel[fg]
    bg_vals = depth_rel[bg]
    # Draw the SAME number of samples from both regions. Sampling min(N, len) from
    # each side independently produced arrays of different length whenever one
    # region had fewer than N pixels, and the comparison then raised ValueError.
    n_pairs = min(2000, fg_vals.size, bg_vals.size)
    rng = np.random.default_rng(0)  # fixed seed: the metric is reproducible
    df = rng.choice(fg_vals, size=n_pairs, replace=False)
    db = rng.choice(bg_vals, size=n_pairs, replace=False)

    acc_fg_smaller = np.mean(df < db)
    acc_fg_larger = np.mean(df > db)
    return float(max(acc_fg_smaller, acc_fg_larger))

def planarity_residual(depth_rel, rgb=None, inst=None):
    """Scale-free residual of the best single plane found through the depth map.

    Up to 20000 positive-depth pixels are back-projected with a fixed default
    focal length (1.2 * max(W, H)) and a centered principal point. 100 planes are
    then fitted through random 3-point samples, and the smallest RMS point-to-plane
    distance over ALL sampled points is kept and divided by the median depth.
    This is a random-plane search, not inlier-based RANSAC.

    `rgb` and `inst` are accepted for interface compatibility and are not used.
    Returns NaN when fewer than 1000 pixels are positive or every sample is
    degenerate.
    """
    H, W = depth_rel.shape
    mask = depth_rel > 0
    if mask.sum() < 1000:
        return np.nan

    # One seeded generator drives both the subsample and the plane search. The
    # subsample used to come from the unseeded global np.random state, so repeated
    # evaluations of the same map gave different scores.
    rng = np.random.default_rng(0)
    ys, xs = np.where(mask)
    sel = rng.choice(len(xs), size=min(20000, len(xs)), replace=False)
    xs, ys = xs[sel], ys[sel]
    zz = depth_rel[ys, xs]

    # Backproject with a default K (consistent for relative comparisons)
    f = 1.2 * max(W, H)
    cx, cy = W / 2, H / 2
    pts = np.stack([(xs - cx) * zz / f, (ys - cy) * zz / f, zz], axis=1)

    best_rmse = None
    for _ in range(100):
        p0, p1, p2 = pts[rng.choice(len(pts), 3, replace=False)]
        n = np.cross(p1 - p0, p2 - p0)
        n_len = np.linalg.norm(n)
        if n_len < 1e-8:  # collinear sample, no plane
            continue
        n = n / n_len
        d = -np.dot(n, p0)
        rmse = np.sqrt(np.mean(np.abs(pts @ n + d) ** 2))
        if (best_rmse is None) or (rmse < best_rmse):
            best_rmse = rmse
    if best_rmse is None:
        return np.nan
    med_z = np.median(zz[zz > 0]) if np.any(zz > 0) else 1.0
    return float(best_rmse / max(1e-6, med_z))  # scale-free residual

def invalid_ratio(depth_rel):
    """Share of pixels whose depth is not strictly positive (NaN compares as not positive)."""
    valid_share = (depth_rel > 0).mean()
    return float(1.0 - valid_share)

rows = []
paths = sorted([p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
                for p in glob.glob(os.path.join(IMG_DIR, ext))])

for p_rgb in tqdm(paths):
    stem = os.path.splitext(os.path.basename(p_rgb))[0]
    p_dep = os.path.join(DEP_DIR, f"{stem}.png")
    if not os.path.exists(p_dep):
        continue
    rgb = cv2.imread(p_rgb)
    if rgb is None:
        # cv2.imread returns None for unreadable files; cvtColor would crash on it.
        print("Skipping unreadable image:", p_rgb)
        continue
    depth = read_depth_rel(p_dep)

    prec, rec, f1 = edge_alignment_metric(rgb, depth)

    # Instance masks are optional: load_inst_mask returns None for an unset,
    # missing or unreadable path, and the ordinal metric then becomes NaN.
    p_inst = os.path.join(SEG_INST_DIR, f"{stem}_inst.png") if SEG_INST_DIR else None
    inst = load_inst_mask(p_inst)

    ord_acc = ordinal_person_vs_bg(depth, inst)  # NaN if no masks

    plan_res = planarity_residual(depth, rgb, inst)
    inv_pct = invalid_ratio(depth)

    rows.append([stem, prec, rec, f1, ord_acc, plan_res, inv_pct])

df = pd.DataFrame(rows, columns=[
    "img_id","edge_prec","edge_rec","edge_f1","ordinal_fg_bg_acc","planarity_residual_rel","invalid_ratio"
])

# Same guard as the vitb/vitl evaluation cells: fail with a clear message instead of
# an obscure error when no depth map shares a basename with an image.
if df.empty:
    raise RuntimeError(f"No matches found. Check DEP_DIR: {DEP_DIR}")

# Overall score in [0, 1], higher is better. Weights:
#   edge F1 40% | ordinal accuracy 30% | planarity 20% | invalid pixels 10%
# NOTE(review): the vitb/vitl evaluation cells in this notebook use different weights
# (0.6 / 0.25 / 0.15, no ordinal term), so overall_score is not directly comparable
# across the reports.
def score_row(r):
    """Combine one row's metrics into a single weighted score.

    Missing values fall back to: F1 -> 0, ordinal -> 0.5 (neutral),
    planarity residual -> 0.1, invalid ratio -> 0.
    """
    edge = np.nan_to_num(r.edge_f1, nan=0.0)
    ordinal = np.nan_to_num(r.ordinal_fg_bg_acc, nan=0.5)
    residual = np.nan_to_num(r.planarity_residual_rel, nan=0.1)
    invalid = np.nan_to_num(r.invalid_ratio, nan=0.0)
    # "Lower is better" metrics are mapped onto [0, 1]: a residual of 0.1 or an
    # invalid share of 5% (or worse) scores 0.
    planar_ok = np.clip(1.0 - (residual / 0.1), 0, 1)
    valid_ok = np.clip(1.0 - (invalid / 0.05), 0, 1)
    return 0.4 * edge + 0.3 * ordinal + 0.2 * planar_ok + 0.1 * valid_ok

df["overall_score"] = df.apply(score_row, axis=1)
df.to_csv(REPORT_CSV, index=False)
df.sort_values("overall_score", ascending=False).head(10)


100%|██████████| 18/18 [00:06<00:00,  2.74it/s]


Unnamed: 0,img_id,edge_prec,edge_rec,edge_f1,ordinal_fg_bg_acc,planarity_residual_rel,invalid_ratio,overall_score
3,e14c21ec-33d6-4aee-b254-9349418d5008,0.393004,0.393005,0.393004,,0.218967,0.010059,0.387085
1,c356067f-132c-4032-a812-4d8a405f6a92,0.289001,0.289001,0.289001,,0.151955,0.019939,0.325722
0,877eef99-d724-4928-93ca-8abfb07cf90a,0.292972,0.293027,0.292999,,0.139935,0.025712,0.315776
2,de1ad16a-c56d-45ed-ae2a-35da38c4f0aa,0.281538,0.281538,0.281538,,0.151061,0.025765,0.311085


In [None]:
import pandas as pd

# Reload the saved vits report and summarize the per-image overall score.
df = pd.read_csv("/content/drive/MyDrive/Depthanythingv2/output-vits/depth_eval_report.csv")
overall = df["overall_score"]
mean, std = overall.mean(), overall.std()
print(f"Average overall_score = {mean:.3f} ± {std:.3f}")


Average overall_score = 0.331 ± 0.040


### Code (vitb model)

In [None]:
IMG_DIR  = "/content/drive/MyDrive/Depthanythingv2/data/eval"  # your test images
REPO     = "/content/Depth-Anything-V2"
# Load the weights from the Drive checkpoints folder used by the other models in this
# notebook. f"{REPO}/checkpoints/..." does not exist (FileNotFoundError in the next cell).
# NOTE(review): assumes depth_anything_v2_vitb.pth sits next to the vits/vitl files; confirm.
CKPT     = "/content/drive/MyDrive/Depthanythingv2/checkpoints/depth_anything_v2_vitb.pth"     # vitb weights
OUT_REL  = "/content/drive/MyDrive/Depthanythingv2/output-vitb/depths/raw_da2_vitb896_tta"
OUT_VIZ  = "/content/drive/MyDrive/Depthanythingv2/output-vitb/depths/viz_da2_vitb896_tta"

import os, sys
os.makedirs(OUT_REL, exist_ok=True)
os.makedirs(OUT_VIZ, exist_ok=True)
# Avoid stacking duplicate entries when the cell is re-run.
if REPO not in sys.path:
    sys.path.append(REPO)


In [None]:
import torch
from depth_anything_v2.dpt import DepthAnythingV2

# Use the GPU when one is visible to torch; otherwise run on CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.backends.cudnn.benchmark = True  # speed on fixed input shapes

# vitb configuration; it must match the architecture the checkpoint was saved from,
# because the state dict is loaded with strict=True below.
model = DepthAnythingV2(
    encoder='vitb',
    features=128,
    out_channels=[96, 192, 384, 768]
)
# Deserialize on CPU first, then move the whole model to DEVICE in eval mode.
state = torch.load(CKPT, map_location='cpu')
model.load_state_dict(state, strict=True)
model = model.to(DEVICE).eval()

print("Loaded vitb on", DEVICE)


FileNotFoundError: [Errno 2] No such file or directory: '/content/Depth-Anything-V2/checkpoints/depth_anything_v2_vitb.pth'

In [None]:
import cv2, glob, numpy as np
from tqdm import tqdm

def letterbox(img, target=896):
    """Scale `img` so its longer side is `target`, then pad with black to a square.

    Returns (padded_image, (top, bottom, left, right), (orig_h, orig_w)) so that
    `unletterbox` can reverse the padding and resize.
    """
    h, w = img.shape[:2]
    s = target / max(h, w)
    # Guard against a side rounding to 0 px on extremely elongated inputs
    # (cv2.resize rejects empty sizes).
    nh = max(1, int(round(h * s)))
    nw = max(1, int(round(w * s)))
    img_r = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_CUBIC)
    top = (target - nh) // 2
    bottom = target - nh - top
    left = (target - nw) // 2
    right = target - nw - left
    img_p = cv2.copyMakeBorder(img_r, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    return img_p, (top, bottom, left, right), (h, w)

def unletterbox(arr, pads, orig_hw):
    """Remove the letterbox padding from `arr` and resize to the original (h, w)."""
    pad_top, pad_bottom, pad_left, pad_right = pads
    row_end = arr.shape[0] - pad_bottom
    col_end = arr.shape[1] - pad_right
    cropped = arr[pad_top:row_end, pad_left:col_end]
    # cv2.resize expects (width, height).
    out_size = (orig_hw[1], orig_hw[0])
    return cv2.resize(cropped, out_size, interpolation=cv2.INTER_LINEAR)

@torch.no_grad()
def infer_tta_rel(img_bgr):
    """Return relative depth (float32) using flip-TTA + AMP when on CUDA.

    The model is run on the image and on its horizontal mirror. The mirrored
    prediction is flipped back and the two maps are averaged.
    """
    def _predict_pair():
        straight = model.infer_image(img_bgr).astype(np.float32)
        mirrored = model.infer_image(cv2.flip(img_bgr, 1)).astype(np.float32)
        return straight, mirrored

    if DEVICE == 'cuda':
        # torch.cuda.amp.autocast is deprecated; torch.autocast(device_type=...) is
        # the supported spelling and behaves the same here.
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            d0, d1 = _predict_pair()
    else:
        d0, d1 = _predict_pair()
    d1 = np.flip(d1, axis=1)  # undo the mirror so both maps are pixel-aligned
    return (0.5 * (d0 + d1)).astype(np.float32)

def to_u16_rel(depth):
    """Robust 16-bit scaling for relative depth; 0 is reserved for invalid pixels.

    Valid pixels (finite and > 0) are clipped to their 1st..99th percentile range
    and quantized into [1, 65535]. NaN/inf and non-positive pixels are stored as 0.
    Returns an all-zero array when fewer than 10 pixels are valid.
    """
    d = depth.astype(np.float32)
    d = np.nan_to_num(d, nan=0.0, posinf=0.0, neginf=0.0)
    valid = d > 0
    if valid.sum() < 10:
        return np.zeros_like(d, np.uint16)
    p1, p99 = np.percentile(d[valid], (1, 99))
    p1 = np.float32(p1)
    p99 = np.float32(p99)
    d = np.clip(d, p1, p99)
    d = (d - p1) / max(np.float32(1e-6), (p99 - p1))
    # A floor of 1e-6 applied before scaling truncates to 0 (1e-6 * 65535 < 1), so
    # the floor is enforced on the quantized values instead.
    q = np.clip(np.rint(d * np.float32(65535.0)), 1, 65535).astype(np.uint16)
    q[~valid] = 0
    return q


In [None]:
paths = sorted(p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
               for p in glob.glob(os.path.join(IMG_DIR, ext)))

saved = 0
for p in tqdm(paths):
    img = cv2.imread(p)
    if img is None:
        # cv2.imread returns None (no exception) for unreadable or corrupt files.
        print("Skipping unreadable image:", p)
        continue
    img_p, pads, orig_hw = letterbox(img, 896)
    d_rel_p = infer_tta_rel(img_p)                 # float32
    d_rel   = unletterbox(d_rel_p, pads, orig_hw)  # back to original size

    rel16 = to_u16_rel(d_rel)
    stem = os.path.splitext(os.path.basename(p))[0]
    cv2.imwrite(f"{OUT_REL}/{stem}.png", rel16)

    # 8-bit preview from the high byte, inverted before applying the colormap.
    viz = cv2.applyColorMap(255 - (rel16//256).astype(np.uint8), cv2.COLORMAP_INFERNO)
    cv2.imwrite(f"{OUT_VIZ}/{stem}.jpg", viz)
    saved += 1

print("Saved:", saved, "depth maps to", OUT_REL)


### Evaluation (vitb model)

In [None]:
import os, glob, cv2, numpy as np, pandas as pd
from tqdm import tqdm

# Both inputs are taken from kernel state: IMG_DIR and OUT_REL must still hold the
# values set by the vitb inference cells above, so run those first.
IMG_DIR = IMG_DIR
DEP_DIR = OUT_REL
# Per-image metrics for the vitb run are written here.
REPORT  = "/content/drive/MyDrive/Depthanythingv2/output-vitb/depth_eval_report_vitb896_tta.csv"
os.makedirs(os.path.dirname(REPORT), exist_ok=True)

def read_depth_rel(png16):
    """Load a 16-bit relative-depth PNG as float32 scaled to [0, 1].

    Raises OSError when the file cannot be decoded. cv2.imread signals that by
    returning None, which previously surfaced as an AttributeError.
    """
    d16 = cv2.imread(png16, cv2.IMREAD_UNCHANGED)
    if d16 is None:
        raise OSError(f"Could not read depth map: {png16}")
    d = d16.astype(np.float32) / np.float32(65535.0)
    # Defensive only: values decoded from an integer PNG are always finite.
    d[~np.isfinite(d)] = np.nan
    return d.astype(np.float32)

def sobel_grad(a):
    """Float32 gradient magnitude of `a` from 3x3 Sobel derivatives."""
    src = np.asarray(a, dtype=np.float32)
    dx = cv2.Sobel(src, cv2.CV_32F, 1, 0, ksize=3)
    dy = cv2.Sobel(src, cv2.CV_32F, 0, 1, ksize=3)
    magnitude = np.sqrt(dx * dx + dy * dy)
    return magnitude.astype(np.float32)

def edge_metrics(rgb, depth_rel):
    """Precision / recall / F1 of depth edges against image edges.

    Each gradient map is binarized at its own 88th percentile. Returns
    (0.0, 0.0, 0.0) when fewer than 10 depth pixels are positive.
    """
    # Image edges from a lightly blurred grayscale in [0, 1].
    gray = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
    img_grad = sobel_grad(cv2.GaussianBlur(gray, (3, 3), 0.8).astype(np.float32))

    # Depth edges after stretching the 1..99 percentile range to [0, 1].
    dep = depth_rel.astype(np.float32)
    dep[np.isnan(dep)] = 0.0
    positive = dep > 0
    if positive.sum() < 10:
        return 0.0, 0.0, 0.0
    lo, hi = np.percentile(dep[positive], (1, 99))
    lo = np.float32(lo)
    hi = np.float32(hi)
    dep = np.clip((dep - lo) / max(np.float32(1e-6), (hi - lo)), 0, 1).astype(np.float32)
    dep_grad = sobel_grad(dep)

    img_edges = (img_grad >= np.float32(np.percentile(img_grad, 88))).astype(np.uint8)
    dep_edges = (dep_grad >= np.float32(np.percentile(dep_grad, 88))).astype(np.uint8)

    overlap = (img_edges & dep_edges).sum()
    prec = overlap / (dep_edges.sum() + 1e-6)
    rec = overlap / (img_edges.sum() + 1e-6)
    f1 = 2 * prec * rec / max(1e-6, (prec + rec))
    return float(prec), float(rec), float(f1)

def planarity(depth_rel):
    """Scale-free residual of the best single plane found through the depth map.

    Up to 20000 finite, positive pixels are back-projected with a default focal
    length (1.2 * max(W, H)) and centered principal point. 100 planes through
    random 3-point samples are tried and the smallest RMS point-to-plane distance
    over all sampled points is divided by the median depth.

    Returns NaN when fewer than 1000 pixels are usable or every sample is degenerate.
    """
    z = depth_rel.astype(np.float32)
    mask = np.isfinite(z) & (z > 0)
    if mask.sum() < 1000:
        return np.nan

    # One seeded generator for both the subsample and the plane search. The
    # subsample used to draw from the unseeded global np.random state, which made
    # the metric change from run to run.
    rng = np.random.default_rng(0)
    ys, xs = np.where(mask)
    sel = rng.choice(len(xs), size=min(20000, len(xs)), replace=False)
    xs, ys = xs[sel], ys[sel]
    zz = z[ys, xs].astype(np.float32)

    H, W = z.shape
    f = np.float32(1.2 * max(W, H))
    cx = np.float32(W / 2)
    cy = np.float32(H / 2)
    X = (xs.astype(np.float32) - cx) * zz / f
    Y = (ys.astype(np.float32) - cy) * zz / f
    P = np.stack([X, Y, zz], 1).astype(np.float32)

    best = None
    for _ in range(100):
        i = rng.choice(len(P), 3, replace=False)
        v1 = P[i[1]] - P[i[0]]
        v2 = P[i[2]] - P[i[0]]
        n = np.cross(v1, v2).astype(np.float32)
        n_norm = np.linalg.norm(n)
        if n_norm < 1e-8:  # collinear sample, no plane
            continue
        n /= n_norm
        d0 = -np.dot(n, P[i[0]])
        dist = np.abs(P @ n + d0).astype(np.float32)
        rmse = float(np.sqrt(np.mean(dist ** 2)))
        if (best is None) or (rmse < best):
            best = rmse
    if best is None:
        return np.nan
    med = float(np.median(zz[zz > 0])) if np.any(zz > 0) else 1.0
    return float(best / max(1e-6, med))

def invalid_ratio(depth_rel):
    """Fraction of pixels without a usable depth (NaN/inf or <= 0).

    Depth maps decoded from a 16-bit PNG never contain NaN, so counting NaNs alone
    always returned 0. Invalid pixels are stored as 0, which is also what the
    planarity mask and the vits evaluation treat as invalid.
    """
    d = np.asarray(depth_rel)
    usable = np.isfinite(d) & (d > 0)
    return float(1.0 - usable.mean())

rows = []
img_paths = sorted(p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
                   for p in glob.glob(os.path.join(IMG_DIR, ext)))
for p_rgb in tqdm(img_paths):
    stem = os.path.splitext(os.path.basename(p_rgb))[0]
    p_dep = os.path.join(DEP_DIR, f"{stem}.png")
    if not os.path.exists(p_dep):
        continue
    rgb = cv2.imread(p_rgb)
    if rgb is None:
        # cv2.imread returns None for unreadable files; cvtColor would crash on it.
        print("Skipping unreadable image:", p_rgb)
        continue
    depth = read_depth_rel(p_dep)
    prec, rec, f1 = edge_metrics(rgb, depth)
    plan = planarity(depth)
    inv = invalid_ratio(depth)
    rows.append([stem, prec, rec, f1, plan, inv])

import pandas as pd, numpy as np
df = pd.DataFrame(rows, columns=["img_id","edge_prec","edge_rec","edge_f1","planarity_residual_rel","invalid_ratio"])
if df.empty:
    raise RuntimeError("No matches: check that PNGs in OUT_REL share basenames with images.")

# Overall score in [0, 1]: edge F1 60%, planarity 25%, invalid pixels 15%.
# "Lower is better" metrics are mapped onto [0, 1]: a residual of 0.1 or an invalid
# share of 5% (or worse) scores 0.
f1   = df["edge_f1"].astype(float).fillna(0.0)
plan = df["planarity_residual_rel"].astype(float).fillna(0.1)
inv  = df["invalid_ratio"].astype(float).fillna(0.0)
plan_ok = np.clip(1.0 - (plan/0.1), 0.0, 1.0)
inv_ok  = np.clip(1.0 - (inv /0.05), 0.0, 1.0)
df["overall_score"] = 0.6*f1 + 0.25*plan_ok + 0.15*inv_ok

df.to_csv(REPORT, index=False)
print(f"Average overall_score = {df['overall_score'].mean():.3f} ± {df['overall_score'].std():.3f}")
df.sort_values("overall_score", ascending=False).head(10)


In [None]:
import pandas as pd

# Read the file the vitb evaluation cell actually writes (its REPORT path).
# "depth_eval_report.csv" is not produced anywhere in this notebook for vitb.
df = pd.read_csv("/content/drive/MyDrive/Depthanythingv2/output-vitb/depth_eval_report_vitb896_tta.csv")
mean = df["overall_score"].mean()
std = df["overall_score"].std()
print(f"Average overall_score = {mean:.3f} ± {std:.3f}")

### Code (vitl model)

In [None]:
import sys, os, glob

# Locate the depth_anything_v2 package anywhere below the Drive project folder.
REPO = "/content/drive/MyDrive/Depthanythingv2"
dirs = glob.glob(REPO + "/**/depth_anything_v2", recursive=True)

if not dirs:
    raise RuntimeError("Can't find depth_anything_v2 folder anywhere under REPO.")

# NOTE(review): if several copies exist, the first glob hit is used; glob does not
# promise an ordering, so confirm the printed repo_root is the intended checkout.
module_dir = dirs[0]                      # .../depth_anything_v2
repo_root = os.path.dirname(module_dir)   # the folder that CONTAINS depth_anything_v2

# Put the checkout first on sys.path so it wins over any other module of the same name.
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

print("Using repo_root:", repo_root)
print("sys.path[0]:", sys.path[0])

# Import check: fails here if repo_root is wrong.
from depth_anything_v2.dpt import DepthAnythingV2
print("OK import DepthAnythingV2")

Using repo_root: /content/drive/MyDrive/Depthanythingv2/Depth-Anything-V2
sys.path[0]: /content/drive/MyDrive/Depthanythingv2/Depth-Anything-V2




OK import DepthAnythingV2


In [None]:
import os, glob, sys, cv2, numpy as np, torch
from tqdm import tqdm

# Your image folder on Drive
IMG_DIR = "/content/drive/MyDrive/Segmentation/dataset"

# Output folders (new)
OUT_REL = "/content/drive/MyDrive/Segmentation/depth-maps/raw_da2_vitl896_tta"
OUT_VIZ = "/content/drive/MyDrive/Segmentation/depth-maps/viz_da2_vitl896_tta"

# DA2 repo path + checkpoint path
REPO = "/content/drive/MyDrive/Depthanythingv2"
CKPT = "/content/drive/MyDrive/Depthanythingv2/checkpoints/depth_anything_v2_vitl.pth"

os.makedirs(OUT_REL, exist_ok=True)
os.makedirs(OUT_VIZ, exist_ok=True)

# Make sure we can import the library.
# The depth_anything_v2 package lives in REPO/Depth-Anything-V2 (see the discovery
# cell's output). Appending REPO alone only worked because that earlier cell had
# already fixed sys.path. REPO is still added for backward compatibility.
for _candidate in (os.path.join(REPO, "Depth-Anything-V2"), REPO):
    if _candidate not in sys.path:
        sys.path.append(_candidate)


In [None]:
from depth_anything_v2.dpt import DepthAnythingV2

# Use the GPU when one is visible to torch; otherwise run on CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Only the vitl configuration is needed here; it must match the checkpoint because
# the state dict is loaded with strict=True.
model_configs = {
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
}

model = DepthAnythingV2(**model_configs['vitl'])
# Deserialize on CPU first, then move the model to DEVICE in eval mode.
state = torch.load(CKPT, map_location='cpu')
model.load_state_dict(state, strict=True)
model = model.to(DEVICE).eval()

print("Loaded vitl on", DEVICE)


Loaded vitl on cpu


In [None]:
def letterbox(img, target=896):
    """Scale `img` so its longer side is `target`, then pad with black to a square.

    Returns (padded_image, (top, bottom, left, right), (orig_h, orig_w)) so that
    `unletterbox` can reverse the padding and resize.
    """
    h, w = img.shape[:2]
    s = target / max(h, w)
    # Guard against a side rounding to 0 px on extremely elongated inputs
    # (cv2.resize rejects empty sizes).
    nh = max(1, int(round(h * s)))
    nw = max(1, int(round(w * s)))
    img_r = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_CUBIC)
    top = (target - nh) // 2
    bottom = target - nh - top
    left = (target - nw) // 2
    right = target - nw - left
    img_p = cv2.copyMakeBorder(img_r, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
    return img_p, (top, bottom, left, right), (h, w)

def unletterbox(arr, pads, orig_hw):
    """Remove the letterbox padding from `arr` and resize to the original (h, w)."""
    pad_top, pad_bottom, pad_left, pad_right = pads
    row_end = arr.shape[0] - pad_bottom
    col_end = arr.shape[1] - pad_right
    cropped = arr[pad_top:row_end, pad_left:col_end]
    # cv2.resize expects (width, height).
    out_size = (orig_hw[1], orig_hw[0])
    return cv2.resize(cropped, out_size, interpolation=cv2.INTER_LINEAR)

def infer_tta_rel(img_bgr):
    """Average the prediction for the image and for its horizontal mirror (float32)."""
    straight = model.infer_image(img_bgr).astype(np.float32)
    mirrored = model.infer_image(cv2.flip(img_bgr, 1)).astype(np.float32)
    # Flip the mirrored prediction back so both maps are pixel-aligned before averaging.
    averaged = 0.5 * (straight + np.flip(mirrored, axis=1))
    return averaged.astype(np.float32)

def to_u16_rel(depth):
    """Robust 16-bit scaling for relative depth; 0 is reserved for invalid pixels.

    Valid pixels (finite and > 0) are clipped to their 1st..99th percentile range
    and quantized into [1, 65535]. NaN/inf and non-positive pixels are stored as 0.
    Returns an all-zero array when fewer than 10 pixels are valid.
    """
    d = depth.astype(np.float32)
    d = np.nan_to_num(d, nan=0.0, posinf=0.0, neginf=0.0)
    valid = d > 0
    if valid.sum() < 10:
        return np.zeros_like(d, dtype=np.uint16)
    p1, p99 = np.percentile(d[valid], (1, 99))
    p1 = np.float32(p1)
    p99 = np.float32(p99)
    d = np.clip(d, p1, p99)
    d = (d - p1) / max(np.float32(1e-6), (p99 - p1))
    # A floor of 1e-6 applied before scaling truncates to 0 (1e-6 * 65535 < 1), so
    # the "avoid exact zeros" floor is enforced on the quantized values instead.
    q = np.clip(np.rint(d * np.float32(65535.0)), 1, 65535).astype(np.uint16)
    q[~valid] = 0
    return q


In [None]:
paths = sorted(
    p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
    for p in glob.glob(os.path.join(IMG_DIR, ext))
)

saved = 0
for p in tqdm(paths):
    img = cv2.imread(p)                    # uint8
    if img is None:
        # cv2.imread returns None (no exception) for unreadable or corrupt files.
        print("Skipping unreadable image:", p)
        continue
    img_p, pads, orig_hw = letterbox(img, 896)
    depth_rel_p = infer_tta_rel(img_p)     # float32
    depth_rel = unletterbox(depth_rel_p, pads, orig_hw).astype(np.float32)

    rel16 = to_u16_rel(depth_rel)
    stem = os.path.splitext(os.path.basename(p))[0]
    cv2.imwrite(f"{OUT_REL}/{stem}.png", rel16)

    # 8-bit preview from the high byte, inverted before applying the colormap.
    viz = cv2.applyColorMap(255 - (rel16 // 256).astype(np.uint8), cv2.COLORMAP_INFERNO)
    cv2.imwrite(f"{OUT_VIZ}/{stem}.jpg", viz)
    saved += 1

print("Saved", saved, "depth maps to", OUT_REL)


100%|██████████| 563/563 [11:22:28<00:00, 72.73s/it]

Saved 563 depth maps to /content/drive/MyDrive/Segmentation/depth-maps/raw_da2_vitl896_tta





In [None]:
!ls  /content/drive/MyDrive/Depthanythingv2/output-vitl/depths/raw_da2_vitl896_tta

In [None]:
import os, glob, sys, cv2, numpy as np, torch
from tqdm import tqdm

# ==== paths (same as yours) ====
IMG_DIR = "/content/drive/MyDrive/Depthanythingv2/data/eval"
OUT_REL = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depths/raw_da2_vitl896_1152_ms_tta4_jbf"
OUT_VIZ = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depths/viz_da2_vitl896_1152_ms_tta4_jbf"
REPO    = "/content/drive/MyDrive/Depthanythingv2"
CKPT    = f"{REPO}/checkpoints/depth_anything_v2_vitl.pth"
os.makedirs(OUT_REL, exist_ok=True)
os.makedirs(OUT_VIZ, exist_ok=True)

# The depth_anything_v2 package lives in REPO/Depth-Anything-V2 (see the discovery
# cell's output). Appending REPO alone only worked because an earlier cell had
# already fixed sys.path. REPO is still added for backward compatibility.
for _candidate in (os.path.join(REPO, "Depth-Anything-V2"), REPO):
    if _candidate not in sys.path:
        sys.path.append(_candidate)

from depth_anything_v2.dpt import DepthAnythingV2
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256,512,1024,1024])
# Deserialize on CPU first, then move the model to DEVICE in eval mode.
state = torch.load(CKPT, map_location='cpu')
model.load_state_dict(state, strict=True)
model = model.to(DEVICE).eval()
print("Loaded vitl on", DEVICE)

# ---------- helpers ----------
def letterbox_reflect(img, target):
    """Scale `img` so its longer side is `target`, then pad to a square by reflection.

    Area interpolation is used when shrinking and cubic when enlarging. Returns
    (padded_image, (top, bottom, left, right), (orig_h, orig_w)).
    """
    h, w = img.shape[:2]
    s = target / max(h, w)
    # Guard against a side rounding to 0 px on extremely elongated inputs
    # (cv2.resize rejects empty sizes).
    nh = max(1, int(round(h * s)))
    nw = max(1, int(round(w * s)))
    interp = cv2.INTER_AREA if s < 1.0 else cv2.INTER_CUBIC
    img_r = cv2.resize(img, (nw, nh), interpolation=interp)
    top = (target - nh) // 2
    bottom = target - nh - top
    left = (target - nw) // 2
    right = target - nw - left
    img_p = cv2.copyMakeBorder(img_r, top, bottom, left, right, cv2.BORDER_REFLECT_101)
    return img_p, (top, bottom, left, right), (h, w)

def unletterbox(arr, pads, orig_hw):
    """Remove the letterbox padding from `arr` and resize to the original (h, w)."""
    pad_top, pad_bottom, pad_left, pad_right = pads
    row_end = arr.shape[0] - pad_bottom
    col_end = arr.shape[1] - pad_right
    cropped = arr[pad_top:row_end, pad_left:col_end]
    # cv2.resize expects (width, height).
    out_size = (orig_hw[1], orig_hw[0])
    return cv2.resize(cropped, out_size, interpolation=cv2.INTER_LINEAR)

@torch.no_grad()
def infer_one(img_bgr):
    """Run the model once on a BGR image and return its relative depth as float32."""
    prediction = model.infer_image(img_bgr)
    return prediction.astype(np.float32)

def infer_tta4(img_bgr):
    """Four-way flip TTA: predict on the image and its h / v / hv flips, undo each flip, average."""
    plain = infer_one(img_bgr)
    # Reversed slices undo the flip that was applied to the input.
    from_h = infer_one(cv2.flip(img_bgr, 1))[:, ::-1]
    from_v = infer_one(cv2.flip(img_bgr, 0))[::-1]
    from_hv = infer_one(cv2.flip(cv2.flip(img_bgr, 1), 0))[::-1, ::-1]
    return (plain + from_h + from_v + from_hv) / 4.0

def infer_ms_tta(img_bgr, sizes=(896, 1152)):
    """Multi-scale TTA: run `infer_tta4` at every letterbox size and average the results.

    Each per-scale prediction is brought back to the original resolution before
    averaging, so the returned float32 map has the input's height and width.
    """
    def _at_scale(side):
        padded, pads, orig_hw = letterbox_reflect(img_bgr, side)
        return unletterbox(infer_tta4(padded), pads, orig_hw).astype(np.float32)

    per_scale = [_at_scale(side) for side in sizes]
    return np.mean(per_scale, axis=0).astype(np.float32)

def joint_bilateral_depth(depth, guide_bgr, ds=7, dr=0.1, iters=2):
    """Edge-aware smoothing of a [0, 1] depth map guided by the RGB image.

    OpenCV core has no true joint bilateral filter, so each iteration bilateral-
    filters the depth itself and blends the result with the unfiltered depth.
    Pixels near Canny edges of the guide keep their original value; everything
    else takes the smoothed value.

    depth     -- float depth in [0, 1]
    guide_bgr -- uint8 BGR image of the same size
    ds        -- filter diameter, also used as the spatial sigma (pixels)
    dr        -- range sigma, in depth units (depth is in [0, 1])
    iters     -- number of filter/blend passes
    """
    d = depth.copy().astype(np.float32)

    # The guide never changes, so the edge weight is computed once, not per pass.
    gray = cv2.cvtColor(guide_bgr, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150).astype(np.float32) / 255.0
    w = np.clip(1.0 - cv2.GaussianBlur(edges, (0, 0), 1.0), 0.0, 1.0).astype(np.float32)

    for _ in range(iters):
        # sigmaColor is expressed in the units of the filtered image. The depth is
        # float in [0, 1], so the previous `dr*255.0` made every intensity
        # difference negligible and reduced the filter to a plain Gaussian blur.
        d_blur = cv2.bilateralFilter(d, ds, float(dr), ds)
        # Keep strong edges from the original, smooth elsewhere.
        d = w * d_blur + (1.0 - w) * d
    return d

def to_u16_rel(depth):
    """Scale relative depth to uint16, reserving 0 for invalid pixels.

    Valid pixels (finite and > 0) are stretched from their 0.5..99.5 percentile
    range into [1, 65535]. NaN/inf and non-positive pixels are stored as 0. This
    matches the evaluators, whose `z > 0` masks treat 0 as "no depth".
    Returns an all-zero array when fewer than 10 pixels are valid.
    """
    d = depth.astype(np.float32)
    d = np.nan_to_num(d, nan=0.0, posinf=0.0, neginf=0.0)
    valid = d > 0
    if valid.sum() < 10:
        return np.zeros_like(d, dtype=np.uint16)
    p_lo, p_hi = np.percentile(d[valid], (0.5, 99.5))
    p_lo = np.float32(p_lo)
    p_hi = np.float32(p_hi)
    d = np.clip((d - p_lo) / max(np.float32(1e-6), (p_hi - p_lo)), 0, 1)
    # Without the lower clamp every valid pixel at or below the low percentile was
    # written as 0 and became indistinguishable from an invalid pixel.
    q = np.clip(np.rint(d * np.float32(65535.0)), 1, 65535).astype(np.uint16)
    q[~valid] = 0
    return q

# ---------- run ----------
paths = sorted(p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
               for p in glob.glob(os.path.join(IMG_DIR, ext)))

saved = 0
for p in tqdm(paths):
    img = cv2.imread(p)
    if img is None:
        # cv2.imread returns None (no exception) for unreadable or corrupt files.
        print("Skipping unreadable image:", p)
        continue
    depth_rel = infer_ms_tta(img, sizes=(896,1152))  # <- multi-scale + TTA(4)

    # edge-preserving refinement guided by RGB to sharpen boundaries / flatten planes
    x = depth_rel.copy()
    # bring to [0,1] before refinement to stabilize bilateral behavior
    if (x>0).sum() >= 10:
        q1, q99 = np.percentile(x[x>0], (1,99))
        x = np.clip((x - q1) / max(1e-6, (q99 - q1)), 0, 1).astype(np.float32)
    x = joint_bilateral_depth(x, img, ds=7, dr=0.08, iters=2)
    depth_rel = x.astype(np.float32)

    stem = os.path.splitext(os.path.basename(p))[0]
    rel16 = to_u16_rel(depth_rel)
    cv2.imwrite(f"{OUT_REL}/{stem}.png", rel16)

    # 8-bit preview from the high byte, inverted before applying the colormap.
    viz = cv2.applyColorMap(255 - (rel16 // 256).astype(np.uint8), cv2.COLORMAP_INFERNO)
    cv2.imwrite(f"{OUT_VIZ}/{stem}.jpg", viz)
    saved += 1

print("Saved", saved, "depth maps to", OUT_REL)


### Evaluation (vitl model)

In [None]:
import os, glob, cv2, numpy as np, pandas as pd
from tqdm import tqdm

# --- paths: point DEP_DIR to your vitl outputs ---
# DEP_DIR matches the OUT_REL of the multi-scale TTA + bilateral run above; depth
# maps are matched to images by file stem.
IMG_DIR    = "/content/drive/MyDrive/Depthanythingv2/data/eval"
DEP_DIR    = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depths/raw_da2_vitl896_1152_ms_tta4_jbf"
# Per-image metrics for this run are written here.
REPORT_CSV = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depth_eval_report_vitl896_1152_ms_tta4_jbf.csv"
os.makedirs(os.path.dirname(REPORT_CSV), exist_ok=True)

# --- helpers ---
def read_depth_rel(path_png16):
    """Load a 16-bit relative-depth PNG as float32 scaled to [0, 1].

    Raises OSError when the file cannot be decoded. cv2.imread signals that by
    returning None, which previously surfaced as an AttributeError.
    """
    d16 = cv2.imread(path_png16, cv2.IMREAD_UNCHANGED)
    if d16 is None:
        raise OSError(f"Could not read depth map: {path_png16}")
    d = d16.astype(np.float32) / np.float32(65535.0)
    # Defensive only: values decoded from an integer PNG are always finite.
    d[~np.isfinite(d)] = np.nan
    return d.astype(np.float32)

def sobel_grad(a):
    """Float32 gradient magnitude of `a` from 3x3 Sobel derivatives."""
    src = np.asarray(a, dtype=np.float32)
    dx = cv2.Sobel(src, cv2.CV_32F, 1, 0, ksize=3)
    dy = cv2.Sobel(src, cv2.CV_32F, 0, 1, ksize=3)
    magnitude = np.sqrt(dx * dx + dy * dy)
    return magnitude.astype(np.float32)

def edge_metrics(rgb, depth_rel):
    """Precision / recall / F1 of depth edges against image edges.

    Each gradient map is binarized at its own 88th percentile. Returns
    (0.0, 0.0, 0.0) when fewer than 10 depth pixels are positive.
    """
    # Image edges from a lightly blurred grayscale in [0, 1].
    gray = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
    img_grad = sobel_grad(cv2.GaussianBlur(gray, (3, 3), 0.8).astype(np.float32))

    # Depth edges after stretching the 1..99 percentile range to [0, 1].
    dep = depth_rel.astype(np.float32)
    dep[np.isnan(dep)] = 0.0
    positive = dep > 0
    if positive.sum() < 10:
        return 0.0, 0.0, 0.0
    lo, hi = np.percentile(dep[positive], (1, 99))
    lo = np.float32(lo)
    hi = np.float32(hi)
    dep = np.clip((dep - lo) / max(np.float32(1e-6), (hi - lo)), 0, 1).astype(np.float32)
    dep_grad = sobel_grad(dep)

    img_edges = (img_grad >= np.float32(np.percentile(img_grad, 88))).astype(np.uint8)
    dep_edges = (dep_grad >= np.float32(np.percentile(dep_grad, 88))).astype(np.uint8)

    overlap = (img_edges & dep_edges).sum()
    prec = overlap / (dep_edges.sum() + 1e-6)
    rec = overlap / (img_edges.sum() + 1e-6)
    f1 = 2 * prec * rec / max(1e-6, (prec + rec))
    return float(prec), float(rec), float(f1)

def planarity(depth_rel):
    """Scale-free residual of the best single plane found through the depth map.

    Up to 20000 finite, positive pixels are back-projected with a default focal
    length (1.2 * max(W, H)) and centered principal point. 100 planes through
    random 3-point samples are tried and the smallest RMS point-to-plane distance
    over all sampled points is divided by the median depth.

    Returns NaN when fewer than 1000 pixels are usable or every sample is degenerate.
    """
    z = depth_rel.astype(np.float32)
    mask = np.isfinite(z) & (z > 0)
    if mask.sum() < 1000:
        return np.nan

    # One seeded generator for both the subsample and the plane search. The
    # subsample used to draw from the unseeded global np.random state, which made
    # the metric change from run to run.
    rng = np.random.default_rng(0)
    ys, xs = np.where(mask)
    sel = rng.choice(len(xs), size=min(20000, len(xs)), replace=False)
    xs, ys = xs[sel], ys[sel]
    zz = z[ys, xs].astype(np.float32)

    H, W = z.shape
    f = np.float32(1.2 * max(W, H))
    cx = np.float32(W / 2)
    cy = np.float32(H / 2)
    X = (xs.astype(np.float32) - cx) * zz / f
    Y = (ys.astype(np.float32) - cy) * zz / f
    P = np.stack([X, Y, zz], 1).astype(np.float32)

    best = None
    for _ in range(100):
        i = rng.choice(len(P), 3, replace=False)
        v1 = P[i[1]] - P[i[0]]
        v2 = P[i[2]] - P[i[0]]
        n = np.cross(v1, v2).astype(np.float32)
        n_norm = np.linalg.norm(n)
        if n_norm < 1e-8:  # collinear sample, no plane
            continue
        n /= n_norm
        d0 = -np.dot(n, P[i[0]])
        dist = np.abs(P @ n + d0).astype(np.float32)
        rmse = float(np.sqrt(np.mean(dist ** 2)))
        if (best is None) or (rmse < best):
            best = rmse
    if best is None:
        return np.nan
    med = float(np.median(zz[zz > 0])) if np.any(zz > 0) else 1.0
    return float(best / max(1e-6, med))

def invalid_ratio(depth_rel):
    """Fraction of pixels without a usable depth (NaN/inf or <= 0).

    Depth maps decoded from a 16-bit PNG never contain NaN, so counting NaNs alone
    always returned 0. Invalid pixels are stored as 0, which is also what the
    planarity mask and the vits evaluation treat as invalid.
    """
    d = np.asarray(depth_rel)
    usable = np.isfinite(d) & (d > 0)
    return float(1.0 - usable.mean())

# --- compute rows ---
rows = []
img_paths = sorted(p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
                   for p in glob.glob(os.path.join(IMG_DIR, ext)))

for p_rgb in tqdm(img_paths):
    stem = os.path.splitext(os.path.basename(p_rgb))[0]
    p_dep = os.path.join(DEP_DIR, f"{stem}.png")
    if not os.path.exists(p_dep):
        continue
    rgb   = cv2.imread(p_rgb)
    if rgb is None:
        # cv2.imread returns None for unreadable files; cvtColor would crash on it.
        print("Skipping unreadable image:", p_rgb)
        continue
    depth = read_depth_rel(p_dep)
    prec, rec, f1 = edge_metrics(rgb, depth)
    plan = planarity(depth)
    inv  = invalid_ratio(depth)
    rows.append([stem, prec, rec, f1, plan, inv])

df = pd.DataFrame(rows, columns=[
    "img_id","edge_prec","edge_rec","edge_f1","planarity_residual_rel","invalid_ratio"
])

if df.empty:
    raise RuntimeError(f"No matches found. Check DEP_DIR: {DEP_DIR}")

# --- vectorized overall score ---
# Edge F1 60%, planarity 25%, invalid pixels 15%. "Lower is better" metrics are
# mapped onto [0, 1]: a residual of 0.1 or an invalid share of 5% (or worse) scores 0.
f1   = df["edge_f1"].astype(float).fillna(0.0)
plan = df["planarity_residual_rel"].astype(float).fillna(0.1)
inv  = df["invalid_ratio"].astype(float).fillna(0.0)
plan_ok = np.clip(1.0 - (plan/0.1), 0.0, 1.0)
inv_ok  = np.clip(1.0 - (inv /0.05), 0.0, 1.0)
df["overall_score"] = 0.6*f1 + 0.25*plan_ok + 0.15*inv_ok

df.to_csv(REPORT_CSV, index=False)
print(f"Average overall_score = {df['overall_score'].mean():.3f} ± {df['overall_score'].std():.3f}")
df.sort_values("overall_score", ascending=False).head(10)


In [None]:
import pandas as pd
# NOTE(review): this reads depth_eval_report_vitl896_tta.csv, but the evaluation cell
# above writes depth_eval_report_vitl896_1152_ms_tta4_jbf.csv. The file read here is
# presumably left over from an earlier run. Confirm which report is intended.
df = pd.read_csv("/content/drive/MyDrive/Depthanythingv2/output-vitl/depth_eval_report_vitl896_tta.csv")
# Mean and sample standard deviation of the per-image overall score.
mean = df["overall_score"].mean()
std = df["overall_score"].std()
print(f"Average overall_score = {mean:.3f} ± {std:.3f}")

### Comparison across models

In [None]:
# ---- EDIT THESE IF YOUR FILES ARE ELSEWHERE ----
# One evaluation report per model; reports that are missing are skipped below.
# NOTE(review): the vitl evaluation cell in this notebook writes
# depth_eval_report_vitl896_1152_ms_tta4_jbf.csv, not the vitl896_tta file named
# here. Confirm which vitl run should be compared.
P_VITS = "/content/drive/MyDrive/Depthanythingv2/output-vits/depth_eval_report.csv"
P_VITB = "/content/drive/MyDrive/Depthanythingv2/output-vitb/depth_eval_report_vitb896_tta.csv"
P_VITL = "/content/drive/MyDrive/Depthanythingv2/output-vitl/depth_eval_report_vitl896_tta.csv"
# ------------------------------------------------

import os, pandas as pd, numpy as np

def load_eval(path, model_tag):
    """Load one evaluation CSV and tag every row with `model_tag`.

    Returns None when the file does not exist or lacks any of the metric
    columns the comparison needs. Otherwise returns a new DataFrame with an
    added "model" column.
    """
    required = {
        "img_id", "edge_prec", "edge_rec", "edge_f1",
        "planarity_residual_rel", "invalid_ratio", "overall_score",
    }
    if not os.path.exists(path):
        return None
    report = pd.read_csv(path)
    if not required.issubset(report.columns):
        return None
    return report.assign(model=model_tag)

# Load whichever reports are available; load_eval returns None for the rest.
report_specs = [(P_VITS,"vits"), (P_VITB,"vitb"), (P_VITL,"vitl")]
loaded = (load_eval(path, tag) for path, tag in report_specs)
dfs = [frame for frame in loaded if frame is not None]

if not dfs:
    raise RuntimeError("No eval CSVs loaded. Double-check the three P_* paths at the top.")

all_df = pd.concat(dfs, ignore_index=True)

# Summary per model: row count, mean and standard deviation of each metric.
metric_cols = ["overall_score","edge_f1","planarity_residual_rel","invalid_ratio"]
summary = all_df.groupby("model")[metric_cols].agg(["count","mean","std"])

# Per-image wide table for overall_score: one row per image, one column per model.
wide_overall = all_df.pivot_table(index="img_id", columns="model", values="overall_score", aggfunc="mean")

# Show quick text summary
print("=== Summary by model ===")
print(summary.round(3))
print("\n=== Per-image Overall (wide) ===")
print(wide_overall.round(3).fillna("—"))
