### Code (vitb model)

In [None]:
# Input / output locations used by the inference and evaluation cells below.
IMG_DIR  = "/content/drive/MyDrive/Depthanythingv2/data/eval"  # your test images
REPO     = "/content/Depth-Anything-V2"
CKPT     = f"{REPO}/checkpoints/depth_anything_v2_vitb.pth"     # vitb weights
OUT_REL  = "/content/drive/MyDrive/Depthanythingv2/output-vitb/depths/raw_da2_vitb896_tta"
OUT_VIZ  = "/content/drive/MyDrive/Depthanythingv2/output-vitb/depths/viz_da2_vitb896_tta"

import os, sys

# Create both output folders up front; exist_ok keeps the cell re-runnable.
for _out_dir in (OUT_REL, OUT_VIZ):
    os.makedirs(_out_dir, exist_ok=True)

# Make the packages under REPO importable. The membership test keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if REPO not in sys.path:
    sys.path.append(REPO)


In [None]:
import torch
from depth_anything_v2.dpt import DepthAnythingV2

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.backends.cudnn.benchmark = True  # speed on fixed input shapes

# Fail early with an actionable message instead of the bare errno raised by
# torch.load when the weights file has not been placed at CKPT yet.
if not os.path.isfile(CKPT):
    raise FileNotFoundError(
        f"Checkpoint not found: {CKPT}\n"
        f"Place depth_anything_v2_vitb.pth under {REPO}/checkpoints/ before running this cell."
    )

model = DepthAnythingV2(
    encoder='vitb',
    features=128,
    out_channels=[96, 192, 384, 768]
)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed torch supports it).
state = torch.load(CKPT, map_location='cpu')
# strict=True: any missing/unexpected key in the checkpoint raises instead of
# silently leaving layers uninitialised.
model.load_state_dict(state, strict=True)
model = model.to(DEVICE).eval()

print("Loaded vitb on", DEVICE)


FileNotFoundError: [Errno 2] No such file or directory: '/content/Depth-Anything-V2/checkpoints/depth_anything_v2_vitb.pth'

In [None]:
import cv2, glob, numpy as np
from tqdm import tqdm

def letterbox(img, target=896):
    """Resize `img` to fit a `target` x `target` square and pad the remainder with black.

    The longer side is scaled to `target` (aspect ratio preserved, bicubic
    interpolation) and the result is centred on a black canvas.

    Args:
        img: image array whose first two dimensions are (height, width).
        target: side length of the square output, in pixels.

    Returns:
        (padded_image, (top, bottom, left, right), (h, w)) where the pad tuple
        gives the border widths that were added and (h, w) is the input size.

    Raises:
        ValueError: if `img` is None (e.g. a failed cv2.imread).
    """
    if img is None:
        raise ValueError("letterbox: image is None (cv2.imread failed?)")
    h, w = img.shape[:2]
    s = target / max(h, w)
    # Clamp to at least 1 px: for extreme aspect ratios the short side would
    # otherwise round to 0 and cv2.resize would reject the size.
    nh = max(1, int(round(h*s)))
    nw = max(1, int(round(w*s)))
    img_r = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_CUBIC)  # dsize is (width, height)
    # Split the padding as evenly as possible; the odd pixel goes to bottom/right.
    top = (target - nh)//2
    bottom = target - nh - top
    left = (target - nw)//2
    right = target - nw - left
    img_p = cv2.copyMakeBorder(img_r, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0,0,0))
    return img_p, (top,bottom,left,right), (h,w)

def unletterbox(arr, pads, orig_hw):
    """Undo a letterbox: crop the padding off `arr`, then resize to the original size.

    Args:
        arr: 2-D (or higher) array produced on the padded image.
        pads: (top, bottom, left, right) border widths to strip.
        orig_hw: (height, width) to resize the cropped array to.

    Returns:
        The cropped array resized with bilinear interpolation to `orig_hw`.
    """
    pad_top, pad_bottom, pad_left, pad_right = pads
    row_end = arr.shape[0] - pad_bottom
    col_end = arr.shape[1] - pad_right
    content = arr[pad_top:row_end, pad_left:col_end]
    # cv2.resize expects the target size as (width, height).
    out_size = (orig_hw[1], orig_hw[0])
    return cv2.resize(content, out_size, interpolation=cv2.INTER_LINEAR)

@torch.no_grad()
def infer_tta_rel(img_bgr, input_size=None):
    """Return relative depth (float32) using flip-TTA + AMP when on CUDA.

    The model is run on the image and on its horizontal mirror; the mirrored
    prediction is flipped back and the two maps are averaged.

    Args:
        img_bgr: image array handed to ``model.infer_image`` (BGR channel
            order, going by the name -- confirm against the model's API).
        input_size: optional value forwarded as ``input_size=`` to
            ``model.infer_image``. ``None`` (default) does not pass the
            argument, so the model's own default applies.

    Returns:
        float32 array holding the mean of the two predictions.
    """
    from contextlib import nullcontext

    # NOTE(review): infer_image presumably resizes to its own default input
    # size, so letterboxing the image beforehand may not set the network
    # resolution -- pass input_size explicitly to control it (confirm against
    # the Depth-Anything-V2 source).
    infer_kwargs = {} if input_size is None else {"input_size": input_size}

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast; on CPU a
    # no-op context is used so both devices share one code path.
    if DEVICE == 'cuda':
        amp_ctx = torch.autocast(device_type='cuda', dtype=torch.float16)
    else:
        amp_ctx = nullcontext()

    with amp_ctx:
        d0 = model.infer_image(img_bgr, **infer_kwargs).astype(np.float32)
        d1 = model.infer_image(cv2.flip(img_bgr, 1), **infer_kwargs).astype(np.float32)
    d1 = np.flip(d1, axis=1)  # map the mirrored prediction back to the original orientation
    return (0.5*(d0 + d1)).astype(np.float32)

def to_u16_rel(depth):
    """Robust 16-bit scaling for relative depth; avoids exact zeros.

    Non-finite values are treated as 0. The 1st and 99th percentiles of the
    positive values define the range; everything is clipped to that range and
    mapped linearly onto [1, 65535], so a scaled map never contains 0.

    Args:
        depth: array-like of relative depth values (not modified).

    Returns:
        uint16 array of the same shape. If fewer than 10 values are positive,
        an all-zero array is returned instead.
    """
    d = np.nan_to_num(np.asarray(depth, dtype=np.float32), nan=0.0, posinf=0.0, neginf=0.0)
    positive = d > 0
    if positive.sum() < 10:
        return np.zeros_like(d, np.uint16)
    p1, p99 = np.percentile(d[positive], (1, 99))
    p1 = np.float32(p1); p99 = np.float32(p99)
    span = max(np.float32(1e-6), p99 - p1)  # guard against a constant map
    d = (np.clip(d, p1, p99) - p1) / span
    d = np.clip(d, np.float32(0.0), np.float32(1.0))
    # Map [0, 1] -> [1, 65535] with rounding. The previous 1e-6 floor was
    # truncated to 0 by the integer cast, which contradicted "avoids exact
    # zeros" and made the darkest pixels look invalid to `> 0` masks.
    q = np.rint(d * np.float32(65534.0)) + np.float32(1.0)
    return q.astype(np.uint16)


In [None]:
# Collect the input images. The set removes duplicates in case one file
# matches two patterns (e.g. "*.jpg" and "*.JPG" on a case-insensitive filesystem).
paths = sorted({p for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
                for p in glob.glob(os.path.join(IMG_DIR, ext))})

n_saved = 0
for p in tqdm(paths):
    img = cv2.imread(p)
    if img is None:
        # cv2.imread returns None (it does not raise) for unreadable files;
        # skip them instead of crashing the whole run.
        tqdm.write(f"Skipping unreadable image: {p}")
        continue

    # NOTE(review): confirm that model.infer_image really runs at this 896
    # resolution -- it may resize to its own default input size.
    img_p, pads, orig_hw = letterbox(img, 896)
    d_rel_p = infer_tta_rel(img_p)                 # float32
    d_rel   = unletterbox(d_rel_p, pads, orig_hw)  # back to original size

    rel16 = to_u16_rel(d_rel)
    stem = os.path.splitext(os.path.basename(p))[0]
    if cv2.imwrite(f"{OUT_REL}/{stem}.png", rel16):
        n_saved += 1
    else:
        tqdm.write(f"Failed to write depth map for: {p}")

    # 8-bit preview: take the high byte, invert it, apply the INFERNO colormap.
    viz = cv2.applyColorMap(255 - (rel16//256).astype(np.uint8), cv2.COLORMAP_INFERNO)
    cv2.imwrite(f"{OUT_VIZ}/{stem}.jpg", viz)

# Report the number of files actually written, not the number attempted.
print("Saved:", n_saved, "depth maps to", OUT_REL)


### Evaluation (vitb model)

In [None]:
import os, glob, cv2, numpy as np, pandas as pd
from tqdm import tqdm

# Evaluation reuses IMG_DIR (input images) and OUT_REL (16-bit depth PNGs)
# from the inference cells; run those first. The former `IMG_DIR = IMG_DIR`
# self-assignment was a no-op and has been dropped.
DEP_DIR = OUT_REL
REPORT  = "/content/drive/MyDrive/Depthanythingv2/output-vitb/depth_eval_report_vitb896_tta.csv"
os.makedirs(os.path.dirname(REPORT), exist_ok=True)

def read_depth_rel(png16):
    """Load a depth PNG unchanged and rescale it by 1/65535 to float32.

    A uint16 image therefore maps into [0, 1].

    Args:
        png16: path of the image file to read.

    Returns:
        float32 array; any non-finite entries are set to NaN.

    Raises:
        OSError: if OpenCV cannot read the file (missing or undecodable).
    """
    raw = cv2.imread(png16, cv2.IMREAD_UNCHANGED)
    if raw is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise OSError(f"Could not read depth map: {png16}")
    d = raw.astype(np.float32) / np.float32(65535.0)
    d[~np.isfinite(d)] = np.nan
    return d.astype(np.float32)

def sobel_grad(a):
    """Gradient magnitude of `a` from 3x3 Sobel derivatives, returned as float32.

    `a` is converted to a float32 array first; the x and y derivatives are
    combined as sqrt(gx^2 + gy^2).
    """
    src = np.asarray(a, dtype=np.float32)
    # (1, 0) -> derivative along x, (0, 1) -> derivative along y.
    dx, dy = (cv2.Sobel(src, cv2.CV_32F, ox, oy, ksize=3) for ox, oy in ((1, 0), (0, 1)))
    magnitude = np.sqrt(np.square(dx) + np.square(dy))
    return magnitude.astype(np.float32)

def edge_metrics(rgb, depth_rel):
    """Compare strong image edges with strong depth edges.

    Both inputs are turned into Sobel gradient-magnitude maps; in each map the
    pixels at or above its own 88th percentile count as edges.

    Args:
        rgb: colour image converted to gray with COLOR_BGR2GRAY (so BGR channel
            order is expected despite the name).
        depth_rel: depth array; NaNs are treated as 0 and the positive values
            are normalised to [0, 1] using their 1st/99th percentiles.

    Returns:
        (precision, recall, f1) as Python floats, where precision is
        overlap / depth-edge count and recall is overlap / image-edge count.
        Returns (0.0, 0.0, 0.0) when fewer than 10 depth values are positive.
    """
    # --- image edges: gray in [0, 1], light blur, Sobel magnitude ---
    gray = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
    gray = cv2.GaussianBlur(gray, (3, 3), 0.8).astype(np.float32)
    img_edges = sobel_grad(gray)

    # --- depth edges: robustly normalised depth, Sobel magnitude ---
    depth = depth_rel.astype(np.float32)  # astype copies, caller's array is untouched
    depth[np.isnan(depth)] = 0.0
    positive = depth > 0
    if positive.sum() < 10:
        return 0.0, 0.0, 0.0
    lo, hi = (np.float32(v) for v in np.percentile(depth[positive], (1, 99)))
    span = max(np.float32(1e-6), hi - lo)
    depth = np.clip((depth - lo) / span, 0, 1).astype(np.float32)
    dep_edges = sobel_grad(depth)

    # --- binarise each map at its own 88th percentile ---
    img_mask = (img_edges >= np.float32(np.percentile(img_edges, 88))).astype(np.uint8)
    dep_mask = (dep_edges >= np.float32(np.percentile(dep_edges, 88))).astype(np.uint8)

    # --- overlap statistics (1e-6 avoids division by zero) ---
    overlap = (img_mask & dep_mask).sum()
    prec = overlap / (dep_mask.sum() + 1e-6)
    rec = overlap / (img_mask.sum() + 1e-6)
    f1 = 2 * prec * rec / max(1e-6, (prec + rec))
    return float(prec), float(rec), float(f1)

def planarity(depth_rel, seed=0):
    """Relative residual of the best plane found by random sampling.

    Valid pixels (finite and > 0) are back-projected to 3-D with an assumed
    pinhole camera (focal length 1.2 * max(W, H), principal point at the image
    centre). 100 random point triples each define a candidate plane; the
    lowest RMS point-to-plane distance over all sampled points is kept and
    divided by the median depth.

    Args:
        depth_rel: 2-D depth array.
        seed: seed for the random generator that drives both the pixel
            subsample and the triple sampling, making the result reproducible.

    Returns:
        float residual, or NaN if fewer than 1000 pixels are valid or no
        non-degenerate triple was drawn.
    """
    z = depth_rel.astype(np.float32)
    mask = np.isfinite(z) & (z > 0)
    if mask.sum() < 1000:
        return np.nan

    # One seeded generator for everything: the subsample used to come from the
    # unseeded global np.random state, so repeated runs gave different scores.
    rng = np.random.default_rng(seed)

    ys, xs = np.where(mask)
    sel = rng.choice(len(xs), size=min(20000, len(xs)), replace=False)  # cap at 20k points
    xs, ys = xs[sel], ys[sel]
    zz = z[ys, xs].astype(np.float32)

    # Back-project pixels to 3-D with the assumed intrinsics.
    H, W = z.shape
    f = np.float32(1.2*max(W, H)); cx = np.float32(W/2); cy = np.float32(H/2)
    X = (xs.astype(np.float32) - cx)*zz/f
    Y = (ys.astype(np.float32) - cy)*zz/f
    P = np.stack([X, Y, zz], 1).astype(np.float32)

    best = None
    for _ in range(100):
        i = rng.choice(len(P), 3, replace=False)
        v1 = P[i[1]] - P[i[0]]; v2 = P[i[2]] - P[i[0]]
        n = np.cross(v1, v2).astype(np.float32); n_norm = np.linalg.norm(n)
        if n_norm < 1e-8:
            continue  # (near-)collinear triple: no plane defined
        n /= n_norm
        d0 = -np.dot(n, P[i[0]]).astype(np.float32)
        dist = np.abs(P@n + d0).astype(np.float32)
        rmse = float(np.sqrt(np.mean(dist**2)))
        if (best is None) or (rmse < best):
            best = rmse
    if best is None:
        return np.nan
    med = float(np.median(zz[zz > 0])) if np.any(zz > 0) else 1.0
    return float(best/max(1e-6, med))

def invalid_ratio(depth_rel):
    """Fraction of NaN entries in `depth_rel`, as a Python float."""
    nan_mask = np.isnan(depth_rel)
    return float(np.mean(nan_mask))

# Score every input image that has a depth PNG with the same basename in
# DEP_DIR; images without a matching depth map are skipped.
rows = []
img_paths = sorted([
    p
    for ext in ("*.jpg","*.jpeg","*.png","*.JPG","*.PNG")
    for p in glob.glob(os.path.join(IMG_DIR, ext))
])

for p_rgb in tqdm(img_paths):
    stem = os.path.splitext(os.path.basename(p_rgb))[0]
    p_dep = os.path.join(DEP_DIR, f"{stem}.png")
    if os.path.exists(p_dep):
        rgb = cv2.imread(p_rgb)
        depth = read_depth_rel(p_dep)
        prec, rec, f1 = edge_metrics(rgb, depth)
        plan = planarity(depth)
        inv = invalid_ratio(depth)
        # One record per image, in the column order used for the report.
        rows.append([stem, prec, rec, f1, plan, inv])

import pandas as pd, numpy as np
# Build the per-image report table from the collected rows.
df = pd.DataFrame(
    rows,
    columns=[
        "img_id",
        "edge_prec",
        "edge_rec",
        "edge_f1",
        "planarity_residual_rel",
        "invalid_ratio",
    ],
)
if df.empty:
    raise RuntimeError("No matches: check that PNGs in OUT_REL share basenames with images.")

# Missing metrics get the value that scores worst for planarity (0.1) and
# neutral zeros for the other two.
f1 = df["edge_f1"].astype(float).fillna(0.0)
plan = df["planarity_residual_rel"].astype(float).fillna(0.1)
inv = df["invalid_ratio"].astype(float).fillna(0.0)

# Turn "lower is better" quantities into [0, 1] scores: 1 at zero, falling
# linearly to 0 at a residual of 0.1 / an invalid ratio of 0.05.
plan_ok = (1.0 - (plan / 0.1)).clip(0.0, 1.0)
inv_ok = (1.0 - (inv / 0.05)).clip(0.0, 1.0)

# Weighted blend: 60% edge F1, 25% planarity, 15% validity.
df["overall_score"] = 0.6*f1 + 0.25*plan_ok + 0.15*inv_ok

df.to_csv(REPORT, index=False)
score_mean = df['overall_score'].mean()
score_std = df['overall_score'].std()
print(f"Average overall_score = {score_mean:.3f} ± {score_std:.3f}")
df.sort_values("overall_score", ascending=False).head(10)


In [None]:
import pandas as pd
# Re-load the saved report and print its summary statistics. The path now
# matches the file the evaluation cell writes (depth_eval_report_vitb896_tta.csv);
# the previous name, depth_eval_report.csv, is not produced anywhere in this notebook.
df = pd.read_csv("/content/drive/MyDrive/Depthanythingv2/output-vitb/depth_eval_report_vitb896_tta.csv")
mean = df["overall_score"].mean()
std = df["overall_score"].std()
print(f"Average overall_score = {mean:.3f} ± {std:.3f}")