# Model Sanity Checks
This notebook loads EfficientNet, Mask R-CNN, and ZoeDepth, runs each model on a sample image, and visualizes the outputs as a quick sanity check.

## Environment setup (run once)

If the packages are already installed, you can skip this cell.
Restart the kernel after installation so that the newly installed packages are picked up.

In [None]:
# Install the base requirements with the %pip magic, so that the packages go
# into the environment of the running kernel. The requirements file is
# resolved relative to the notebook's working directory (one level up).
%pip install -r ../requirements.txt

## Imports & paths

In [None]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt

# Make the `scripts` package importable by putting the parent directory of the
# current working directory on sys.path. The membership check keeps this cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
_repo_root = os.path.abspath("../")
if _repo_root not in sys.path:
    sys.path.append(_repo_root)

from scripts.utils import load_rgb, ensure_uint8_rgb

## Load image

In [None]:
# Read the sample image, then pass it through ensure_uint8_rgb
# (presumably coerces it to a uint8 RGB array — see scripts.utils to confirm).
img_path = "../data/sample.jpg"
img = load_rgb(img_path)
img = ensure_uint8_rgb(img)

# Display the input first so a wrong or unreadable file is obvious right away.
fig, ax = plt.subplots()
ax.imshow(img)
ax.set_title("Input image")
ax.axis("off")
plt.show()

## EfficientNet classification

In [None]:
import torch
from scripts.efficientnet import load_efficientnet, run_efficientnet, load_imagenet_labels

# Use the GPU when torch can see one; otherwise stay on the CPU.
# `device` is reused by every model cell further down.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Using device:", device)

# Load the network in classification mode together with the transform that
# load_efficientnet returns alongside it.
model, xform = load_efficientnet(model_name="tf_efficientnetv2_s", device=device, mode="cls")

labels = load_imagenet_labels()
res = run_efficientnet(model, xform, img, device=device, mode="cls", topk=5)

# Print the returned top-k entries as a 1-based list: label and probability.
ranked = zip(res.topk_indices, res.topk_probs)
for rank, (class_idx, prob) in enumerate(ranked, start=1):
    print(f"{rank}. {labels[class_idx]} ({prob:.3f})")

## Mask R-CNN instance segmentation (torchvision)

In [None]:
from scripts.maskrcnn_torchvision import load_maskrcnn, predict_instances

# NOTE: this rebinds `model`, which the EfficientNet classification cell also
# assigns; after this cell `model` refers to the Mask R-CNN network.
model, preprocess = load_maskrcnn(device=device)

# score_thresh presumably discards detections scoring below 0.5 — confirm in
# scripts.maskrcnn_torchvision.
seg = predict_instances(
    model,
    preprocess,
    img,
    device=device,
    score_thresh=0.5,
)

# Report how many instances came back and show at most the first ten scores.
num_instances = len(seg.scores)
print("Instances:", num_instances)
print("Top scores:", seg.scores[:10])

## Visualize masks

In [None]:
import random

# Dedicated, seeded RNG so the chosen instances and their colours are the same
# on every Restart & Run All; the global `random` state is left untouched.
mask_rng = random.Random(0)

# Choose up to 5 distinct instances at random (none when nothing was detected).
num_masks = seg.masks.shape[0]
n = min(5, num_masks)
idxs = mask_rng.sample(range(num_masks), n)

# Blend one random colour into the pixels of each chosen instance. The blend
# is computed in float32 and converted back to uint8 only for display.
overlay = img.astype(np.float32)
for j in idxs:
    m = seg.masks[j]  # assumes an (H, W) boolean mask aligned with img — TODO confirm
    color = np.array([mask_rng.randint(0, 255) for _ in range(3)], dtype=np.float32)
    overlay[m] = 0.55 * overlay[m] + 0.45 * color

plt.figure(figsize=(10,5))

# Left panel: the untouched input for reference.
plt.subplot(1,2,1)
plt.imshow(img)
plt.title("Input")
plt.axis("off")

# Right panel: the input with the selected instance masks tinted.
plt.subplot(1,2,2)
plt.imshow(overlay.astype(np.uint8))
plt.title("Masks overlay (random 5)")
plt.axis("off")

plt.show()

## ZoeDepth

In [None]:
from scripts.zoedepth import load_zoedepth, predict_depth

# Load ZoeDepth and keep only the `depth` field of the prediction result.
zoe = load_zoedepth(device=device)
d = predict_depth(zoe, img, device=device).depth

# Numeric summary: shape, dtype, and the NaN-ignoring minimum and maximum.
depth_min = float(np.nanmin(d))
depth_max = float(np.nanmax(d))
print(d.shape, d.dtype, depth_min, depth_max)

# Show the raw prediction with matplotlib's default colormap (none is forced).
fig, ax = plt.subplots()
ax.imshow(d)
ax.set_title("ZoeDepth output (raw)")
ax.axis("off")
plt.show()

## Depth inside instance masks (sanity check for later fusion)

In [None]:
# Summarise the predicted depth inside each of the first (up to) 5 instance
# masks. NaN-aware reductions are used, consistent with the nanmin/nanmax
# summary printed for the depth map, so a few NaN depths do not turn the whole
# statistic into NaN.
# NOTE(review): indexing `d` with a mask assumes the depth map and the masks
# share the same (H, W) — confirm against predict_depth / predict_instances.
if seg.masks.shape[0] > 0:
    for i in range(min(5, seg.masks.shape[0])):
        m = seg.masks[i]
        vals = d[m]
        if vals.size == 0:
            # An empty mask has no depth values; report it instead of
            # printing NaN statistics (and triggering NumPy warnings).
            print(f"Instance {i}: pixels=0, empty mask; no depth statistics")
            continue
        print(f"Instance {i}: pixels={vals.size}, depth median={np.nanmedian(vals):.4f}, mean={np.nanmean(vals):.4f}")
else:
    print("No instances found; try a different image or lower score_thresh.")

## EfficientNet as backbone (feature extractor)

In [None]:
from scripts.efficientnet import load_efficientnet, run_efficientnet, describe_features

# Same architecture as the classification cell, loaded in "backbone" mode.
# out_indices presumably selects which feature stages are returned — confirm
# in scripts.efficientnet.
backbone_kwargs = dict(
    model_name="tf_efficientnetv2_s",
    device=device,
    mode="backbone",
    out_indices=(1, 2, 3, 4),
)
# NOTE: this rebinds `xform`, which the classification cell also assigns.
backbone, xform = load_efficientnet(**backbone_kwargs)

# Run the backbone on the sample image and describe the returned features.
out = run_efficientnet(backbone, xform, img, device=device, mode="backbone")
describe_features(out.features)