# Depth Estimation Training Notebook
這是一個簡單的 UNet 訓練範例，可以丟 image -> 預測 depth (NPY)。

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import numpy as np
from PIL import Image
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [2]:
class DepthDataset(Dataset):
    def __init__(self, image_dir, depth_dir, transform=None):
        self.image_files = sorted([os.path.join(image_dir, f) for f in os.listdir(image_dir)])
        self.depth_files = sorted([os.path.join(depth_dir, f) for f in os.listdir(depth_dir)])
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img = Image.open(self.image_files[idx]).convert("L")
        depth = np.load(self.depth_files[idx])

        if self.transform:
            img = self.transform(img)
        depth = torch.from_numpy(depth).float().unsqueeze(0)
        return img, depth

In [3]:
class UNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.enc1 = nn.Sequential(nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.Conv2d(32, 32, 3, padding=1), nn.ReLU())
        self.pool1 = nn.MaxPool2d(2)
        self.enc2 = nn.Sequential(nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.Conv2d(64, 64, 3, padding=1), nn.ReLU())
        self.pool2 = nn.MaxPool2d(2)
        self.bottleneck = nn.Sequential(nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.Conv2d(128, 128, 3, padding=1), nn.ReLU())
        self.up2 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec2 = nn.Sequential(nn.Conv2d(128, 64, 3, padding=1), nn.ReLU(), nn.Conv2d(64, 64, 3, padding=1), nn.ReLU())
        self.up1 = nn.ConvTranspose2d(64, 32, 2, stride=2)
        self.dec1 = nn.Sequential(nn.Conv2d(64, 32, 3, padding=1), nn.ReLU(), nn.Conv2d(32, 32, 3, padding=1), nn.ReLU())
        self.out_conv = nn.Conv2d(32, 1, 1)

    def forward(self, x):
        e1 = self.enc1(x)
        p1 = self.pool1(e1)
        e2 = self.enc2(p1)
        p2 = self.pool2(e2)
        b = self.bottleneck(p2)
        u2 = self.up2(b)
        d2 = self.dec2(torch.cat([u2, e2], dim=1))
        u1 = self.up1(d2)
        d1 = self.dec1(torch.cat([u1, e1], dim=1))
        return self.out_conv(d1)

In [4]:
transform = T.Compose([T.Resize((256,256)), T.ToTensor()])
train_dataset = DepthDataset("data_v2/train/images", "data_v2/train/depths", transform)
val_dataset = DepthDataset("data_v2/val/images", "data_v2/val/depths", transform)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

model = UNet().to(device)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [5]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for imgs, depths in tqdm(train_loader):
        imgs, depths = imgs.to(device), depths.to(device)
        preds = model(imgs)
        loss = criterion(preds, depths)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Train Loss: {total_loss/len(train_loader):.4f}")

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for imgs, depths in val_loader:
            imgs, depths = imgs.to(device), depths.to(device)
            preds = model(imgs)
            val_loss += criterion(preds, depths).item()
    print(f"Validation Loss: {val_loss/len(val_loader):.4f}")

torch.save(model.state_dict(), "depth_unet.pth")
print("Model saved: depth_unet.pth")

  return F.l1_loss(input, target, reduction=self.reduction)
  0%|          | 0/830 [00:00<?, ?it/s]


RuntimeError: The size of tensor a (256) must match the size of tensor b (1280) at non-singleton dimension 3