In [5]:
import torch

# Choose the compute device by preference: Apple-silicon MPS first, then CUDA,
# and plain CPU when neither accelerator reports itself as available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print("Using device:", device)


Using device: mps


In [None]:
# Use the %pip magic rather than `! pip`: it installs into the environment of
# the running kernel, whereas a `!` subshell may resolve a different `pip`.
%pip install torch torchvision torchaudio


Collecting torch
  Using cached torch-2.9.0-cp310-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting torchvision
  Using cached torchvision-0.24.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.9 kB)
Collecting torchaudio
  Using cached torchaudio-2.9.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.9 kB)
Collecting filelock (from torch)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=0.8.5 (from torch)
  Using cached fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)
Collecting numpy (from torchvision)
  Using cached numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Using cached pillow-1

In [4]:
import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """Convolution -> batch normalisation -> ReLU, applied in that order.

    Args:
        in_channels: Number of channels in the input tensor.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Convolution kernel size (default 3).
        stride: Convolution stride (default 1).
        padding: Padding passed to the convolution (default 1).
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        # The convolution is created without a bias term; it is followed
        # directly by BatchNorm2d.
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(num_features=out_channels)
        # inplace=True: the activation overwrites the batch-norm output buffer.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return relu(bn(conv(x)))."""
        features = self.conv(x)
        normalised = self.bn(features)
        return self.relu(normalised)


In [6]:
class ResidualBlock(nn.Module):
    """Two channel-preserving ConvBlocks with an identity skip connection.

    Args:
        channels: Channel count used for both the input and the output of
            each inner ConvBlock.
    """

    def __init__(self, channels):
        super().__init__()
        self.layer1 = ConvBlock(channels, channels)
        self.layer2 = ConvBlock(channels, channels)

    def forward(self, x):
        """Return layer2(layer1(x)) + x."""
        shortcut = x
        out = self.layer1(x)
        out = self.layer2(out)
        # Skip connection: add the untouched block input back onto the output.
        return out + shortcut


In [7]:
class MyBackbone(nn.Module):
    """Five-stage convolutional backbone returning three feature maps.

    Every stage starts with a stride-2 ConvBlock, so each stage halves the
    spatial resolution. Stage 1 is a single ConvBlock; stages 2-5 append
    1, 2, 2 and 2 ResidualBlocks respectively. Channel widths are
    32, 64, 128, 256 and 512.

    Args:
        in_channels: Number of channels in the input image. Defaults to 3,
            which reproduces the original hard-coded behaviour.
    """

    def __init__(self, in_channels=3):
        super().__init__()
        # Stage 1
        self.layer1 = ConvBlock(in_channels, 32, stride=2)

        # Stage 2
        self.layer2 = self._make_stage(32, 64, num_residual=1)

        # Stage 3
        self.layer3 = self._make_stage(64, 128, num_residual=2)

        # Stage 4
        self.layer4 = self._make_stage(128, 256, num_residual=2)

        # Stage 5
        self.layer5 = self._make_stage(256, 512, num_residual=2)

    @staticmethod
    def _make_stage(in_channels, out_channels, num_residual):
        """Build a stride-2 ConvBlock followed by `num_residual` ResidualBlocks."""
        # The downsampling block is constructed first and the residual blocks
        # follow in order, so sub-module indices (0, 1, 2, ...) match a
        # hand-written nn.Sequential(ConvBlock, ResidualBlock, ...).
        blocks = [ConvBlock(in_channels, out_channels, stride=2)]
        blocks.extend(ResidualBlock(out_channels) for _ in range(num_residual))
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Run all five stages and return the outputs of stages 3, 4 and 5.

        The shapes in the comments below are for a 640×640 input.

        Returns:
            Tuple ``(x3, x4, x5)`` of feature maps with 128, 256 and 512
            channels.
        """
        x1 = self.layer1(x)   # 320×320×32
        x2 = self.layer2(x1)  # 160×160×64
        x3 = self.layer3(x2)  # 80×80×128
        x4 = self.layer4(x3)  # 40×40×256
        x5 = self.layer5(x4)  # 20×20×512
        return x3, x4, x5  # multiple scales for detection


In [8]:

# Smoke test: push one random 640×640 RGB sample through the backbone and
# verify the shapes of the three returned feature maps.
x = torch.randn(1, 3, 640, 640)  # one sample
model = MyBackbone()
with torch.no_grad():  # shape check only, so skip building an autograd graph
    f3, f4, f5 = model(x)

# Assert the expected shapes so a regression fails loudly instead of relying
# on someone comparing printed output against comments.
assert f3.shape == (1, 128, 80, 80), f3.shape
assert f4.shape == (1, 256, 40, 40), f4.shape
assert f5.shape == (1, 512, 20, 20), f5.shape

print(f3.shape)  # (1, 128, 80, 80)
print(f4.shape)  # (1, 256, 40, 40)
print(f5.shape)  # (1, 512, 20, 20)


torch.Size([1, 128, 80, 80])
torch.Size([1, 256, 40, 40])
torch.Size([1, 512, 20, 20])
