In [7]:
import torch
import torch.nn as nn
from torchvision import models

resnet50 = models.resnet50(pretrained=True)
resnet50_modules = [*resnet50.children()]

# Print every top-level child next to its index so the position of the
# nn.Linear module can be looked up.
for i, module in enumerate(resnet50_modules):
    print(f'{i} indx : {module}')

# Keep only the CNN part (the first eight children, indices 0-7) and use it
# as the backbone for DETR.
backbone = nn.Sequential(*resnet50_modules[0:8])



0 indx : Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
1 indx : BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
2 indx : ReLU(inplace=True)
3 indx : MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
4 indx : Sequential(
  (0): Bottleneck(
    (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (downsample): Sequential(
      (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): BatchNorm2d

In [9]:
# Verify that the backbone really reduces the spatial resolution to 1/32.
data = torch.randn(1,3,224,224)

# This is only a shape probe. Run it in eval mode and without autograd so the
# random input does not update the BatchNorm running statistics of the
# pretrained weights and no graph is built. The previous train/eval mode is
# restored afterwards, even if the forward pass raises.
was_training = backbone.training
backbone.eval()
try:
    with torch.no_grad():
        print(backbone(data).size())
finally:
    backbone.train(was_training)

torch.Size([1, 2048, 7, 7])


In [22]:
import torch
import math

def get_1d_pe(max_len: int, d_model: int) -> torch.Tensor:
    """Build a 1D sinusoidal positional-encoding table.

    For position ``p`` and frequency index ``k``, column ``2k`` holds
    ``sin(p * w_k)`` and column ``2k + 1`` holds ``cos(p * w_k)``, where
    ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        max_len: Number of positions (rows of the table).
        d_model: Encoding width (columns of the table). May be odd; in that
            case the last column is a sine column with no cosine partner.

    Returns:
        Tensor of shape ``[max_len, d_model]``.
    """
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)      # [L, 1]
    div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) *
                         -(math.log(10000.0) / d_model))                    # [ceil(d_model / 2)]
    angles = position * div_term                                            # [L, ceil(d_model / 2)]

    pe[:, 0::2] = torch.sin(angles)                                         # even columns
    # When d_model is odd there is one fewer odd column than even columns, so
    # trim the angles to match; for even d_model the slice is a no-op.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])                      # odd columns
    return pe  # [L, d_model]

def get_2d_pe(channels: int, height: int, width: int) -> torch.Tensor:
    """Build a 2D positional encoding from two 1D encodings.

    The first ``channels // 2`` channels encode the row index and are constant
    along the width axis; the remaining channels encode the column index and
    are constant along the height axis. ``channels`` must be even (asserted).

    Returns:
        Tensor of shape ``[channels, height, width]``.
    """
    assert channels % 2 == 0, "channels must be divisible by 2"
    half = channels // 2

    # One 1D table per axis, transposed so that channels come first.
    row_code = get_1d_pe(height, half).t()  # [half, H]
    col_code = get_1d_pe(width, half).t()   # [half, W]

    # Tile each table along the axis it does not depend on.
    row_plane = row_code[:, :, None].repeat(1, 1, width)   # [half, H, W]
    col_plane = col_code[:, None, :].repeat(1, height, 1)  # [half, H, W]

    # Row channels first, column channels second.
    return torch.cat((row_plane, col_plane), dim=0)  # [C, H, W]

# Shape check: the [C, H, W] encoding is added to a [B, C, H, W] batch.
B, C, H, W = (1, 2048, 7, 7)
data = torch.randn((B, C, H, W))
pos_enc = get_2d_pe(channels=C, height=H, width=W)

print(f'result = data + positional_encoding {(data+pos_enc).size()}')


result = data + positional_encoding torch.Size([1, 2048, 7, 7])


# CNN 모델을 통해서 나온 결과에 Positional Encoding을 더해주는 코드

In [None]:
data = torch.randn((1, 3, 224, 224))

feature_map = backbone(data)
B, C, H, W = feature_map.shape

# Positional encoding for the feature map, with a leading axis of size 1 so
# it lines up with the batch dimension.
pose_em = get_2d_pe(C, H, W)[None]

transformer_input = feature_map + pose_em
print(transformer_input.size())

# Collapse the two spatial axes into a single sequence axis.
transformer_input = transformer_input.flatten(2)
print(transformer_input.size())

# The Transformer takes its input in (Sequence, batch_size, Channel) order.
src = transformer_input.permute(2, 0, 1)
print(src.size())



torch.Size([1, 2048, 7, 7])
torch.Size([1, 2048, 49])
torch.Size([49, 1, 2048])
