In [45]:
import os
import sys
import numpy as np

from matplotlib import pyplot as plt

In [46]:
import time
import json
import random
import argparse
import datetime
from pathlib import Path

import torch
from torch import nn
from torchvision.models import resnet50
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader, DistributedSampler

import datasets
import util.misc as utils
from models import build_model
from engine import evaluate, train_one_epoch
from datasets import build_dataset, get_coco_api_from_dataset

In [47]:
import warnings
warnings.filterwarnings('ignore')

In [48]:
## define my device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Available Device: {device}")

Available Device: cuda


In [49]:
## Lets analyze simplified DETR
class DETRdemo(nn.Module):
    """
    Demo DETR implementation.

    Demo implementation of DETR in minimal number of lines, with the
    following differences wrt DETR in the paper:
    * learned positional encoding (instead of sine)
    * positional encoding is passed at input (instead of attention)
    * fc bbox predictor (instead of MLP)
    The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
    Only batch size 1 supported.
    """
    def __init__(self, num_classes, hidden_dim=256, nheads=8,
                    num_encoder_layers=6, num_decoder_layers=6):
        super().__init__()

        # create ResNet-50 backbone
        self.backbone = resnet50()
        del self.backbone.fc

        # create conversion layer
        self.conv = nn.Conv2d(2048, hidden_dim, 1)

        # create a default PyTorch transformer
        self.transformer = nn.Transformer(
            hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

        # prediction heads, one extra class for predicting non-empty slots
        # note that in baseline DETR linear_bbox layer is 3-layer MLP
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
        self.linear_bbox = nn.Linear(hidden_dim, 4)

        # output positional encodings (object queries)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))

        # spatial positional encodings
        # note that in baseline DETR we use sine positional encodings
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))

    def forward(self, inputs):
        # propagate inputs through ResNet-50 up to avg-pool layer
        x = self.backbone.conv1(inputs)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)

        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)

        print(f"Backbone Pre-Output Shape: {x.shape}")

        # convert from 2048 to 256 feature planes for the transformer
        h = self.conv(x)

        print(f"Backbone Post-Output Shape: {h.shape}")

        # construct positional encodings
        H, W = h.shape[-2:]
        print(f"H: {H} and W: {W}")
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)

        print(f"Positional Embeddings Shape: {pos.shape}")
        print(f"Transformer Input Tensor Shape: {h.flatten(2).permute(2, 0, 1).shape}")

        # propagate through the transformer
        h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                                self.query_pos.unsqueeze(1)).transpose(0, 1)
        
        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h), 
                'pred_boxes': self.linear_bbox(h).sigmoid()}

In [50]:
## initialize the model
detr = DETRdemo(num_classes=91)
state_dict = torch.hub.load_state_dict_from_url(
    url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
    map_location='cpu', check_hash=True)
detr.load_state_dict(state_dict)

<All keys matched successfully>

In [52]:
## create dummy input
dummy_input = torch.ones((1, 3, 224, 224))
dummy_out = detr(dummy_input)

Backbone Pre-Output Shape: torch.Size([1, 2048, 7, 7])
Backbone Post-Output Shape: torch.Size([1, 256, 7, 7])
H: 7 and W: 7
Positional Embeddings Shape: torch.Size([49, 1, 256])
Transformer Input Tensor Shape: torch.Size([49, 1, 256])


In [31]:
## load the dino model
dino_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14').eval()

Using cache found in C:\Users\citak/.cache\torch\hub\facebookresearch_dinov2_main


In [32]:
dino_dummy_output = dino_model(dummy_input)
dino_dummy_output.shape

torch.Size([1, 1024])

In [None]:
dino_dummy_output_norm_patches = dino_model.forward_features(dummy_input)["x_norm_patchtokens"]
print(dino_dummy_output_norm_patches.shape)

dino_dummy_output_norm_patches = dino_dummy_output_norm_patches.permute(0, 2, 1)
print(dino_dummy_output_norm_patches.shape)

dino_dummy_output_norm_patches = dino_dummy_output_norm_patches.reshape(1, 1024, int(224/14), int(224/14))
print(dino_dummy_output_norm_patches.shape)

torch.Size([1, 256, 1024])
torch.Size([1, 1024, 256])
torch.Size([1, 1024, 16, 16])


In [9]:
dino_dummy_output_pathches = dino_model.get_intermediate_layers(dummy_input, n=1)[0]
dino_dummy_output_pathches.shape

torch.Size([1, 256, 1024])

In [10]:
# Reshape to 16x16 grid, then downsample to 7x7
B, N, D = dino_dummy_output_pathches.shape
B, N, D

(1, 256, 1024)

In [37]:
dino_features = dino_dummy_output_pathches.view(B, 16, 16, D)
dino_features.shape

torch.Size([1, 16, 16, 1024])

In [38]:
dino_features.permute(0, 3, 1, 2).shape

torch.Size([1, 1024, 16, 16])

In [39]:
dino_features = F.adaptive_avg_pool2d(dino_features.permute(0, 3, 1, 2), (7, 7))
dino_features = dino_features.flatten(2).permute(0, 2, 1)  # Shape: [B, 49, 1024]

print(dino_features.shape)

torch.Size([1, 49, 1024])


In [43]:
# Project 1024-dim to 256-dim using Linear layer
linear_proj = nn.Linear(1024, 256)
features_transformed = linear_proj(dino_features)  # Shape: [1, 49, 256] but DETR expects: Transformer Input Tensor Shape: torch.Size([49, 1, 256])
features_transformed = features_transformed.permute(1,0,2)
print(features_transformed.shape)

torch.Size([49, 1, 256])


In [15]:
#########################################################################################################
#########################################################################################################
#########################################################################################################
#########################################################################################################

In [16]:
print(dino_dummy_output_pathches.shape)
patch_features = dino_dummy_output_pathches[:, 1:, :] # remove the CLS token
print(patch_features.shape)

torch.Size([1, 256, 1024])
torch.Size([1, 255, 1024])


In [17]:
def unflatten_features(features, batch_size=1, patch_size=14, img_size=224):
    return features.reshape(batch_size, img_size // patch_size, img_size // patch_size, -1)

unflatten_elements = unflatten_features(dino_dummy_output_pathches, patch_size=14)
print(unflatten_elements.shape)

torch.Size([1, 16, 16, 1024])


In [18]:
class LearnableDownsample(nn.Module):
    def __init__(self, in_channels=1024, out_channels=1024):
        super(LearnableDownsample, self).__init__()
        # First convolution layer to reduce 16x16 to 8x8
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        # Second convolution layer to reduce 8x8 to 7x7
        self.conv2 = nn.Conv2d(in_channels, out_channels, kernel_size=4, stride=1, padding=1)
    
    def forward(self, x):
        # Reorder to [1, 1024, 16, 16]
        x = x.permute(0, 3, 1, 2)
        
        x = self.conv1(x)  # Shape: [1, 1024, 8, 8]
        x = self.conv2(x)  # Shape: [1, 1024, 7, 7]
        
        # Reorder back to [1, 7, 7, 1024]
        x = x.permute(0, 2, 3, 1)
        return x

# Example usage
x = torch.randn(1, 16, 16, 1024)  # Input tensor of shape [1, 16, 16, 1024]
downsample_model = LearnableDownsample()
output = downsample_model(x)

print("Output shape:", output.shape)  # Expected shape: [1, 7, 7, 1024]

Output shape: torch.Size([1, 7, 7, 1024])


In [19]:
#########################################################################################################
#########################################################################################################
#########################################################################################################

In [20]:
import torch.nn as nn

class PoolingAndProjection(nn.Module):
    def __init__(self, input_dim=1024, num_patches=49, output_dim=256):
        super(PoolingAndProjection, self).__init__()
        # Adaptive pooling to reduce the feature map from 256 to 49 patches
        self.pool = nn.AdaptiveAvgPool2d((7, 7))  # Reduces to a 7x7 grid, hence 49 patches
        self.linear_proj = nn.Linear(input_dim, output_dim)  # Linear projection to reduce features to 256

    def forward(self, x):
        # x shape: [256, 1, 1024]
        x = x.squeeze(1)  # Shape: [256, 1024]
        
        # Reshape to [16, 16, 1024] (simulating the spatial structure of patches)
        x = x.view(16, 16, 1024).permute(2, 0, 1)  # Shape: [1024, 16, 16]
        
        # Apply average pooling to reduce spatial dimensions to 7x7 grid
        pooled_features = self.pool(x.unsqueeze(0))  # Shape: [1, 1024, 7, 7]
        
        # Flatten to 49 patches
        pooled_features = pooled_features.flatten(2).permute(0, 2, 1)  # Shape: [1, 49, 1024]
        
        # Apply linear projection to reduce to 256 features
        output = self.linear_proj(pooled_features)  # Shape: [1, 49, 256]
        
        return output

# Example usage
dino_features = torch.randn(256, 1, 1024)  # Example input of shape [256, 1, 1024]
pooling_projection_model = PoolingAndProjection()
output = pooling_projection_model(dino_features)

print("Output shape:", output.shape)  # Expected shape: [1, 49, 256]

Output shape: torch.Size([1, 49, 256])


In [21]:
#########################################################################################################
#########################################################################################################

In [None]:
## dino object detection: https://github.com/facebookresearch/dinov2/issues/350
## https://zburkett.io/ai/2023/09/24/pca-object-detection.html
## https://github.com/itsprakhar/Yolo-DinoV2/blob/bd05c8b0afabfa39cb2cd7b1d915093c3963c37b/ultralytics/nn/modules/pretrained_vit.py#L15