<a href="https://colab.research.google.com/github/Belal-AI/FSDAwaesf/blob/main/Detr_Facebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from torch import nn
from PIL import Image
from torchvision.models import resnet50
import torch
import requests
import torchvision.transforms as T

In [5]:
class DETRdemo(nn.Module):
    """
    Demo DETR implementation.

    Demo implementation of DETR in minimal number of lines, with the
    following differences wrt DETR in the paper:
    * learned positional encoding (instead of sine)
    * positional encoding is passed at input (instead of attention)
    * fc bbox predictor (instead of MLP)
    The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
    Only batch size 1 supported.
    """
    def __init__(self, num_classes, hidden_dim=256, nheads=8,
                 num_encoder_layers=6, num_decoder_layers=6):
        super().__init__()

        # create ResNet-50 backbone
        self.backbone = resnet50()
        del self.backbone.fc

        # create conversion layer
        self.conv = nn.Conv2d(2048, hidden_dim, 1)

        # create a default PyTorch transformer
        self.transformer = nn.Transformer(
            hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

        # prediction heads, one extra class for predicting non-empty slots
        # note that in baseline DETR linear_bbox layer is 3-layer MLP
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
        self.linear_bbox = nn.Linear(hidden_dim, 4)

        # output positional encodings (object queries)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))
        self.Flatten=nn.Flatten(start_dim=2, end_dim=3)

        # spatial positional encodings
        # note that in baseline DETR we use sine positional encodings

        self.pos=nn.Parameter(torch.rand(850,1,hidden_dim),requires_grad=True)

    def forward(self, inputs):
        # propagate inputs through ResNet-50 up to avg-pool layer
        x = self.backbone.conv1(inputs)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)

        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)


        # convert from 2048 to 256 feature planes for the transformer

        h = self.conv(x)


        # construct positional encodings


        H, W = h.shape[-2:]
        print(h.shape)
        print("flattenrd: ",self.Flatten(h).permute(2, 0, 1).shape)
        print("the pose : ",self.pos.shape)
        #print(""self.query_pos.unsqueeze(1).transpose(0, 1).shape)

        # propagate through the transformer

        h = self.transformer(self.pos + 0.1 * self.Flatten(h).permute(2, 0, 1), ## 850,1,256
                             self.query_pos.unsqueeze(1)).transpose(0, 1)

        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h),
                'pred_boxes': self.linear_bbox(h).sigmoid()}

In [6]:
model=DETRdemo(91)
optmizer=torch.optim.AdamW(model.parameters())

In [7]:
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
im = Image.open(requests.get(url, stream=True).raw)
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
im=transform(im).unsqueeze(0)
s=model(im)

torch.Size([1, 256, 25, 34])
flattenrd:  torch.Size([850, 1, 256])
the pose :  torch.Size([850, 1, 256])
