In [1]:
import sys
sys.path.append("/home/siyi/project/PCL/pcl")

In [1]:
import argparse
import builtins
import math
import os
import random
import shutil
import time
import warnings
from tqdm import tqdm
import numpy as np
import faiss

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

In [2]:
p1 = torch.rand(size=(1, 256, 80, 80))
p2 = torch.rand(size=(1, 512, 40, 40))
p3 = torch.rand(size=(1, 1024, 20, 20))

In [4]:
p1 = nn.functional.adaptive_avg_pool2d(p1, (20, 20))  # torch.Size([1, 256, 20, 20])
p2 = nn.functional.adaptive_avg_pool2d(p2, (20, 20))  # torch.Size([1, 512, 20, 20])
fused = torch.cat((p1, p2, p3), dim=1)

In [6]:
fused.shape

torch.Size([1, 1792, 20, 20])

In [4]:
print("=> creating model '{}'".format("resnet50"))

=> creating model 'resnet50'


In [5]:
import torch
import torch.nn as nn
from random import sample

In [32]:
"""
@Description :
@Author      : siyiren1@foxmail.com
@Time        : 2024/07/21 15:33:48
"""

import torch
import torch.nn as nn
from random import sample
from neck import Yolov8Neck


class DetectionCL(nn.Module):
    """
    Build a DetectionCL model, change the MLP layer into detection layer.
    """
    def __init__(self, base_encoder, head=None, dim=128, r=10, m=0.999, T=0.1, loss_lambda=0.5,  mlp=True) -> None:
        super(DetectionCL, self).__init__()

        self.r, self.m, self.T = r, m, T

        # 创建编码器 其中num_classes=dim是fc层的输出维度
        self.encoder_q = nn.Sequential(
            base_encoder(num_classes=dim),
            nn.Sequential()
        )
        self.encoder_k = nn.Sequential(
            base_encoder(num_classes=dim),
            nn.Sequential()
        )

        # 硬编码mlp层
        if mlp:
            dim_mlp = self.encoder_q[0].fc.weight.shape[1]
            # 删除原avgpool/fc层 并替换mlp
            self.encoder_q[0].avgpool = nn.Identity()
            self.encoder_q[0].avgpool = nn.Identity()
            self.encoder_q[0].fc = nn.Identity()
            self.encoder_k[0].avgpool = nn.Identity()
            self.encoder_k[0].fc = nn.Identity()

            # 更新neck
            # self.encoder_q[1] = Yolov8Neck()
            # self.encoder_k[1] = Yolov8Neck()

        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data.copy_(param_q.data)  # 初始化encoder_k的参数为encoder_q的参数
            param_k.requires_grad = False  # encoder_k不进行梯度更新

        # 创建两个队列 分别为global和dense
        self.register_buffer("queue", torch.randn(dim, r))
        self.queue = nn.functional.normalize(self.queue, dim=0)
        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

        self.register_buffer("queue2",  torch.randn(dim, r))
        self.queue2 = nn.functional.normalize(self.queue2, dim=0)
        self.register_buffer("queue2_ptr", torch.zeros(1, dtype=torch.long))

    
    @torch.no_grad()
    def _momentum_update_key_encoder(self):
        """
        Momentum update of the key encoder
        """
        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
            param_k.data = param_k.data * self.m + param_q.data * (1. - self.m)

    
    @torch.no_grad()
    def _dequeue_and_enqueue(self, keys):
        # gather keys before updating queue
        keys = concat_all_gather(keys)

        batch_size = keys.shape[0]

        ptr = int(self.queue_ptr)
        assert self.queue_len % batch_size == 0  # for simplicity

        # replace the keys at ptr (dequeue and enqueue)
        self.queue[:, ptr:ptr + batch_size] = keys.transpose(0, 1)
        ptr = (ptr + batch_size) % self.queue_len  # move pointer

        self.queue_ptr[0] = ptr

    
    @torch.no_grad()
    def _dequeue_and_enqueue2(self, keys):
        # gather keys before updating queue
        keys = concat_all_gather(keys)

        batch_size = keys.shape[0]

        ptr = int(self.queue2_ptr)
        assert self.queue_len % batch_size == 0  # for simplicity

        # replace the keys at ptr (dequeue and enqueue)
        self.queue2[:, ptr:ptr + batch_size] = keys.transpose(0, 1)
        ptr = (ptr + batch_size) % self.queue_len  # move pointer

        self.queue2_ptr[0] = ptr


    @torch.no_grad()
    def _batch_shuffle_ddp(self, x):
        """
        Batch shuffle, for making use of BatchNorm.
        *** Only support DistributedDataParallel (DDP) model. ***
        """
        # gather from all gpus
        batch_size_this = x.shape[0]
        x_gather = concat_all_gather(x)
        batch_size_all = x_gather.shape[0]

        num_gpus = batch_size_all // batch_size_this

        # random shuffle index
        idx_shuffle = torch.randperm(batch_size_all).cuda()

        # broadcast to all gpus
        torch.distributed.broadcast(idx_shuffle, src=0)

        # index for restoring
        idx_unshuffle = torch.argsort(idx_shuffle)

        # shuffled index for this gpu
        gpu_idx = torch.distributed.get_rank()
        idx_this = idx_shuffle.view(num_gpus, -1)[gpu_idx]

        return x_gather[idx_this], idx_unshuffle
    

    @torch.no_grad()
    def _batch_unshuffle_ddp(self, x, idx_unshuffle):
        """
        Undo batch shuffle.
        *** Only support DistributedDataParallel (DDP) model. ***
        """
        # gather from all gpus
        batch_size_this = x.shape[0]
        x_gather = concat_all_gather(x)
        batch_size_all = x_gather.shape[0]

        num_gpus = batch_size_all // batch_size_this

        # restored index for this gpu
        gpu_idx = torch.distributed.get_rank()
        idx_this = idx_unshuffle.view(num_gpus, -1)[gpu_idx]

        return x_gather[idx_this]
    

    def forward(self, im_q, im_k=None, is_eval=False, cluster_global=None, cluster_dense=None, index=None):
        pass


    def get_encoderq_features(self, im_q):
        features = im_q
        output_list = []
        for name, layer in self.encoder_q[0].named_children():
            if name == 'avgpool':
                break
            features = layer(features)
            if name.startswith("layer"):
                output_list.append(features)
        return output_list, features


# utils
@torch.no_grad()
def concat_all_gather(tensor):
    """
    Performs all_gather operation on the provided tensors.
    *** Warning ***: torch.distributed.all_gather has no gradient.
    """
    tensors_gather = [torch.ones_like(tensor)
        for _ in range(torch.distributed.get_world_size())]
    torch.distributed.all_gather(tensors_gather, tensor, async_op=False)

    output = torch.cat(tensors_gather, dim=0)
    return output

In [33]:
base_encoder = models.__dict__["resnet50"]

In [34]:
model = DetectionCL(base_encoder)

In [38]:
x = torch.Tensor(1, 3, 640, 640)

In [39]:
output_list, features = model.get_encoderq_features(x)

In [40]:
output = model.encoder_q[0](x)

In [41]:
for f in output_list:
    print(f.shape)

torch.Size([1, 256, 160, 160])
torch.Size([1, 512, 80, 80])
torch.Size([1, 1024, 40, 40])
torch.Size([1, 2048, 20, 20])


In [37]:
print(features.shape)

torch.Size([1, 2048, 7, 7])


In [5]:
import torch
import torch.nn as nn
from ultralytics.nn.modules.block import C2f

class Yolov8Head(nn.Module):
    def __init__(self, channels):
        super(Yolov8Head, self).__init__()
        # channels is a list of input channels for each stage
        c1, c2, c3, c4 = channels

        self.up1 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f_1 = C2f(c3 + c4, 512)
        
        self.up2 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f_2 = C2f(512 + c2, 256)
        
        self.conv2 = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)
        
        self.c2f_3 = C2f(256 + c2, 512)
        self.conv3 = nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1)
        
        self.c2f_4 = C2f(512 + c4, 1024)

        # FPN layers
        self.lateral_conv1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
        self.lateral_conv2 = nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0)
        self.lateral_conv3 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)        

        self.smooth_conv1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth_conv2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth_conv3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)


    def forward(self, input_list):
        assert len(input_list) == 4
        x4, x3, x2, x1 = input_list

        x = self.up1(x1)    # torch.Size([1, 2048, 40, 40])
        print("up: ", x.shape)
        x = torch.cat((x, x2), dim=1) # torch.Size([1, 3072, 40, 40])
        print("concat: ", x.shape)
        x = self.c2f_1(x)   # torch.Size([1, 512, 40, 40])
        print("c2f: ", x.shape)
        hidden_x = x

        x = self.up2(x)
        print("up: ", x.shape)
        x = torch.cat((x, x3), dim=1)
        print("concat: ", x.shape)
        x = self.c2f_2(x)
        p1 = x
        print("c2f: ", x.shape)

        x = self.conv2(x)
        print("conv: ", x.shape)
        x = torch.cat((x, hidden_x), dim=1)
        print("concat: ", x.shape)
        x = self.c2f_3(x)
        p2 = x
        print("c2f: ", x.shape)

        x = self.conv3(x)
        print("conv: ", x.shape)
        x = torch.cat((x, x1), dim=1)
        print("concat: ", x.shape)
        x = self.c2f_4(x)
        p3 = x
        print("c2f: ", x.shape)
        
        return x

In [6]:
# channels = [feature.shape[1] for feature in output_list]
# model = Yolov8Head(channels)
channels = [256, 512, 1024, 2048]
model = Yolov8Head(channels)
input_list = [
    torch.randn(1, 256, 160, 160),
    torch.randn(1, 512, 80, 80),
    torch.randn(1, 1024, 40, 40),
    torch.randn(1, 2048, 20, 20)
]
output = model(input_list)

up:  torch.Size([1, 2048, 40, 40])
concat:  torch.Size([1, 3072, 40, 40])
c2f:  torch.Size([1, 512, 40, 40])
up:  torch.Size([1, 512, 80, 80])
concat:  torch.Size([1, 1024, 80, 80])
c2f:  torch.Size([1, 256, 80, 80])
conv:  torch.Size([1, 256, 40, 40])
concat:  torch.Size([1, 768, 40, 40])
c2f:  torch.Size([1, 512, 40, 40])
conv:  torch.Size([1, 512, 20, 20])
concat:  torch.Size([1, 2560, 20, 20])
c2f:  torch.Size([1, 1024, 20, 20])


In [None]:
c2f:  torch.Size([1, 256, 80, 80])
c2f:  torch.Size([1, 512, 40, 40])
c2f:  torch.Size([1, 1024, 20, 20])

In [104]:
import torch
import torch.nn as nn
from ultralytics.nn.modules.block import C2f

class Yolov8Head(nn.Module):
    def __init__(self, channels):
        super(Yolov8Head, self).__init__()
        # channels is a list of input channels for each stage
        c1, c2, c3, c4 = channels

        self.up1 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f_1 = C2f(c3 + c4, 512)
        
        self.up2 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f_2 = C2f(512 + c2, 256)
        
        self.conv2 = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)
        
        self.c2f_3 = C2f(256 + c2, 512)
        self.conv3 = nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1)
        
        self.c2f_4 = C2f(512 + c4, 1024)

        # FPN layers
        self.lateral_conv1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
        self.lateral_conv2 = nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0)
        self.lateral_conv3 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)        

        self.smooth_conv1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth_conv2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth_conv3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        # Output layers for final feature maps
        self.output_conv = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

    def forward(self, input_list):
        assert len(input_list) == 4
        x4, x3, x2, x1 = input_list

        # Bottom-up pathway
        x = self.up1(x1)    # torch.Size([1, 2048, 40, 40])
        x = torch.cat((x, x2), dim=1) # torch.Size([1, 3072, 40, 40])
        x = self.c2f_1(x)   # torch.Size([1, 512, 40, 40])
        hidden_x = x

        x = self.up2(x)
        x = torch.cat((x, x3), dim=1)
        x = self.c2f_2(x)
        p1 = x

        x = self.conv2(x)
        x = torch.cat((x, hidden_x), dim=1)
        x = self.c2f_3(x)
        p2 = x

        x = self.conv3(x)
        x = torch.cat((x, x1), dim=1)
        x = self.c2f_4(x)
        p3 = x

        # Top-down pathway and lateral connections
        print(p3.shape)
        print(p2.shape)
        p3_upsampled = self.up1(p3)   # Upsample to match p2
        p2 = self.lateral_conv1(p2) + p3_upsampled
        p2 = self.smooth_conv1(p2)

        p2_upsampled = self.up1(p2)   # Upsample to match p1
        p1 = self.lateral_conv2(p1) + p2_upsampled
        p1 = self.smooth_conv2(p1)

        # Final output
        p1 = self.smooth_conv3(p1)

        return p1

# Example usage:
channels = [256, 512, 1024, 2048]
model = Yolov8Head(channels)
input_list = [
    torch.randn(1, 256, 160, 160),
    torch.randn(1, 512, 80, 80),
    torch.randn(1, 1024, 40, 40),
    torch.randn(1, 2048, 20, 20)
]
output = model(input_list)
print(output.shape)  # Expected output shape will depend on the final FPN output

torch.Size([1, 1024, 20, 20])
torch.Size([1, 512, 40, 40])


RuntimeError: Given groups=1, weight of size [256, 1024, 1, 1], expected input[1, 512, 40, 40] to have 1024 channels, but got 512 channels instead

In [105]:
import torch
import torch.nn as nn
from ultralytics.nn.modules.block import C2f

class Yolov8Head(nn.Module):
    def __init__(self, channels):
        super(Yolov8Head, self).__init__()
        # channels is a list of input channels for each stage
        c1, c2, c3, c4 = channels

        self.up1 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f_1 = C2f(c3 + c4, 512)
        
        self.up2 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f_2 = C2f(512 + c2, 256)
        
        self.conv2 = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)
        
        self.c2f_3 = C2f(256 + c2, 512)
        self.conv3 = nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1)
        
        self.c2f_4 = C2f(512 + c4, 1024)

        # FPN layers
        self.lateral_conv1 = nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0)
        self.lateral_conv2 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)
        self.lateral_conv3 = nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0)        

        self.smooth_conv1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth_conv2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth_conv3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        # Output layers for final feature maps
        self.output_conv = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

    def forward(self, input_list):
        assert len(input_list) == 4
        x4, x3, x2, x1 = input_list

        # Bottom-up pathway
        x = self.up1(x1)    # torch.Size([1, 2048, 40, 40])
        x = torch.cat((x, x2), dim=1) # torch.Size([1, 3072, 40, 40])
        x = self.c2f_1(x)   # torch.Size([1, 512, 40, 40])
        hidden_x = x

        x = self.up2(x)
        x = torch.cat((x, x3), dim=1)
        x = self.c2f_2(x)
        p1 = x

        x = self.conv2(x)
        x = torch.cat((x, hidden_x), dim=1)
        x = self.c2f_3(x)
        p2 = x

        x = self.conv3(x)
        x = torch.cat((x, x1), dim=1)
        x = self.c2f_4(x)
        p3 = x

        print("p3: ", p3.shape)
        print("p2: ", p2.shape)

        # Top-down pathway and lateral connections
        p3_upsampled = self.up1(p3)   # Upsample to match p2
        print("p3_upsampled: ", p3_upsampled.shape)
        p2 = self.lateral_conv1(p2) + p3_upsampled
        p2 = self.smooth_conv1(p2)

        p2_upsampled = self.up1(p2)   # Upsample to match p1
        print("p2_upsampled: ", p2_upsampled.shape)
        p1 = self.lateral_conv2(p1) + p2_upsampled
        p1 = self.smooth_conv2(p1)

        # Final output
        p1 = self.smooth_conv3(p1)

        return p1

# Example usage:
channels = [256, 512, 1024, 2048]
model = Yolov8Head(channels)
input_list = [
    torch.randn(1, 256, 160, 160),
    torch.randn(1, 512, 80, 80),
    torch.randn(1, 1024, 40, 40),
    torch.randn(1, 2048, 20, 20)
]
output = model(input_list)
print(output.shape)  # Expected output shape will depend on the final FPN output

p3:  torch.Size([1, 1024, 20, 20])
p2:  torch.Size([1, 512, 40, 40])
p3_upsampled:  torch.Size([1, 1024, 40, 40])


RuntimeError: The size of tensor a (256) must match the size of tensor b (1024) at non-singleton dimension 1

In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


def euclidean_dist(x, y):  # x:[n, d]  y:[m, d]
  xx = (x*x).sum(dim=1, keepdim=True)  # (n, 1)
  yy = (y*y).sum(dim=1, keepdim=True).transpose(0, 1)  # (1, m)
  xy = torch.mm(x, y.transpose(0, 1))  # (n, m)
  return xx - 2*xy + yy


def cosin_sim(x, y):  # x:[n, d]  y:[m, d]
  x = x.unsqueeze(1)  # [n, 1, d]
  y = y.unsqueeze(0)  # [1, m, d]
  return F.cosine_similarity(x, y, dim=-1)

In [3]:
features = torch.rand(size=(4, 2, 32, 32))

In [5]:
features = features.transpose(0, 1).reshape(-1, *features.size()[-3:])

In [6]:
probs = (logits/np.sqrt(features.size(1))).softmax(1)  # [N, C]

torch.Size([4, 2, 32, 32])

In [2]:
class CPN_WCP(nn.Module):  # CPN + Wasserstein Confidence Penalty
  def __init__(self, backbone, aug_num, gamma):
    super(CPN_WCP, self).__init__()
    self.feature = backbone()
    self.aug_num = aug_num
    self.gamma = gamma

  # def cuda(self):
  #   self.feature = nn.DataParallel(self.feature, device_ids=[0, 1, 2, 3]).cuda()
  #   return self

  def forward(self, x, temp=5.):  # x:[batch, aug_num, C, H, W]
    batch = x.size(0)
    x = x.transpose(0, 1).reshape(-1, *x.size()[-3:])  # [aug_num*batch, C, H, W]
    features = self.feature(x)  # [aug_num*batch, d]
    eudis = euclidean_dist(features, features)  # [aug_num*batch, aug_num*batch]
    logits = -eudis.reshape(self.aug_num*batch*self.aug_num, batch)  # [aug_num*batch*aug_num, batch]
    # Cross Entropy Loss
    targets = torch.from_numpy(np.repeat(range(batch), self.aug_num)).repeat(self.aug_num).cuda()  # [aug_num*batch*aug_num]
    ce_loss = nn.CrossEntropyLoss()(logits/temp, targets)
    # Wasserstein Distance Regularization
    probs = (logits/np.sqrt(features.size(1))).softmax(1)  # [N, C]
    target_probs = torch.ones(probs.size()).cuda()/batch  # [N, C]
    with torch.no_grad():
      features = features.reshape(self.aug_num, batch, -1).mean(0)  # [batch, d]
      cost = 1.-cosin_sim(features, features)  # [batch, batch]
      cost = (cost-cost.min(-1, keepdims=True)[0])/(cost.max(-1, keepdims=True)[0]-cost.min(-1, keepdims=True)[0])
      cost = (self.gamma*cost+torch.eye(batch).cuda()).unsqueeze(0).repeat(probs.size(0), 1, 1)
    wcp_loss = self.SinkhornDistance(probs, target_probs, cost)

    loss = ce_loss + wcp_loss
    return loss

  def M(self, C, u, v, eps):
    "Modified cost for logarithmic updates"
    return (-C+u.unsqueeze(-1)+v.unsqueeze(-2))/eps

  def SinkhornDistance(self, p1, p2, C, itr=5, eps=0.5):
    u = torch.zeros_like(p1)
    v = torch.zeros_like(p2)
    for _ in range(itr):
      u = eps*(torch.log(p1+1e-12)-torch.logsumexp(self.M(C, u, v, eps), dim=-1)) + u
      v = eps*(torch.log(p2+1e-12)-torch.logsumexp(self.M(C, u, v, eps).transpose(-2, -1), dim=-1)) + v

    pi = torch.exp(self.M(C, u, v, eps))
    return (pi*C).sum((-2, -1)).mean()