In [1]:
import open_clip
import torch
from PIL import Image

In [10]:
# Scratch check of a multi-axis reduction on a random (16, 384, 384) tensor.
tensor = torch.rand(16, 384, 384)
# Reduce over dims 1 and 2 together, leaving one sum per entry along dim 0.
sum_tensor = tensor.sum(dim=(1, 2))

print(sum_tensor.shape)

torch.Size([16])


In [2]:
# Build the ViT-B-32 CLIP model with the 'laion2b_s34b_b79k' pretrained weights.
# create_model_and_transforms returns three values; the middle one is not
# needed here and is discarded, and `preprocess` is kept for the input image.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32', pretrained='laion2b_s34b_b79k'
)
# The open_clip README notes that the model is returned in train mode, which
# can affect models with BatchNorm or stochastic depth; this notebook only
# runs inference, so switch to eval mode explicitly.
model.eval()
# Tokenizer matching the same architecture name.
tokenizer = open_clip.get_tokenizer('ViT-B-32')

In [3]:
# Tokenize the two candidate captions that the image will be scored against.
text = tokenizer([
    "a diagram",
    "a cat",
])
# Load CLIP.png, run it through the model's preprocessing transform, and add
# a leading batch axis so a single image can be passed to the encoder.
image = torch.unsqueeze(preprocess(Image.open("CLIP.png")), 0)

In [4]:
# Encode the image and the captions without gradient tracking, then turn the
# scaled similarities into per-caption probabilities.
# torch.autocast(device_type="cuda") is the current spelling of the deprecated
# torch.cuda.amp.autocast().
# NOTE(review): no visible cell moves `model`, `image` or `text` to a CUDA
# device, so CUDA autocast presumably has no effect here — confirm.
with torch.no_grad(), torch.autocast(device_type="cuda"):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # Scale every feature row to unit L2 norm (in place) so the matrix product
    # below is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Similarities are multiplied by 100 before the softmax over the caption
    # axis; each row of text_probs therefore sums to 1.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)


In [6]:
# Inspect the dimensions of the encoded image batch.
image_features.size()

torch.Size([1, 512])

In [7]:
# Inspect the dimensions of the encoded caption batch.
text_features.size()

torch.Size([2, 512])

In [5]:
# Show the per-caption probabilities computed for the image.
print("Label probs:", text_probs)  # two captions -> one row of two probabilities, e.g. [[0.9999, 0.0001]]

Label probs: tensor([[9.9991e-01, 8.5352e-05]])


In [9]:
# Join the two token tensors along the first axis.
# NOTE(review): `text2` is not defined in any visible cell, so this cell relies
# on leftover kernel state. The recorded output (2, 1, 77) also suggests that
# `text` held a different value when this ran than the two-caption tensor
# built earlier — confirm before relying on it.
array = torch.cat((text, text2), dim=0)
array.size()

torch.Size([2, 1, 77])

In [15]:
# Re-encode the tokenized captions. Unlike the earlier inference cell, this
# call is not wrapped in torch.no_grad()/autocast and the result is not
# normalised; it also overwrites the normalised `text_features` from that cell.
text_features = model.encode_text(text)

In [16]:
# Inspect the dimensions of the freshly re-encoded caption features.
text_features.size()

torch.Size([2, 512])

In [6]:
from models_mae_cross import SupervisedMAE
from torchsummary import summary

# Instantiate the project's SupervisedMAE with its default constructor arguments.
# NOTE(review): this rebinds `model`, which earlier cells use for the CLIP
# model (model.encode_image / model.encode_text); re-running those cells after
# this one will pick up the wrong object.
model = SupervisedMAE()
# Print the layer / parameter-count table. The recorded output shows the table
# twice, presumably once printed by summary() and once echoed as the cell's
# last value; the trailing semicolon suppresses that echo.
summary(model, input_size=(3, 384, 384));

Layer (type:depth-idx)                   Param #
├─PatchEmbed: 1-1                        --
|    └─Conv2d: 2-1                       787,456
|    └─Identity: 2-2                     --
├─ModuleList: 1-2                        --
|    └─Block: 2-3                        --
|    |    └─LayerNorm: 3-1               2,048
|    |    └─Attention: 3-2               4,198,400
|    |    └─Identity: 3-3                --
|    |    └─Identity: 3-4                --
|    |    └─LayerNorm: 3-5               2,048
|    |    └─Mlp: 3-6                     8,393,728
|    |    └─Identity: 3-7                --
|    |    └─Identity: 3-8                --
|    └─Block: 2-4                        --
|    |    └─LayerNorm: 3-9               2,048
|    |    └─Attention: 3-10              4,198,400
|    |    └─Identity: 3-11               --
|    |    └─Identity: 3-12               --
|    |    └─LayerNorm: 3-13              2,048
|    |    └─Mlp: 3-14                    8,393,728
|    |    └─Identity: 3-15

Layer (type:depth-idx)                   Param #
├─PatchEmbed: 1-1                        --
|    └─Conv2d: 2-1                       787,456
|    └─Identity: 2-2                     --
├─ModuleList: 1-2                        --
|    └─Block: 2-3                        --
|    |    └─LayerNorm: 3-1               2,048
|    |    └─Attention: 3-2               4,198,400
|    |    └─Identity: 3-3                --
|    |    └─Identity: 3-4                --
|    |    └─LayerNorm: 3-5               2,048
|    |    └─Mlp: 3-6                     8,393,728
|    |    └─Identity: 3-7                --
|    |    └─Identity: 3-8                --
|    └─Block: 2-4                        --
|    |    └─LayerNorm: 3-9               2,048
|    |    └─Attention: 3-10              4,198,400
|    |    └─Identity: 3-11               --
|    |    └─Identity: 3-12               --
|    |    └─LayerNorm: 3-13              2,048
|    |    └─Mlp: 3-14                    8,393,728
|    |    └─Identity: 3-15

: 