In [1]:
import torch
from PIL import Image
from torchvision import transforms
from models.blip import blip_decoder

In [2]:
def load_image(image_path: str, image_size: int, device: str) -> torch.Tensor:
    """读取图片并做与 CLIP/BLIP 一致的预处理，返回 1×3×H×W 的 Tensor."""
    img = Image.open(image_path).convert("RGB")
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=Image.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.48145466, 0.4578275, 0.40821073],
            std=[0.26862954, 0.26130258, 0.27577711],
        ),
    ])
    return transform(img).unsqueeze(0).to(device)

In [4]:
# 使用 ViT-B 的 caption checkpoint
ckpt_path   = "/root/autodl-fs/pretrain_UniLSeg/model_caption_coco_large.pth"
# 如果想用 ViT-L，则：
# ckpt_path = "/root/autodl-fs/pretrain_BLIP/model_caption_coco_large.pth"
# vit="large"
vit_type    = "large"  

In [5]:
# —— 加载模型 —— 
model = blip_decoder(pretrained=ckpt_path, image_size=image_size, vit=vit_type)
model.eval().to(device)

BertLMHeadModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


load checkpoint from /root/autodl-fs/pretrain_UniLSeg/model_caption_coco_large.pth


BLIP_Decoder(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1024, out_features=3072, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        

In [24]:
# —— 配置区域 —— 
i = 45
device      = "cuda" if torch.cuda.is_available() else "cpu"
# image_path  = "/root/data_preprocessing/box_labeled/sam_test/COCO_train2014_000000098304_ann116865_overlay.jpg"
image_path  = f"/root/data_preprocessing/box_labeled/sam_test/COCO_train2014_000000098304_ann116865_obj_{i}.png"

image_size  = 384

In [25]:
# —— 读取并预处理图片 —— 
image = load_image(image_path, image_size, device)

# —— 生成描述 —— 
with torch.no_grad():
    caption = model.generate(
        image,
        sample=False,      # beam search
        num_beams=1,       # beam size
        max_length=20,     # 最长 20 token
        min_length=5,      # 最短 5 token
    )
print("🤖 BLIP Caption:", caption[0])

🤖 BLIP Caption: a giraffe is standing in the grass with a frisbee
