In [2]:
import numpy as np
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
# 自定义数据集类，用于加载图像
class ImageDataset(Dataset):
    def __init__(self, image_files, transform=None):
        self.image_files = image_files
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image_path = self.image_files[idx]
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image

def blip_inference_and_save(image_files, batch_size=8, device='cuda', output_file='image_embeds.npy'):
    # 加载 BLIP 模型和处理器
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    # 使用 DataParallel 包装模型，支持多GPU推理
    model = torch.nn.DataParallel(model)
    model = model.to(device)

    # 定义图像预处理变换
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    # 创建自定义数据集和DataLoader
    dataset = ImageDataset(image_files, transform=transform)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    all_embeds = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # 将图像批量转移到设备（例如GPU）
            batch = batch.to(device)
            # 通过processor生成输入
            inputs = processor(images=batch, return_tensors="pt").to(device)
            # 执行模型推理
            outputs = model.module.vision_model(pixel_values=inputs["pixel_values"])
            # 提取图像嵌入特征
            image_embeds = outputs[1]
            all_embeds.append(image_embeds.cpu().numpy())  # 将结果转移到CPU并转换为numpy数组
    
    # 拼接所有特征向量
    all_embeds = np.concatenate(all_embeds, axis=0)
    
    # 保存为.npy文件
    np.save(output_file, all_embeds)
    print(f"特征向量已保存到 {output_file}")


In [3]:
import os
from tqdm import tqdm
def list_images_in_folder_recursive(folder_path):
    # 支持的图片格式
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff')

    # 存储所有图片文件的路径
    image_files = []

    # 使用os.walk递归遍历文件夹及其子文件夹
    for root, dirs, files in tqdm(os.walk(folder_path)):
        for file in files:
            if file.lower().endswith(image_extensions):
                image_files.append(os.path.join(root, file))

    return image_files


In [4]:

# 输入文件夹路径
folder_path = "/public/home/msskx/shot5/data" # 用户输入文件夹路径

# 输出所有图片文件的路径
image_files = list_images_in_folder_recursive(folder_path)


137it [00:00, 1292.10it/s]


In [5]:


# 批量推理并保存特征向量
batch_size = 8  # 根据你的GPU显存调整
device = 'cuda' if torch.cuda.is_available() else 'cpu'
output_file = 'image_embeds.npy'  # 输出文件的名字

blip_inference_and_save(image_files, batch_size=batch_size, device=device, output_file=output_file)


  return self.fget.__get__(instance, owner)()
  0%|          | 0/575 [00:00<?, ?it/s]It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
100%|██████████| 575/575 [02:10<00:00,  4.39it/s]

特征向量已保存到 image_embeds.npy





In [7]:
d = np.load("image_embeds.npy")