In [4]:
from transformers import AutoProcessor, BlipForConditionalGeneration

# Hub id the input processor (image preprocessing + tokenizer) is loaded from.
PROCESSOR_CHECKPOINT = "Salesforce/blip-image-captioning-base"
# Checkpoint the captioning weights are loaded from
# (presumably a local fine-tuned model directory — confirm).
MODEL_CHECKPOINT = "blip-finetuned"

processor = AutoProcessor.from_pretrained(PROCESSOR_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

In [5]:
import os
import base64

def save_annot(image, annot):
    """Store a caption and the base64-encoded image under ``data/<image stem>/``.

    Two files are written into the folder: ``annotation.txt`` containing
    ``annot`` and ``image.txt`` containing the base64 text of the image
    file's raw bytes.

    Args:
        image: Path to the image file on disk.
        annot: Caption text to store next to the image.

    Returns:
        None. When the target folder already exists nothing is written and
        the existing contents are left untouched.
    """
    # basename/splitext honour the platform's path separator and strip only
    # the final extension, so "a.v2.jpg" maps to "a.v2" instead of "a".
    image_name = os.path.splitext(os.path.basename(image))[0]
    data_path = os.path.join('data', image_name)

    try:
        # makedirs also creates the parent "data" folder on the first call.
        os.makedirs(data_path)
    except FileExistsError:
        # Already processed (or name collision): keep what is there.
        return

    with open(os.path.join(data_path, "annotation.txt"), "w", encoding="utf-8") as f:
        f.write(annot)

    # Read through a context manager so the image file handle is closed.
    with open(image, "rb") as f:
        base64_image = base64.b64encode(f.read())

    with open(os.path.join(data_path, "image.txt"), "w") as f:
        f.write(base64_image.decode("utf-8"))

In [6]:
from PIL import Image
import time
from tqdm import tqdm
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The inputs below are moved to `device`, so the model has to live on the
# same device; otherwise generate() fails with a device mismatch on CUDA.
model.to(device)

# Caption every file in images/, persist the result through save_annot and
# then delete the source file.
# NOTE(review): save_annot returns without writing anything when its target
# folder already exists, yet the source image is still removed below —
# confirm this is intended, since a skipped image is lost.
for image_name in tqdm(os.listdir("images")):

    # Skip the macOS folder-metadata file; it is not an image.
    if image_name == ".DS_Store":
        continue

    image_path = os.path.join("images", image_name)

    # Close the image file once it has been preprocessed so the handle is
    # not leaked and the file can be deleted afterwards on every platform.
    with Image.open(image_path) as image:
        inputs = processor(images=image, return_tensors="pt").to(device)
    pixel_values = inputs.pixel_values

    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    # A single image was passed in, so take the only decoded caption.
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    save_annot(image_path, generated_caption)

    os.remove(image_path)
    

100%|██████████| 16647/16647 [8:41:49<00:00,  1.88s/it]  
