In [1]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import os
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
torch.cuda.is_available()

True

In [3]:
# ! possibly change dir_path

BATCH_SIZE = 30
DIR_PATH = r"C:\Users\FMari\Sirius2024\images_final"

In [4]:
class IC_model():
    """
    Класс, который принимает на вход изображения
    и выдает описания к ним
    """
    def __init__(self):
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", 
                                                                  torch_dtype=torch.float16)
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()
        

    def predict(self, img_paths, conditional=False, texts=None) -> str:
        """
        :img_paths: list(str) - путь, где лежит картинка
        :conditional: bool, optional - с условием или без
        :texts: list(str), optional - текст, на который должен быть ориентирован ответ
        """   
        
        images = [Image.open(path).convert(mode="RGB") for path in img_paths]
        if conditional:
            inputs = self.processor(images, 
                                    text=texts, 
                                    return_tensors="pt").to(self.device, torch.float16)
        else:
            inputs = self.processor(images, 
                                    return_tensors="pt").to(self.device, torch.float16)

        outs = self.model.generate(**inputs)
        answers = self.processor.batch_decode(outs, skip_special_tokens=True)
        
        return answers

In [5]:
model = IC_model()



In [6]:
img_paths = [os.path.join(DIR_PATH, p) for p in os.listdir(DIR_PATH)]

In [7]:
captions = np.empty(len(img_paths), dtype="<U100")

In [11]:
for idx in range(0, len(img_paths), BATCH_SIZE):
    stop = min(len(img_paths), idx + BATCH_SIZE)
    
    preds = np.array(model.predict(img_paths[idx:stop]))
    captions[idx:stop] = preds
    
    if idx % 100 == 0:
        print(idx)

0
300


In [12]:
captions

array(['a close up of a small backpack with a zipper on the front',
       'arafed woman in black and white striped shirt and black pants',
       'araffe woman in red pajamas posing for a picture',
       'woman in black coat standing in front of a window looking out',
       'a close up of a glass bowl on a table',
       'arafed image of a little girl in a black onesuit',
       'there is a small christmas tree on a table with a red pot',
       'a close up of a pair of shorts with a white trim',
       'a woman in a pink coat and jeans standing in front of a brick wall',
       'arafed bed with a plaid comforter and pillows on a wooden floor',
       'someone is putting on a pair of brown shoes with holes',
       'a close up of a black box with a green eye shadow',
       'there is a red heart shaped object hanging from a string',
       'pair of pink gloves with rose embroidered on the wrist',
       'a close up of a mannequin torso with a white shirt',
       'purple thermos are

In [13]:
DIR2SAVE = r"C:\Users\FMari\Sirius2024\short_captions"

In [14]:
filenames = os.listdir(DIR_PATH)

for idx in range(len(img_paths)):
    caption = captions[idx]
    name = filenames[idx].split('.')[0]
    with open(os.path.join(DIR2SAVE, name+".txt"), "w") as f:
        f.write(caption)