In [1]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import os
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# ! possibly change dir_path

batch_size = 30
dir_path = r'C:\Users\FMari\TinkoffSirius\images'

In [2]:
class IC_model():
    """
    Класс, который принимает на вход изображения
    и выдает описания к ним
    """
    def __init__(self):
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16)
        
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()
        

    def predict(self, img_paths, conditional=False, texts=None) -> str:
        """
        :img_paths: list(str) - путь, где лежит картинка
        :conditional: bool, optional - с условием или без
        :texts: list(str), optional - текст, на который должен быть ориентирован ответ
        """   
        
        images = [Image.open(path).convert(mode="RGB") for path in img_paths]
        if conditional:
            inputs = self.processor(images, 
                                    text=texts, 
                                    return_tensors="pt").to(self.device, torch.float16)
        else:
            inputs = self.processor(images, 
                                    return_tensors="pt").to(self.device, torch.float16)

        outs = self.model.generate(**inputs)
        answers = self.processor.batch_decode(outs, skip_special_tokens=True)
        
        return answers

In [3]:
model = IC_model()

Создадим массив путей к картинкам, чтобы было удобнее собирать по батчам

In [6]:
img_paths = []

for chain_id in os.listdir(dir_path):
    chain_dir = os.path.join(dir_path, chain_id)
    
    for hotel_id in os.listdir(chain_dir):
        hotel_dir = os.path.join(chain_dir, hotel_id)
        
        for source in os.listdir(hotel_dir):
            path = os.path.join(hotel_dir, source)
            
            for img in os.listdir(path):
                img_paths.append(os.path.join(path, img))

In [97]:
# сохраним его

with open(r'img_paths.txt', 'w') as f:
    for path in img_paths:
        f.write("%s\n" % path)
    print('Done')

Done


In [27]:
captions = np.empty(len(img_paths), dtype="<U100")

In [28]:
for idx in range(0, len(img_paths), batch_size):
    stop = min(len(img_paths), idx+batch_size)
    preds = np.array(model.predict(img_paths[idx:stop]))
    captions[idx:stop] = preds
    
    if idx % 100 == 0:
        print(idx)

0
300
600
900
1200
1500
1800
2100
2400
2700
3000
3300
3600
3900
4200
4500
4800
5100
5400
5700
6000
6300
6600
6900
7200
7500
7800
8100
8400
8700
9000
9300
9600
9900
10200
10500
10800
11100
11400
11700
12000
12300
12600
12900


OSError: unrecognized data stream contents when reading image file

In [46]:
made_captions = captions[:13170]
np.save('made_captions', made_captions)

# TF-IDF #

In [119]:
# индекс определяет на каком месте в path.split('\\') находится папка train с изображениями
IDX0 = 0  

for i, path_part in enumerate(img_paths[0].split('\\')):
    if path_part == 'train':
        IDX0 = i

Сгруппируем созданные описания по отелям (словарь *book*) и по сетям отелей (массив *capts_for_chain*)

In [102]:
book = {}
capts_for_chain = np.zeros(85, dtype="<U10000")

for i in range(size):
    hotel = int(img_paths[i].split('\\')[IDX0+2])
    chain = int(img_paths[i].split('\\')[IDX0+1]) + 1
    capts_for_chain[chain] += ' ' + made_captions[i] + ' '
    
    try:
        book[hotel] += ' ' + made_captions[i] + ' '
    except:
        book[hotel] = ' ' + made_captions[i] + ' '

In [105]:
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(values)
vocab = vectorizer.get_feature_names_out()

Оставим только слова с высоким tf-idf, которые характеризуют отель

In [106]:
new_comms = []
for i in range(len(values)):
    idxs = tf_idf[i, :]
    non_zero_idxs = idxs.nonzero()
    new_comms.append([])
    for j in non_zero_idxs[1]:
        value = idxs[0, j]
        if value > 0.3 and value < 0.9:  # убираем слова c плохим tf-idf
            new_comms[i].append(vocab[j])

In [107]:
new_comms

[['pillow', 'brown', 'blanket', 'bed'],
 ['table', 'couch', 'living'],
 ['lamp', 'lot', 'bedroom', 'of'],
 ['mirror', 'bathtub', 'sink', 'bathroom'],
 ['sheets', 'mirror', 'arafed', 'pillows'],
 ['corner', 'cabinet', 'doorway'],
 ['bathtub', 'shower'],
 ['pillows', 'blanket'],
 ['bag', 'suitcase', 'it', 'on'],
 ['taking', 'shining', 'light', 'purple'],
 ['above', 'painting', 'and'],
 ['fireplace'],
 ['toilet', 'sink', 'shower', 'bathroom'],
 ['lamp', 'couch'],
 ['sheets', 'pillows', 'in'],
 ['lamp', 'table', 'couch', 'living'],
 ['checkered', 'picture'],
 ['herself', 'woman', 'taking'],
 ['to', 'next', 'sitting', 'shower'],
 ['small', 'toilet', 'bathtub', 'bathroom'],
 ['small', 'toilet', 'bathtub', 'bathroom'],
 ['four'],
 ['small', 'couch', 'television'],
 ['bedroom', 'it', 'desk'],
 ['toilet', 'sink', 'shower', 'bathroom'],
 ['in', 'desk', 'room', 'bed'],
 ['boy', 'little'],
 ['curtains', 'window'],
 ['freezer', 'hallway', 'refrigerator'],
 ['blankets', 'lot', 'the'],
 ['small', 'co

In [117]:
# приведем массив слов в строку слов
comms = []

for i in range(len(new_comms)):
    comms.append(', '.join(new_comms[i]))
    
comms[0:10]

['pillow, brown, blanket, bed',
 'table, couch, living',
 'lamp, lot, bedroom, of',
 'mirror, bathtub, sink, bathroom',
 'sheets, mirror, arafed, pillows',
 'corner, cabinet, doorway',
 'bathtub, shower',
 'pillows, blanket',
 'bag, suitcase, it, on',
 'taking, shining, light, purple']

In [118]:
hotel_comms = np.array(comms)
np.save('hotel_comms', hotel_comms)
np.save('hotel_keys', np.array(list(book.keys())))