In [1]:
import requests
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import os
import torch
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [35]:
class CLIP_model():
    """
    Класс, который принимает на вход изображения
    и выдает описания к ним
    """

    
    def __init__(self):
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()
        

    def predict(self, img_paths, texts) -> str:
        """
        :img_paths: list(str) - путь, где лежит картинка
        :texts: list(str) - тексты, среди которых выбирается наиболее похожий
        """   
        
        images = [Image.open(path).convert(mode="RGB") for path in img_paths]
        
        inputs = self.processor(text=texts, images=images, 
                                return_tensors="pt", padding=True)
        inputs = inputs.to(self.device)
        outputs = self.model(**inputs)
        with torch.no_grad():
            # this is the image-text similarity score
            logits_per_images = outputs.logits_per_image
            # we can take the softmax to get the label probabilities
            probs = logits_per_images.softmax(dim=1)
        return probs.cpu()

In [3]:
clip_model = CLIP_model()

In [4]:
img_paths = []

with open(r'img_paths.txt', 'r') as f:
    for line in f:
        img_paths.append(line[:-1])

In [28]:
batch_size = 50
size = 13170

# классифицируем тип комнаты
classes = ['living room', 'kitchen', 'bathroom', 'hallway', 
           'dining room', 'bedroom']
# вспомогательное сообщение, добавляется к началу класса
message = 'it is a '

defined_classes = np.empty(size, dtype="<U100")

In [29]:
texts = [message + clas for clas in classes]

for idx in range(0, len(img_paths), batch_size):
    stop = min(len(img_paths), idx+batch_size)
    preds = clip_model.predict(img_paths[idx:stop], texts)
    max_idxs = np.argmax(np.array(preds), axis=1)
    defined_classes[idx:stop] = [classes[max_idx] for max_idx in max_idxs]

OSError: unrecognized data stream contents when reading image file

In [31]:
defined_classes = defined_classes[0:13150]
np.save('defined_classes', defined_classes)

In [34]:
np.unique(defined_classes, return_counts=True)

(array(['bathroom', 'bedroom', 'dining room', 'hallway', 'kitchen',
        'living room'], dtype='<U100'),
 array([3091, 8593,  278,  487,  150,  551], dtype=int64))