In [4]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel, AutoTokenizer
import os
from deep_translator import GoogleTranslator
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Checkpoint shared by the model, the processor and the tokenizer.
MODEL_NAME = "google/siglip2-base-patch16-224"

# Load the weights in full precision on the CPU, using the SDPA attention backend.
model = AutoModel.from_pretrained(
    MODEL_NAME,
    dtype=torch.float32,
    device_map="cpu",
    attn_implementation="sdpa",
)

# The processor is used below for both image and text preprocessing.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
# Polish -> English translator; reused later for the user's search query.
translator = GoogleTranslator(source='pl', target="en")

# Smoke test: translate one sample caption and show the result.
text1 = "stek pocięty na kawałki z solą i rozmarynem na desce drewnianej"
text1_en = translator.translate(text1)
print(text1_en)

steak cut into pieces with salt and rosemary on a wooden board


In [7]:
# Embed every readable image found in `image_dir` with the model's image encoder.
# Produces:
#   file_names    - names of the files that were embedded successfully
#   vector_images - one embedding row per entry of `file_names`, in the same order
image_dir = 'photos'
# os.listdir returns entries in arbitrary order; sorting keeps the
# row <-> file mapping stable from one run to the next.
images = sorted(os.listdir(image_dir))
list_images = []
file_names = []
for file_path in images:
    path = os.path.join(image_dir, file_path)

    try:
        # The context manager releases the file handle as soon as the pixels
        # are decoded; convert() loads the data and returns an in-memory copy,
        # so the result stays usable after the file is closed.
        with Image.open(path) as image_file:
            image = image_file.convert("RGB")
        inputs = processor(images=image, return_tensors="pt")

        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
    except Exception as e:
        # Best effort: skip anything that cannot be read as an image
        # (non-image files, sub-directories, corrupt data) and keep going.
        print(f"Pominięto {file_path}: {e}")
        continue

    # Append both lists together, outside the try, so they can never go out of sync.
    list_images.append(image_features)
    file_names.append(file_path)

# torch.cat fails with an opaque error on an empty list; report the real cause.
if not list_images:
    raise RuntimeError(f"Nie znaleziono żadnego poprawnego obrazu w katalogu '{image_dir}'")

vector_images = torch.cat(list_images, dim=0)
print(vector_images.shape)

torch.Size([11, 768])


In [8]:
# Read a Polish search query, translate it to English and embed it with the
# model's text encoder. Produces `text_input_en` (the final prompt) and
# `text_features` (its embedding).

# NOTE(review): `tokenizer` is not used in this cell (tokenization goes through
# `processor`); kept because other cells may reference it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Strip so that a whitespace-only answer is treated as "nothing entered".
text_input = input("Opisz jakiego zdjęcia szukasz").strip()
if not text_input:
    raise ValueError("Zapytanie nie zostało wpisane")
text_input_en = translator.translate(text_input)
print(text_input_en)
text_input_en = f"A photo of a {text_input_en}"
# SigLIP2 was trained with text padded to a fixed length of 64 tokens, so the
# length is pinned explicitly; truncation keeps over-long queries within it.
input_token = processor(
    text=text_input_en,
    padding="max_length",
    max_length=64,
    truncation=True,
    return_tensors="pt",
)


with torch.no_grad():
    text_features = model.get_text_features(**input_token)

print(text_features.shape)

dachshund
torch.Size([1, 768])


In [9]:
# Score the text query against every image embedding.
# `model.logit_scale` and `model.logit_bias` are used in the expression below;
# without no_grad the result is attached to the autograd graph (it prints with a
# grad_fn), which is unnecessary for inference.
with torch.no_grad():
    # L2-normalize along the feature axis so the dot product is cosine similarity.
    text_features_normalized = F.normalize(text_features, dim=-1)
    image_features_normalized = F.normalize(vector_images, dim=-1)

    # One row per query, one column per image.
    similarity = text_features_normalized @ image_features_normalized.T
    # Pairwise sigmoid score: scale and shift the similarity, then squash.
    probs = torch.sigmoid((similarity * model.logit_scale.exp()) + model.logit_bias)
print(probs)

tensor([[8.4971e-08, 1.5968e-08, 5.6683e-07, 1.9511e-08, 4.1677e-04, 9.1449e-07,
         9.0049e-05, 7.5863e-02, 2.3419e-04, 4.2589e-04, 8.3641e-09]],
       grad_fn=<SigmoidBackward0>)


In [None]:
# Diagnostic: print the tensor type string of both normalized embedding
# matrices (text first, then images).
for normalized_embeddings in (text_features_normalized, image_features_normalized):
    print(normalized_embeddings.type())

torch.FloatTensor
<built-in method type of Tensor object at 0x000001FAFA4004D0>


In [11]:
# Show the best-matching images for the current query, best first.
print(text_input_en)

number_of_photos = 4

# torch.topk raises when k exceeds the number of candidates, so clamp it to
# the number of image columns in `similarity`.
top_k = min(number_of_photos, similarity.shape[-1])
prob, idx = torch.topk(similarity, k=top_k)
top_indices = idx[0].tolist()
top_probs = prob[0].tolist()

for rank, (idx_act, score) in enumerate(zip(top_indices, top_probs), start=1):
    # NOTE(review): `score` is the cosine similarity, so the percentage below
    # is similarity * 100, not the sigmoid probability stored in `probs`.
    score_pct = score * 100  # convert to percent

    path_top_file = file_names[idx_act]
    path_of_file = os.path.join(image_dir, path_top_file)

    print(f"#{rank}: {path_top_file} (Pewność: {score_pct:.2f}%)")

    try:
        # Context manager closes the file handle once the viewer was launched.
        with Image.open(path_of_file) as image:
            image.show()
    except Exception as e:
        # Keep going with the remaining matches, but say why this one failed
        # (a bare `except:` would also swallow KeyboardInterrupt).
        print(f"Nie można wyświetlić pliku: {e}")

A photo of a dachshund
#1: pobrane (1).jpg (Pewność: 12.67%)
#2: pobrane.jpg (Pewność: 8.00%)
#3: images (4).jpg (Pewność: 7.98%)
#4: pobrane (2).jpg (Pewność: 7.47%)
