In [18]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import os

# Choose the compute device up front: CUDA when torch reports it, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint identifier shared by the model weights and the matching preprocessor
model_name = "openai/clip-vit-base-patch32"

# The processor prepares text and image inputs for the model loaded below
processor = CLIPProcessor.from_pretrained(model_name)

# Load the CLIP weights and move them straight onto the selected device
model = CLIPModel.from_pretrained(model_name).to(device)

# Ask the user for a free-text description of the image they want
query = input("What are you looking for? ").strip()
if not query:
    # An empty prompt would still be embedded and "match" some arbitrary image,
    # so stop early instead of returning a meaningless result.
    print("No query entered.")
    raise SystemExit(1)

# Folder (relative to the current working directory) holding candidate images
gallery_dir = "gallery"
if not os.path.isdir(gallery_dir):
    # Give a readable message instead of a raw FileNotFoundError from os.listdir
    print(f"Gallery folder not found: {gallery_dir}")
    raise SystemExit(1)

# Parallel lists: gallery_filenames[i] is the path gallery_images[i] was read from
gallery_filenames = []
gallery_images = []

# os.listdir returns entries in arbitrary order; sorting keeps runs reproducible
for filename in sorted(os.listdir(gallery_dir)):
    if not filename.lower().endswith((".jpg", ".jpeg", ".png", ".webp")):
        continue
    path = os.path.join(gallery_dir, filename)
    try:
        # The context manager closes the underlying file handle right away;
        # convert() returns a separate, fully loaded image that stays usable.
        with Image.open(path) as raw_image:
            img = raw_image.convert("RGB")
    except Exception as e:
        # Deliberately broad: one unreadable or corrupt file must not abort the
        # whole scan, so report it and move on to the next entry.
        print(f"Could not open {filename}: {e}")
        continue
    gallery_images.append(img)
    gallery_filenames.append(path)

# Nothing to rank against: stop here rather than failing later in the encoder
if not gallery_images:
    print("No valid images found in the gallery folder.")
    # Raise SystemExit directly: the exit() helper is injected by the site
    # module (not guaranteed to exist) and shuts the kernel down in a notebook.
    raise SystemExit(1)

# Encode the text query.
# truncation=True clips over-long queries to the tokenizer's maximum length;
# without it a long prompt exceeds the text encoder's fixed context window and
# the forward pass fails.
text_inputs = processor(
    text=[query], return_tensors="pt", padding=True, truncation=True
).to(device)
with torch.no_grad():
    text_features = model.get_text_features(**text_inputs)
    # L2-normalise so the dot product below is a cosine similarity
    text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

# Encode the gallery in fixed-size batches so peak memory stays bounded no
# matter how many images were loaded (a single giant batch can exhaust memory).
image_batch_size = 32
image_feature_chunks = []
with torch.no_grad():
    for start in range(0, len(gallery_images), image_batch_size):
        image_inputs = processor(
            images=gallery_images[start:start + image_batch_size],
            return_tensors="pt",
        ).to(device)
        chunk_features = model.get_image_features(**image_inputs)
        image_feature_chunks.append(
            chunk_features / chunk_features.norm(p=2, dim=-1, keepdim=True)
        )
# One row per gallery image, in the same order as gallery_images
image_features = torch.cat(image_feature_chunks, dim=0)

# Compute cosine similarity between the query and every image: one row for the
# single query, one column per gallery image
similarities = torch.matmul(text_features, image_features.T)

# Find top match; argmax over the single-row matrix is the gallery index
best_index = similarities.argmax().item()
best_filename = gallery_filenames[best_index]
best_image = gallery_images[best_index]

# Print the result as well: the title passed to show() is only honoured by
# some image viewers, so the filename could otherwise go unseen.
best_score = similarities[0, best_index].item()
print(f"Best match: {os.path.basename(best_filename)} (similarity {best_score:.3f})")
best_image.show(title=f"Best match: {os.path.basename(best_filename)}")



What are you looking for?  guitar
