In [24]:
from qdrant_client import QdrantClient, models

# Address of the Qdrant server this notebook talks to.
QDRANT_URL = "http://localhost:6333/"

# Single shared client used by every cell below.
client = QdrantClient(url=QDRANT_URL)

In [None]:
# Install additional packages that might be required
!pip install --upgrade sentence-transformers
!pip install --upgrade transformers
!pip install pillow
!pip install torch torchvision torchaudio

# For Mac with MPS support
!ip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import numpy as np

# Choose the compute device: the MPS backend when PyTorch reports it as
# available, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the CLIP model and its matching processor from the same checkpoint,
# then move the model's weights onto the selected device.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
model.to(device)

# Captions in file order: the n-th caption belongs to images/image-n.png.
_captions = [
    "An image about plane emergency safety.",
    "An image about airplane components.",
    "An image about COVID safety restrictions.",
    "An confidential image about UFO sightings.",
    "An image about unusual footprints on Aralar 2011.",
]

# Each document pairs a caption with the path of the image it describes.
documents = [
    {"caption": caption, "image": f"images/image-{number}.png"}
    for number, caption in enumerate(_captions, start=1)
]

# Tokenize every caption as one padded, truncated batch and move the
# resulting tensors onto the same device as the model.
texts = [entry["caption"] for entry in documents]
text_inputs = {
    name: tensor.to(device)
    for name, tensor in processor(
        text=texts, return_tensors="pt", padding=True, truncation=True
    ).items()
}

# Encode the captions without tracking gradients, then divide each row by
# its own norm so every embedding has unit length.
with torch.no_grad():
    raw_text_features = model.get_text_features(**text_inputs)
    text_embeddings = raw_text_features / raw_text_features.norm(dim=-1, keepdim=True)

# Get image embeddings
try:
    # Open each file inside a context manager so its handle is released as
    # soon as the pixels have been copied into the converted RGB image.
    images = []
    for doc in documents:
        with Image.open(doc["image"]) as source_image:
            images.append(source_image.convert("RGB"))
except FileNotFoundError as e:
    print(f"Make sure your image files exist: {e}")
    # Re-raise after the hint: continuing would leave `image_embeddings`
    # undefined and `text_embeddings` as a tensor, so the cells that follow
    # would fail later with an unrelated, harder-to-read error.
    raise

# Preprocess the images as one batch and move the tensors to the model's device.
image_inputs = processor(images=images, return_tensors="pt")
image_inputs = {k: v.to(device) for k, v in image_inputs.items()}

# Encode without tracking gradients and scale every row to unit length.
with torch.no_grad():
    image_embeddings = model.get_image_features(**image_inputs)
    image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)

print(f"Text embeddings: {text_embeddings.shape}")
print(f"Image embeddings: {image_embeddings.shape}")

# Convert to numpy (moving to the CPU first, since the tensors may live on MPS)
text_embeddings = text_embeddings.cpu().numpy()
image_embeddings = image_embeddings.cpu().numpy()

    


Create a **Collection**

In [None]:
COLLECTION_NAME = "llama-multi"

# Create the collection only when it is missing, so re-running this cell
# leaves an existing collection untouched.
collection_missing = not client.collection_exists(COLLECTION_NAME)
if collection_missing:
    cosine = models.Distance.COSINE
    # Each named vector is sized from the first row of its embedding matrix.
    image_dim = len(image_embeddings[0])
    text_dim = len(text_embeddings[0])
    named_vectors = {
        "image": models.VectorParams(size=image_dim, distance=cosine),
        "text": models.VectorParams(size=text_dim, distance=cosine),
    }
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=named_vectors,
    )

Now let's upload our images with captions to the **Collection**. Each image with its caption will create a [Point](https://qdrant.tech/documentation/concepts/points/) in Qdrant.

In [42]:
# Upload one point per document. The point id is the document's position in
# `documents`, and the same position selects its text and image embedding.
client.upload_points(
    collection_name=COLLECTION_NAME,
    points=[
        models.PointStruct(
            id=idx,
            vector={
                # `.tolist()` turns each embedding row into a plain list of
                # Python floats, which is the form the point model expects;
                # NumPy float32 rows are not guaranteed to validate.
                "text": text_embeddings[idx].tolist(),
                "image": image_embeddings[idx].tolist(),
            },
            # The whole document (caption and image path) is stored as payload.
            payload=doc,
        )
        for idx, doc in enumerate(documents)
    ],
)

Let's see which image we get for the query "*Adventures on snow hills*".

In [None]:
from PIL import Image

# Embed the query text with the CLIP text encoder: tokenize it as a
# one-element batch and move the tensors to the model's device.
query_text = "Adventures on snow hills"
text_inputs = {
    name: tensor.to(device)
    for name, tensor in processor(
        text=[query_text], return_tensors="pt", padding=True, truncation=True
    ).items()
}

with torch.no_grad():
    query_features = model.get_text_features(**text_inputs)
    query_features = query_features / query_features.norm(dim=-1, keepdim=True)

# Take the first (and only) embedding of the batch as a NumPy vector.
find_image = query_features.cpu().numpy()[0]

# Search the "image" vectors with the text embedding, requesting only the
# "image" payload field, then open the file of the top hit.
best_match = client.query_points(
    collection_name=COLLECTION_NAME,
    query=find_image,
    using="image",
    with_payload=["image"],
    limit=1,
).points[0]
Image.open(best_match.payload['image'])

Let's also run the same query in Italian and compare the results.

In [None]:
# Same search as before, this time with the query phrased in Italian.
query_text = "Avventure sulle colline innevate"
tokenized_query = processor(
    text=[query_text], return_tensors="pt", padding=True, truncation=True
)
text_inputs = {field: value.to(device) for field, value in tokenized_query.items()}

with torch.no_grad():
    italian_features = model.get_text_features(**text_inputs)
    italian_features = italian_features / italian_features.norm(dim=-1, keepdim=True)

# Keep the single row of the batch as a NumPy vector.
query_embedding = italian_features.cpu().numpy()[0]

# Query the "image" vectors and open the image file of the closest point.
top_hit = client.query_points(
    collection_name=COLLECTION_NAME,
    query=query_embedding,
    using="image",
    with_payload=["image"],
    limit=1,
).points[0]
Image.open(top_hit.payload['image'])

Now let's do a reverse search for the following image:

In [None]:
# Show the image that is used as the query for the reverse search.
reverse_query_path = 'images/image-2.png'
Image.open(reverse_query_path)

In [None]:
# Fix: preprocess the image first
from PIL import Image

# Load the query image as RGB and preprocess it as a one-element batch on
# the model's device.
image = Image.open("images/image-2.png").convert('RGB')
preprocessed_image = processor(images=[image], return_tensors="pt")
image_inputs = {field: value.to(device) for field, value in preprocessed_image.items()}

# Encode the image without tracking gradients and scale it to unit length.
with torch.no_grad():
    image_features = model.get_image_features(**image_inputs)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

# Keep the single row of the batch as a NumPy vector.
query_embedding = image_features.cpu().numpy()[0]

# Reverse search: compare the image embedding against the "text" vectors and
# show the caption stored with the closest point.
closest_point = client.query_points(
    collection_name=COLLECTION_NAME,
    query=query_embedding,
    using="text",
    with_payload=["caption"],
    limit=1,
).points[0]
closest_point.payload['caption']