In [13]:
import torch
import clip
from PIL import Image
import os
from tqdm import tqdm

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [15]:
@torch.no_grad()
def image_to_vec(image_path: str) -> torch.Tensor:
    img = preprocess(Image.open(image_path).convert("RGB")).unsqueeze(0).to(device)
    feat = model.encode_image(img)              # shape: [1, d]
    feat = feat / feat.norm(dim=-1, keepdim=True)  # normalize for cosine similarity
    return feat.squeeze(0).cpu()      

In [16]:
dataset_path = "dataset"
img_paths = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path)]
print(img_paths)
len(img_paths)

['dataset/9516.png', 'dataset/5663.png', 'dataset/5838.png', 'dataset/1609.png', 'dataset/9009.png', 'dataset/4187.png', 'dataset/1548.png', 'dataset/6197.png', 'dataset/9437.png', 'dataset/2280.png', 'dataset/8728.png', 'dataset/2711.png', 'dataset/2952.png', 'dataset/1690.png', 'dataset/2841.png', 'dataset/5084.png', 'dataset/3898.png', 'dataset/7059.png', 'dataset/7835.png', 'dataset/6506.png', 'dataset/1353.png', 'dataset/4381.png', 'dataset/0003.png', 'dataset/3615.png', 'dataset/7143.png', 'dataset/5804.png', 'dataset/2269.png', 'dataset/4582.png', 'dataset/5364.png', 'dataset/3280.png', 'dataset/4924.png', 'dataset/2385.png', 'dataset/4376.png', 'dataset/2500.png', 'dataset/0273.png', 'dataset/1648.png', 'dataset/8160.png', 'dataset/8506.png', 'dataset/8878.png', 'dataset/1067.png', 'dataset/8841.png', 'dataset/5196.png', 'dataset/6694.png', 'dataset/1730.png', 'dataset/5844.png', 'dataset/7833.png', 'dataset/5417.png', 'dataset/7999.png', 'dataset/8393.png', 'dataset/0484.png',

9605

In [17]:
vecs = torch.stack([image_to_vec(p) for p in tqdm(img_paths, desc="Embedding images")])

Embedding images:   0%|          | 23/9605 [00:08<56:35,  2.82it/s] 


KeyboardInterrupt: 