In [1]:
import torch
import clip
from PIL import Image

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "ViT-B/32" CLIP checkpoint onto the chosen device. `clip.load`
# returns the model together with the callable used to preprocess input images.
# NOTE(review): the first call appears to download the weights — confirm a
# network connection / cache is available where this notebook runs.
model, preprocess = clip.load("ViT-B/32", device=device)

100%|███████████████████████████████████████| 338M/338M [00:05<00:00, 59.9MiB/s]


In [4]:
# Load the image to be encoded. Opening it in a `with` block closes the
# underlying file as soon as preprocessing has produced its output, instead of
# leaving the handle open for the lifetime of the kernel.
with Image.open("cat.jpg") as pil_image:
    # unsqueeze(0) adds a leading dimension of size 1 so the single image is
    # passed to the model as a batch; the result is then moved to `device`.
    image = preprocess(pil_image).unsqueeze(0).to(device)

# Load the text prompts to be encoded: one candidate label per entry. The
# tokenized prompts are moved to the same device as the image.
class_texts = ["a diagram", "a dog", "a cat"]
text = clip.tokenize(class_texts).to(device)

In [5]:
# Encode the image/s and text/s with autograd disabled: this is inference
# only, so no gradient bookkeeping is needed.
with torch.no_grad():
    image_features, text_features = (
        model.encode_image(image),
        model.encode_text(text),
    )

In [6]:
# Pick the (up to) 3 most similar labels for the image.

# Scale every feature vector to unit L2 norm along the last dimension so the
# matrix product below is a cosine similarity.
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

# Similarity of each image against each prompt, turned into a distribution
# over prompts by a softmax along the last dimension.
# NOTE(review): 100.0 is a fixed scale applied before the softmax — presumably
# it mirrors CLIP's learned logit scale; confirm against the model if it matters.
similarities = (100.0 * image_features @ text_features.T).softmax(dim=-1)

# Never request more entries than there are candidate labels: `topk` raises
# when k exceeds the size of the dimension (e.g. with only two prompts).
top_k = min(3, similarities.shape[-1])
probabilities, indices = similarities[0].topk(top_k)

In [8]:
# Report each selected label with its probability as a percentage,
# in the order returned by the top-k selection.
print("\nTop predictions:\n")
for probability, index in zip(probabilities, indices):
    label = class_texts[index]
    percent = 100 * probability.item()
    print(f"{label}: {percent:.2f}%")


Top predictions:

a cat: 99.41%
a dog: 0.49%
a diagram: 0.09%
