In [3]:
import torch
import clip
from PIL import Image
import requests

# Zero-shot image classification demo: score one image against a few
# free-text labels with CLIP and print one probability per label.

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Candidate model names; the one passed to clip.load below comes from this list.
# ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
model, preprocess = clip.load("RN50x64", device=device)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Bound the request with a timeout, fail loudly on HTTP errors instead of
# handing an error page to PIL, and release the connection when done.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # Ask urllib3 to undo any Content-Encoding (e.g. gzip) on the raw stream
    # so PIL receives the actual image bytes.
    response.raw.decode_content = True
    image = Image.open(response.raw)
    # Image.open is lazy; force the pixel data to be read while the
    # response stream is still open.
    image.load()

# preprocess() turns the PIL image into a tensor; unsqueeze(0) adds a leading
# dimension of size 1 so the single image is passed as a batch.
image = preprocess(image).unsqueeze(0).to(device)
labels = ["quirky kittens on a couch", "chaotic remote controls", "a work of art"]
text = clip.tokenize(labels).to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    # These embeddings are not used below in this cell; they are kept because
    # they are notebook-level names that other cells may read.
    # NOTE(review): model(image, text) presumably encodes both inputs again
    # internally -- drop these two calls if nothing downstream needs them.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    # Softmax over the last dimension, then move to host memory as a NumPy array.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:")
# probs[0] is the row for the single input image; pair each value with its label.
for label, prob in zip(labels, probs[0]):
    print(f"\t{label}: {prob:.2f}")

Label probs:
	quirky kittens on a couch: 0.87
	chaotic remote controls: 0.13
	a work of art: 0.00
