# In [None]:
# clip_zeroshot_basic.py
import io

import open_clip
import requests
import torch
from PIL import Image

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# 1) Load the model and its preprocessing transform.
#    The middle element of the returned tuple is not used by this script.
#    An alternative checkpoint is pretrained="laion2b_s34b_b79k".
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32",
    pretrained="openai",
)
# Move the weights to the target device and switch to inference mode.
model = model.to(DEVICE)
model = model.eval()
# The tokenizer is looked up by the same architecture name as the model.
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# 2) Demo image (URL) — may be replaced with a local path: Image.open("path/to.jpg")
url = "https://ultralytics.com/images/bus.jpg"
# Download the whole body with a timeout so a stalled server cannot hang the
# script, and fail loudly on HTTP errors instead of handing an error page to
# PIL. Reading `response.content` (rather than the raw socket stream) also
# applies any transfer content-decoding and releases the connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
img = Image.open(io.BytesIO(response.content)).convert("RGB")
# Add a leading batch dimension and move the tensor to the model's device.
image = preprocess(img).unsqueeze(0).to(DEVICE)   # [1,3,224,224]

# 3) Candidate text labels (Vietnamese).
# NOTE(review): the labels are Vietnamese while the prompt template is English;
# confirm the chosen checkpoint handles this mixed-language text well.
labels = [
    "con chó",
    "con mèo",
    "chiếc xe buýt",
    "ô tô",
    "xe máy",
    "con người",
]
# Light prompting: wrap every label in the template "a photo of a {label}".
prompts = list(map("a photo of a {}".format, labels))
# Tokenize all prompts in one batch, then move the result to the device.
text = tokenizer(prompts)
text = text.to(DEVICE)

# 4) Encode both modalities and score each label against the image.
# Gradients are not needed for inference; mixed precision is only enabled on CUDA.
with torch.no_grad(), torch.autocast(device_type=DEVICE, enabled=(DEVICE == "cuda")):
    image_features = model.encode_image(image)       # [1, D]
    text_features = model.encode_text(text)          # [L, D]
    # L2-normalise so the dot product below is a cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    # Cosine similarities lie in [-1, 1]; a softmax over them directly is
    # almost uniform. Scale by the model's learned temperature first
    # (the open_clip README uses the equivalent fixed factor 100.0).
    logit_scale = model.logit_scale.exp()
    logits = (logit_scale * image_features @ text_features.T).squeeze(0)  # [L]

# Softmax in float32: under CUDA autocast the logits may be half precision.
probs = logits.float().softmax(dim=-1)
# Never request more entries than there are labels (topk raises otherwise).
k = min(5, len(labels))
topk = probs.topk(k)
print(f"\nTop-{k} dự đoán:")
for p, i in zip(topk.values.tolist(), topk.indices.tolist()):
    print(f"- {labels[i]}: {p:.3f}")
