In [3]:
import torch
from PIL import Image
import clip
from transformers import AutoProcessor, AutoModel
import torch.nn.functional as F

# ====== 1. Load models ======
# Resolve the target device once so both models (and, later, their inputs)
# are guaranteed to agree on it.
device = "cuda" if torch.cuda.is_available() else "cpu"

clip_model, clip_preprocess = clip.load("ViT-B/16", device=device)
clip_model.eval()

# map_location lets a checkpoint that was saved from a GPU process be loaded
# on a CPU-only machine instead of failing while deserialising CUDA tensors.
clipseg_state = torch.load('../clipseg/weights/rd64-uni-refined.pth', map_location=device)

# strict=False skips every checkpoint key that has no counterpart in the model,
# so a checkpoint with entirely different parameter names loads "successfully"
# while changing nothing. Inspect the result to make that case visible.
# NOTE(review): this is a CLIPSeg checkpoint loaded into a plain CLIP model --
# confirm that the two actually share parameter names.
load_result = clip_model.load_state_dict(clipseg_state, strict=False)
if len(load_result.unexpected_keys) == len(clipseg_state):
    print(
        "Warning: no key of the CLIPSeg checkpoint matched the CLIP model; "
        "the CLIP weights are unchanged."
    )

siglip_model_path = "./siglip2"
siglip_model = AutoModel.from_pretrained(siglip_model_path).eval()
siglip_processor = AutoProcessor.from_pretrained(siglip_model_path)
siglip_model = siglip_model.to(device)

# ====== 2. Load image and text ======
image_path = "roses.jpg"
text = "Turn the white roses into red ones."

# Decode the file once and close its handle right away; convert() returns an
# in-memory RGB copy, so both preprocessors see exactly the same pixels.
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")

# CLIP preprocessing: add a leading batch axis to the preprocessed image and
# tokenize the prompt as a one-element batch.
clip_image = clip_preprocess(image).unsqueeze(0).to(device)
clip_text = clip.tokenize([text]).to(device)

# SigLIP preprocessing.
# The Hugging Face SigLIP / SigLIP2 documentation states that text must be
# padded with padding="max_length" (max_length=64) because the model was
# trained that way; without it the text embedding is not comparable to the
# image embedding.
# NOTE(review): assumes "./siglip2" holds a standard SigLIP2 checkpoint with a
# 64-token text context -- confirm against its config.
siglip_inputs = siglip_processor(
    images=image,
    text=[text],
    padding="max_length",
    max_length=64,
    return_tensors="pt",
).to(device)

# ====== 3. Forward pass ======
# Pure inference: gradient tracking is switched off for both models.
with torch.no_grad():
    # CLIP: one encoder call per modality (image first, then text).
    # Each result is expected to be (1, D) for the single image/text pair.
    clip_img_emb, clip_txt_emb = (
        clip_model.encode_image(clip_image),
        clip_model.encode_text(clip_text),
    )

    # SigLIP: a single joint call; the per-modality embeddings are read from
    # the returned output object.
    siglip_outputs = siglip_model(**siglip_inputs)
    siglip_img_emb, siglip_txt_emb = (
        siglip_outputs.image_embeds,
        siglip_outputs.text_embeds,
    )

# ====== 4. Results and Comparison ======
def cosine_sim(a, b):
    """Return the cosine similarity of two embeddings as a Python float.

    The similarity is computed along the last (feature) axis, the same axis
    this script normalises over (``dim=-1``). Both ``(1, D)`` batches and
    plain ``(D,)`` vectors are therefore accepted.

    Args:
        a: First embedding tensor.
        b: Second embedding tensor, broadcastable against ``a``.

    Returns:
        The similarity of the single embedding pair.

    Raises:
        An error from ``Tensor.item()`` if the inputs describe more than one
        pair, since the result is then not a single scalar.
    """
    return F.cosine_similarity(a, b, dim=-1).item()

# Unit-length copies of every embedding (L2-normalised over the feature axis).
clip_img_emb_norm, clip_txt_emb_norm = (
    F.normalize(emb, dim=-1) for emb in (clip_img_emb, clip_txt_emb)
)
siglip_img_emb_norm, siglip_txt_emb_norm = (
    F.normalize(emb, dim=-1) for emb in (siglip_img_emb, siglip_txt_emb)
)

# One row per model: section header, raw image/text embeddings (printed as-is)
# and their normalised counterparts (used for the similarity score).
report_rows = (
    (
        "\n===== CLIP 输出 =====",
        clip_img_emb,
        clip_txt_emb,
        clip_img_emb_norm,
        clip_txt_emb_norm,
    ),
    (
        "\n===== SigLIP 输出 =====",
        siglip_img_emb,
        siglip_txt_emb,
        siglip_img_emb_norm,
        siglip_txt_emb_norm,
    ),
)

for header, img_emb, txt_emb, img_unit, txt_unit in report_rows:
    print(header)
    # Only the first five components are shown as a quick sanity check.
    print("图像 embedding:", img_emb.squeeze().tolist()[:5], "...")
    print("文本 embedding:", txt_emb.squeeze().tolist()[:5], "...")
    print("图像 embedding 维度:", img_emb.shape)
    print("文本 embedding 维度:", txt_emb.shape)
    print("Embd matching Cosine Similarity:", cosine_sim(img_unit, txt_unit))



===== CLIP 输出 =====
图像 embedding: [-0.2666015625, -0.5107421875, 0.1229248046875, -0.4853515625, -0.2242431640625] ...
文本 embedding: [-0.493408203125, 0.1875, 0.1368408203125, 0.114501953125, -0.0006265640258789062] ...
图像 embedding 维度: torch.Size([1, 512])
文本 embedding 维度: torch.Size([1, 512])
Embd matching Cosine Similarity: 0.279541015625

===== SigLIP 输出 =====
图像 embedding: [0.010202853940427303, 0.018973883241415024, -0.03359675779938698, -0.0034157102927565575, 0.05803455784916878] ...
文本 embedding: [-0.04627535864710808, 0.013235348276793957, 0.015011128969490528, 0.022088630124926567, 0.30756810307502747] ...
图像 embedding 维度: torch.Size([1, 768])
文本 embedding 维度: torch.Size([1, 768])
Embd matching Cosine Similarity: 0.007766037248075008
