# SigLIP Baseline

In [25]:
import dataset as ds
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel
import torch
from torch.utils.data import DataLoader
from torchvision import transforms as tt
import numpy as np

In [26]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [27]:
model = AutoModel.from_pretrained("openai/clip-vit-base-patch32").to(device).eval()
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")



In [28]:
config = model.config.to_dict()

print('** LOAD MODEL ** ')
print(f'\tDevice: {device} ({torch.cuda.device_count()} gpus)')
print(f"\tModel parameters: {np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print(f"\tInput resolution: {config['vision_config']['image_size']}")
print(f"\tContext length: {config['text_config']['max_position_embeddings']}")
print(f"\tVocab size: {config['text_config']['vocab_size']}")

** LOAD MODEL ** 
	Device: cuda (1 gpus)
	Model parameters: 151,277,313
	Input resolution: 224
	Context length: 77
	Vocab size: 49408


In [29]:
scale = tt.Resize((224, 224))
tensor = tt.ToTensor()
image_composed = tt.transforms.Compose([tensor])

test_set = ds.VisualWSDDataset(mode="test", image_transform=image_composed)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

In [30]:
# if k is 1 gives all instances with the correct prediction as top prediction
# if k > 1 the correct prediction is in the top k predictions of the model
def hit(results, k):
    counter = 0

    for r in results:
        sims = np.absolute(r[1])
        sorted = np.argsort(sims)[:k]

        if r[0] in sorted:
            counter += 1

    return counter / len(results)

def mrr(results):
    sum = 0

    for r in results:
        sims = np.absolute(r[1])
        sorted = np.argsort(sims)
        sum += 1/(np.where(sorted==r[0])[0][0]+1)

    return sum / len(results)

In [31]:
# if k is 1 gives all instances with the correct prediction as top prediction
# if k > 1 the correct prediction is in the top k predictions of the model
def hit_max(results, k):
    counter = 0

    for r in results:
        sims = np.absolute(r[1])
        sorted = np.argsort(sims)[::-1][:k]

        if r[0] in sorted:
            counter += 1

    return counter / len(results)

def mrr_max(results):
    sum = 0

    for r in results:
        sims = np.absolute(r[1])
        sorted = np.argsort(sims)[::-1]
        sum += 1/(np.where(sorted==r[0])[0][0]+1)

    return sum / len(results)

In [32]:
cos_distance = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
a = torch.randn(1, 10)
cos_distance(a, a)

tensor([1.])

In [33]:
def test():
    #correct = 0
    #total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():

        results = []
        correct = 0
        max_correct = 0

        for a,data in enumerate(test_loader):
            images = data["imgs"]
            text = data["label_context"][0]
            correct_idx = data["correct_idx"].item()


            print("----------------------------")
            print("batch: " + str(a+1) + "/" + str(len(test_loader)))
            print("label: " + str(text))
            print("correct index: " + str(correct_idx))

            input_text = processor(text=text, padding="max_length", return_tensors="pt")

            input_imgs = []
            for img in images:
                input_imgs.append(processor(images=img, padding="max_length", return_tensors="pt"))

            img_output = []
            for img in input_imgs:
                img_output.append(model.get_image_features(**img.to(device)))

            text_output = model.get_text_features(**input_text.to(device))

            cos_distance = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

            print("text output shape: " + str(text_output.shape))
            print("img output shape: " + str(img_output[0].shape))
            #print(text_output)
            #print(img_output[0])
            
            sims = []
            for i in range(len(img_output)):
                sims.append(cos_distance(text_output, img_output[i]).cpu().item())
                #print(sims[-1]*100)

            sims = np.absolute(np.array(sims))
            min_idx = np.argmin(sims)
            max_idx = np.argmax(sims)

            print("sims: " + str(sims))
            print("min val: " + str(sims[min_idx]) + " min index: " + str(min_idx))
            print("max val: " + str(sims[max_idx]) + " max index: " + str(max_idx))

            results.append((correct_idx, sims))

            if min_idx == correct_idx:
                correct += 1

            if max_idx == correct_idx:
                max_correct += 1

            #if a > 49:
            #    break

    print(correct)
    print(max_correct)

    return results

In [34]:
results = test()

----------------------------
batch: 1/463
label: football goal
correct index: 8
text output shape: torch.Size([1, 512])
img output shape: torch.Size([1, 512])
sims: [0.25042769 0.25297874 0.25210094 0.25395525 0.25409392 0.25466567
 0.2522065  0.25364503 0.25360468 0.24799681]
min val: 0.24799680709838867 min index: 9
max val: 0.25466567277908325 max index: 5
----------------------------
batch: 2/463
label: mustard seed
correct index: 0
text output shape: torch.Size([1, 512])
img output shape: torch.Size([1, 512])
sims: [0.2267164  0.22427182 0.2270903  0.22655883 0.22513828 0.22576922
 0.22583589 0.2246207  0.22539225 0.22162765]
min val: 0.22162765264511108 min index: 9
max val: 0.22709029912948608 max index: 2
----------------------------
batch: 3/463
label: eating seat
correct index: 5
text output shape: torch.Size([1, 512])
img output shape: torch.Size([1, 512])
sims: [0.21670344 0.21864194 0.21746886 0.21774925 0.21894921 0.21950743
 0.21844622 0.21787637 0.21923448 0.2193229 ]
m

In [35]:
print("hit1: " + str(hit(results, 1)))
print("mrr: " + str(mrr(results)))
# mit absolutbetrag
# mit richtiger Bildgröße

hit1: 0.11764705882352941
mrr: 0.28078120136943663


In [36]:
print("hit1: " + str(hit_max(results, 1)))
print("mrr: " + str(mrr_max(results)))

hit1: 0.21568627450980393
mrr: 0.3932384064737005


In [37]:
# calcualte MRR
def mrr(results):

    rr = 0

    for r in results:
        correct_idx = r[0]
        sims = r[1]
        sim_correct = sims[correct_idx]
        sim_sort = np.sort(sims)
        new_idx = np.where(sim_sort == sim_correct)
        rr += 1/(new_idx+1)

    return 1/len(results)*mrr

# calculate HIT@1
def hit1(results):
    correct = 0
    for r in results:
        correct_idx = r[0]
        sims = r[1]
        if correct_idx == np.argmin(sims):
            correct += 1

    return correct/len(results)

print(hit1(results))
print(mrr(results))

0.11764705882352941


TypeError: can only concatenate tuple (not "int") to tuple

In [None]:




url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

texts = ["a photo of 2 cats", "a photo of 2 dogs"]
# important: we pass `padding=max_length` since the model was trained with this
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image) # these are the probabilities
print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")

31.9% that image 0 is 'a photo of 2 cats'
