# CLIP TESTING

This file is used to test the vanilla and fine tuned CLIP models under different BLIP configurations. The main usage was to check if the fine tuning was correctly boosting the performance of the base model.

Originally just two set of cards were being used for this testing, the one used in the BLIP fine tuning (original set of cards) and the other one not used (found inside this repo: https://github.com/jminuscula/dixit-online/tree/master/cards). After a while, I got access to a new set of cards (Dixit Odissey edition) and I've added also those.

In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel

from PIL import Image
import os
from tqdm import tqdm
import random

import torch

random.seed(42)
torch.manual_seed(42)

device = "cuda" if torch.cuda.is_available else "cpu"

blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

blip_model.eval()
clip_model.eval()

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [2]:
def get_cards(path):
    images = dict()
    for image in os.listdir(path):
        raw_image = Image.open(os.path.join(path, image)).convert("RGB")
        raw_image = raw_image.resize((224,224))
        images[image] = raw_image

    return images

In [3]:
def get_captions(images, blip_processor, blip_model):
    dictionary = dict()
    for image_index, raw_image in images.items():
        inputs = blip_processor(raw_image, return_tensors="pt").to("cuda")

        with torch.no_grad():
            caption_ids = blip_model.generate(
                    **inputs,
                    max_length=50,  
                    num_return_sequences=1,
                    do_sample=True, 
                    top_k=50,       
                    top_p=0.95,     
                    temperature=0.7,
                    repetition_penalty=1.2 
                )

        caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)

        dictionary[image_index] = caption

    return dictionary

In [4]:
def check_corrects(images, dictionary, clip_processor, clip_model):
    corrects = 0
    for image_name, caption in dictionary.items():
        to_be_compared = [image_name]
        
        good_sampling = False
        while not good_sampling:
            to_be_added = random.sample(list(images.keys()), 4)
            if not (image_name in to_be_added):
                good_sampling = True
                to_be_compared.extend(to_be_added)

        images_names = to_be_compared
        to_be_compared = [images[x] for x in to_be_compared]

        inputs = clip_processor(text=caption, images=to_be_compared, return_tensors="pt", padding="max_length", truncation=True).to(device)

        with torch.no_grad():
            outputs = clip_model(**inputs)
        
        logits_per_image = outputs.logits_per_image

        probs_per_image = logits_per_image.softmax(dim=0).squeeze()
        max_score_idx = torch.argmax(probs_per_image).item()
        best_image = images_names[max_score_idx]

        if best_image == image_name:
            corrects += 1
    return corrects / len(dictionary)

In [5]:
def check_accuracy_over_cards(path, blip_processor, blip_model, clip_processor, clip_model):
    images = get_cards(path)
    dictionary = get_captions(images, blip_processor, blip_model)
    accuracy = check_corrects(images, dictionary, clip_processor, clip_model)

    return accuracy

In [6]:
def compare_accuracies(blip_processor, blip_model, clip_processor, clip_model, path1 = "../cards/original_cards", path2 = "../cards/online_cards", clip_weights_path = None, blip_weights_path = None, num_iterations = 5):

    if clip_weights_path != None:
        clip_model.load_state_dict(torch.load(clip_weights_path))

    if blip_weights_path != None:
        blip_model.load_state_dict(torch.load(blip_weights_path))

    tot_first_accuracy = []
    tot_second_accuracy = []
    for _ in tqdm(range(num_iterations)):
        first_cards_accuracy = check_accuracy_over_cards(path1, blip_processor, blip_model, clip_processor, clip_model)
        second_cards_accuracy = check_accuracy_over_cards(path2, blip_processor, blip_model, clip_processor, clip_model)

        tot_first_accuracy.append(first_cards_accuracy)
        tot_second_accuracy.append(second_cards_accuracy)

    print(f"First card set: {sum(tot_first_accuracy)/len(tot_first_accuracy)}")
    print(f"Second card set: {sum(tot_second_accuracy)/len(tot_second_accuracy)}")

In [7]:
def compare_accuracies_all(blip_processor, blip_model, clip_processor, clip_model, path1 = "../cards/original_cards", path2 = "../cards/online_cards", path3="../cards/odissey_cards", clip_weights_path = None, blip_weights_path = None, num_iterations = 5):
    if clip_weights_path != None:
        clip_model.load_state_dict(torch.load(clip_weights_path))

    if blip_weights_path != None:
        blip_model.load_state_dict(torch.load(blip_weights_path))

    tot_first_accuracy = []
    tot_second_accuracy = []
    tot_third_accuracy = []
    for _ in tqdm(range(num_iterations)):
        first_cards_accuracy = check_accuracy_over_cards(path1, blip_processor, blip_model, clip_processor, clip_model)
        second_cards_accuracy = check_accuracy_over_cards(path2, blip_processor, blip_model, clip_processor, clip_model)
        third_cards_accuracy = check_accuracy_over_cards(path3, blip_processor, blip_model, clip_processor, clip_model)

        tot_first_accuracy.append(first_cards_accuracy)
        tot_second_accuracy.append(second_cards_accuracy)
        tot_third_accuracy.append(third_cards_accuracy)

    print(f"First card set: {sum(tot_first_accuracy)/len(tot_first_accuracy)}")
    print(f"Second card set: {sum(tot_second_accuracy)/len(tot_second_accuracy)}")
    print(f"Third card set: {sum(tot_third_accuracy)/len(tot_third_accuracy)}")

## How testing is performed

The models are tested on different card sets for $n$ iterations (five by default). The number printed beside the names of the card sets is the average accuracy over the iterations of the CLIP model. Captions to be used with images for the CLIP model are extracted using the fine tuned (or base) Blip model.

Base Blip model with base CLIP model

In [8]:
compare_accuracies(blip_processor, blip_model, clip_processor, clip_model)

100%|██████████| 5/5 [05:38<00:00, 67.61s/it]

First card set: 0.9428571428571428
Second card set: 0.9299999999999999





Fine tuned Blip model on first set of rephrased captions and base CLIP

In [9]:
compare_accuracies(blip_processor, blip_model, clip_processor, clip_model, blip_weights_path="../weights/rephrased_blip/epoch35.pt")

100%|██████████| 5/5 [10:19<00:00, 123.89s/it]

First card set: 0.7904761904761906
Second card set: 0.778





Fine tuned Blip model on first set of rephrased captions. Fine tuned CLIP model using fine tuned Blip on COCO dataset

In [10]:
#clip patch16
compare_accuracies(blip_processor, blip_model, clip_processor, clip_model, blip_weights_path="../weights/rephrased_blip/epoch35.pt", clip_weights_path="../weights/rephrased_coco_clip/epoch14.pt")

100%|██████████| 5/5 [10:07<00:00, 121.55s/it]

First card set: 0.8619047619047621
Second card set: 0.7979999999999999





Fine tuned Blip model on second set of rephrased captions with base CLIP

In [11]:
#clip patch16
compare_accuracies_all(blip_processor, blip_model, clip_processor, clip_model, blip_weights_path="../weights/rephrased_blip(2nd)/epoch50.pt")

100%|██████████| 5/5 [09:01<00:00, 108.25s/it]

First card set: 0.45714285714285713
Second card set: 0.48200000000000004
Third card set: 0.37857142857142856





Fine tuned Blip model on second set of reprhased captions. Fine tuned CLIP model using fine tuned Blip on COCO dataset

In [12]:
#clip patch16
compare_accuracies_all(blip_processor, blip_model, clip_processor, clip_model, blip_weights_path="../weights/rephrased_blip(2nd)/epoch50.pt", clip_weights_path="../weights/rephrased_coco_clip(2nd)/epoch13.pt")

100%|██████████| 5/5 [09:12<00:00, 110.46s/it]

First card set: 0.488095238095238
Second card set: 0.512
Third card set: 0.41904761904761906



