# In [None]:
import pickle
from tqdm import tqdm
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from torchmetrics.multimodal.clip_score import CLIPScore
import torch

# Load the dataset
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a testset file that comes from a trusted source.
with open('./dataset/testset.pkl', 'rb') as f:
    dataset = pickle.load(f)

# Path to the image folder
img_root = "./output/"

# Initialize the CLIPScore metric
metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")

# Reused converter: PIL image -> float tensor in [0, 1] with layout (C, H, W).
_to_unit_tensor = ToTensor()


def preprocess(image):
    """Convert a PIL image into a uint8 tensor of shape (C, H, W) in [0, 255].

    CLIPScore passes its image input through the CLIP processor of the chosen
    checkpoint, which already resizes, center-crops, rescales and normalizes.
    Applying Resize/CenterCrop/Normalize here as well would normalize the
    pixels twice and distort the score, so only the raw pixel values are
    handed over.
    """
    return (_to_unit_tensor(image) * 255).round().to(torch.uint8)


clip_score = 0.0   # Running sum of the per-image CLIP scores
num_images = 250   # Total number of images to process
num_scored = 0     # Images that were actually loaded and scored

# Loop through the dataset and compute CLIP scores
for idx in tqdm(range(num_images)):
    caption = dataset[idx]['textual_condition']
    img_path = img_root + str(idx) + '.png'

    try:
        # Load the image; the context manager releases the file handle as
        # soon as the pixel data has been copied by convert().
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = preprocess(image).unsqueeze(0)  # Add batch dimension

        # Compute the CLIP score
        score = metric(image_tensor, caption)
    except Exception as e:
        # Best effort: report the failure and keep going with the next image.
        print(f"Error processing image {img_path}: {e}")
        continue

    clip_score += score.detach().item()
    num_scored += 1

# Compute the average CLIP score over the images that were really scored, so
# skipped images do not silently pull the mean towards zero.
if num_scored:
    average_clip_score = clip_score / num_scored
    print(f"Average CLIP Score: {average_clip_score:.4f}")
else:
    average_clip_score = float("nan")
    print("Average CLIP Score: n/a (no image could be scored)")

if num_scored < num_images:
    print(f"Skipped {num_images - num_scored} of {num_images} images due to errors")

    