In [None]:
import os
import torch
import numpy as np
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor
from torchmetrics.multimodal import CLIPScore

# Run everything on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# BLIP-2 captioner: processor and weights are loaded from the same pinned revision
blip2_processor = Blip2Processor.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    revision="51572668da0eb669e01a189dc22abe6088589a24",
)
blip2_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    revision="51572668da0eb669e01a189dc22abe6088589a24",
)
blip2_model = blip2_model.to(device)

# CLIPScore metric used to rate each image against its generated caption
clip_score_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
clip_score_metric = clip_score_metric.to(device)


2024-11-28 03:09:37.441657: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 03:09:37.474051: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-28 03:09:37.474083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-28 03:09:37.474882: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 03:09:37.480412: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


Average CLIP Score: 29.861747904979822


In [None]:

# Directory containing PNG images
image_dir = "logs/upsample-featup/log-dinov2-snr10/50/output"
# os.listdir returns entries in arbitrary order; sorting makes every run
# process the files in the same sequence.
image_paths = sorted(
    os.path.join(image_dir, file_name)
    for file_name in os.listdir(image_dir)
    if file_name.endswith('.png')
)

# Function to compute CLIP score for an image
def compute_clip_score(image_path):
    """Caption one image with BLIP-2 and score the image/caption pair with CLIPScore.

    Relies on the notebook-level ``device``, ``blip2_processor``,
    ``blip2_model`` and ``clip_score_metric`` objects.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        float: The value returned by ``clip_score_metric`` for this single
        image and its generated caption.
    """
    # The context manager closes the file handle even if decoding fails;
    # convert() returns a separate image that stays valid after the close.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # HWC pixel array -> CHW, then add a batch dimension: (1, C, H, W) float tensor
    image_tensor = np.array(image).transpose(2, 0, 1)
    image_tensor = torch.tensor(image_tensor).unsqueeze(0).float().to(device)

    # Generate caption using BLIP-2 model
    inputs = blip2_processor(images=image, return_tensors="pt").to(device)
    caption_ids = blip2_model.generate(**inputs)
    caption = blip2_processor.decode(caption_ids[0], skip_special_tokens=True)

    # The metric is called with a list of captions, one per image in the batch
    clip_score = clip_score_metric(image_tensor, [caption])
    return clip_score.detach().item()

# Calculate CLIP scores for all images and compute the average.
# compute_clip_score returns a float or raises, so every result is kept;
# no None filtering is needed.
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    clip_scores.append(clip_score)

if clip_scores:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")
else:
    # Reached only when image_paths is empty
    print("No valid clip scores computed.")


In [2]:

# Directory containing PNG images
image_dir = "logs/upsample-featup/log-dinov2-snr30/50/output"
# os.listdir returns entries in arbitrary order; sorting makes every run
# process the files in the same sequence.
image_paths = sorted(
    os.path.join(image_dir, file_name)
    for file_name in os.listdir(image_dir)
    if file_name.endswith('.png')
)

# Function to compute CLIP score for an image
def compute_clip_score(image_path):
    """Caption one image with BLIP-2 and score the image/caption pair with CLIPScore.

    Relies on the notebook-level ``device``, ``blip2_processor``,
    ``blip2_model`` and ``clip_score_metric`` objects.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        float: The value returned by ``clip_score_metric`` for this single
        image and its generated caption.
    """
    # The context manager closes the file handle even if decoding fails;
    # convert() returns a separate image that stays valid after the close.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # HWC pixel array -> CHW, then add a batch dimension: (1, C, H, W) float tensor
    image_tensor = np.array(image).transpose(2, 0, 1)
    image_tensor = torch.tensor(image_tensor).unsqueeze(0).float().to(device)

    # Generate caption using BLIP-2 model
    inputs = blip2_processor(images=image, return_tensors="pt").to(device)
    caption_ids = blip2_model.generate(**inputs)
    caption = blip2_processor.decode(caption_ids[0], skip_special_tokens=True)

    # The metric is called with a list of captions, one per image in the batch
    clip_score = clip_score_metric(image_tensor, [caption])
    return clip_score.detach().item()

# Calculate CLIP scores for all images and compute the average.
# compute_clip_score returns a float or raises, so every result is kept;
# no None filtering is needed.
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    clip_scores.append(clip_score)

if clip_scores:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")
else:
    # Reached only when image_paths is empty
    print("No valid clip scores computed.")


Average CLIP Score: 30.041159531564425


In [3]:

# Directory containing PNG images
image_dir = "logs/upsample-featup/log-vit-snr10/50/output"
# os.listdir returns entries in arbitrary order; sorting makes every run
# process the files in the same sequence.
image_paths = sorted(
    os.path.join(image_dir, file_name)
    for file_name in os.listdir(image_dir)
    if file_name.endswith('.png')
)

# Function to compute CLIP score for an image
def compute_clip_score(image_path):
    """Caption one image with BLIP-2 and score the image/caption pair with CLIPScore.

    Relies on the notebook-level ``device``, ``blip2_processor``,
    ``blip2_model`` and ``clip_score_metric`` objects.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        float: The value returned by ``clip_score_metric`` for this single
        image and its generated caption.
    """
    # The context manager closes the file handle even if decoding fails;
    # convert() returns a separate image that stays valid after the close.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # HWC pixel array -> CHW, then add a batch dimension: (1, C, H, W) float tensor
    image_tensor = np.array(image).transpose(2, 0, 1)
    image_tensor = torch.tensor(image_tensor).unsqueeze(0).float().to(device)

    # Generate caption using BLIP-2 model
    inputs = blip2_processor(images=image, return_tensors="pt").to(device)
    caption_ids = blip2_model.generate(**inputs)
    caption = blip2_processor.decode(caption_ids[0], skip_special_tokens=True)

    # The metric is called with a list of captions, one per image in the batch
    clip_score = clip_score_metric(image_tensor, [caption])
    return clip_score.detach().item()

# Calculate CLIP scores for all images and compute the average.
# compute_clip_score returns a float or raises, so every result is kept;
# no None filtering is needed.
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    clip_scores.append(clip_score)

if clip_scores:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")
else:
    # Reached only when image_paths is empty
    print("No valid clip scores computed.")


Average CLIP Score: 29.974704021396057


In [4]:

# Directory containing PNG images
image_dir = "logs/upsample-featup/log-vit-snr30/50/output"
# os.listdir returns entries in arbitrary order; sorting makes every run
# process the files in the same sequence.
image_paths = sorted(
    os.path.join(image_dir, file_name)
    for file_name in os.listdir(image_dir)
    if file_name.endswith('.png')
)

# Function to compute CLIP score for an image
def compute_clip_score(image_path):
    """Caption one image with BLIP-2 and score the image/caption pair with CLIPScore.

    Relies on the notebook-level ``device``, ``blip2_processor``,
    ``blip2_model`` and ``clip_score_metric`` objects.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        float: The value returned by ``clip_score_metric`` for this single
        image and its generated caption.
    """
    # The context manager closes the file handle even if decoding fails;
    # convert() returns a separate image that stays valid after the close.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # HWC pixel array -> CHW, then add a batch dimension: (1, C, H, W) float tensor
    image_tensor = np.array(image).transpose(2, 0, 1)
    image_tensor = torch.tensor(image_tensor).unsqueeze(0).float().to(device)

    # Generate caption using BLIP-2 model
    inputs = blip2_processor(images=image, return_tensors="pt").to(device)
    caption_ids = blip2_model.generate(**inputs)
    caption = blip2_processor.decode(caption_ids[0], skip_special_tokens=True)

    # The metric is called with a list of captions, one per image in the batch
    clip_score = clip_score_metric(image_tensor, [caption])
    return clip_score.detach().item()

# Calculate CLIP scores for all images and compute the average.
# compute_clip_score returns a float or raises, so every result is kept;
# no None filtering is needed.
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    clip_scores.append(clip_score)

if clip_scores:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")
else:
    # Reached only when image_paths is empty
    print("No valid clip scores computed.")


Average CLIP Score: 30.09947035962885


In [5]:

# Directory containing PNG images
image_dir = "logs/upsample-featup/log-resnet50-snr10/50/output"
# os.listdir returns entries in arbitrary order; sorting makes every run
# process the files in the same sequence.
image_paths = sorted(
    os.path.join(image_dir, file_name)
    for file_name in os.listdir(image_dir)
    if file_name.endswith('.png')
)

# Function to compute CLIP score for an image
def compute_clip_score(image_path):
    """Caption one image with BLIP-2 and score the image/caption pair with CLIPScore.

    Relies on the notebook-level ``device``, ``blip2_processor``,
    ``blip2_model`` and ``clip_score_metric`` objects.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        float: The value returned by ``clip_score_metric`` for this single
        image and its generated caption.
    """
    # The context manager closes the file handle even if decoding fails;
    # convert() returns a separate image that stays valid after the close.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # HWC pixel array -> CHW, then add a batch dimension: (1, C, H, W) float tensor
    image_tensor = np.array(image).transpose(2, 0, 1)
    image_tensor = torch.tensor(image_tensor).unsqueeze(0).float().to(device)

    # Generate caption using BLIP-2 model
    inputs = blip2_processor(images=image, return_tensors="pt").to(device)
    caption_ids = blip2_model.generate(**inputs)
    caption = blip2_processor.decode(caption_ids[0], skip_special_tokens=True)

    # The metric is called with a list of captions, one per image in the batch
    clip_score = clip_score_metric(image_tensor, [caption])
    return clip_score.detach().item()

# Calculate CLIP scores for all images and compute the average.
# compute_clip_score returns a float or raises, so every result is kept;
# no None filtering is needed.
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    clip_scores.append(clip_score)

if clip_scores:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")
else:
    # Reached only when image_paths is empty
    print("No valid clip scores computed.")


Average CLIP Score: 29.99231551921729


In [6]:

# Directory containing PNG images
image_dir = "logs/upsample-featup/log-resnet50-snr30/50/output"
# os.listdir returns entries in arbitrary order; sorting makes every run
# process the files in the same sequence.
image_paths = sorted(
    os.path.join(image_dir, file_name)
    for file_name in os.listdir(image_dir)
    if file_name.endswith('.png')
)

# Function to compute CLIP score for an image
def compute_clip_score(image_path):
    """Caption one image with BLIP-2 and score the image/caption pair with CLIPScore.

    Relies on the notebook-level ``device``, ``blip2_processor``,
    ``blip2_model`` and ``clip_score_metric`` objects.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        float: The value returned by ``clip_score_metric`` for this single
        image and its generated caption.
    """
    # The context manager closes the file handle even if decoding fails;
    # convert() returns a separate image that stays valid after the close.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # HWC pixel array -> CHW, then add a batch dimension: (1, C, H, W) float tensor
    image_tensor = np.array(image).transpose(2, 0, 1)
    image_tensor = torch.tensor(image_tensor).unsqueeze(0).float().to(device)

    # Generate caption using BLIP-2 model
    inputs = blip2_processor(images=image, return_tensors="pt").to(device)
    caption_ids = blip2_model.generate(**inputs)
    caption = blip2_processor.decode(caption_ids[0], skip_special_tokens=True)

    # The metric is called with a list of captions, one per image in the batch
    clip_score = clip_score_metric(image_tensor, [caption])
    return clip_score.detach().item()

# Calculate CLIP scores for all images and compute the average.
# compute_clip_score returns a float or raises, so every result is kept;
# no None filtering is needed.
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    clip_scores.append(clip_score)

if clip_scores:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")
else:
    # Reached only when image_paths is empty
    print("No valid clip scores computed.")


Average CLIP Score: 30.06799986290209
