In [1]:
import os
import torch
import numpy as np
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
from torchmetrics.multimodal import CLIPScore

# Checkpoint identifiers, named once so the model and tokenizer always agree
CAPTION_MODEL_ID = "qresearch/llama-3.1-8B-vision-378"
CLIP_MODEL_ID = "openai/clip-vit-base-patch16"

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Captioning model: loaded in half precision, then moved onto `device`.
# trust_remote_code=True lets the checkpoint supply its own model code.
model = AutoModelForCausalLM.from_pretrained(
    CAPTION_MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
model = model.to(device)

# Tokenizer that goes with the captioning checkpoint
tokenizer = AutoTokenizer.from_pretrained(CAPTION_MODEL_ID, use_fast=True)

# CLIPScore metric, placed on the same device as the captioning model
clip_score_metric = CLIPScore(model_name_or_path=CLIP_MODEL_ID)
clip_score_metric = clip_score_metric.to(device)

# Function to compute CLIP score for an image
def compute_clip_score(image_path):
    """Caption one image with the vision-language model and score the caption with CLIP.

    The image is loaded as RGB, a caption is generated for it via
    ``model.answer_question`` with the prompt "Caption the image", and the
    module-level ``clip_score_metric`` is then evaluated on the (image, caption)
    pair.

    Args:
        image_path: Path of the image file to score.

    Returns:
        float: The CLIP score reported by ``clip_score_metric`` for this image
        and its generated caption.

    Notes:
        The caption is sampled (``do_sample=True``, ``temperature=0.3``), so the
        score can vary between runs unless the RNG is seeded elsewhere.
    """
    # Open the file in a context manager so the underlying handle is closed as
    # soon as the pixels have been decoded; convert() returns an independent
    # in-memory copy, so nothing is leaked when looping over many files.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # HWC -> CHW, add a batch dimension, and move to the metric's device
    pixels_chw = np.array(image).transpose(2, 0, 1)
    image_tensor = torch.from_numpy(pixels_chw).unsqueeze(0).float()
    image_tensor = image_tensor.to(device)

    # Generate a caption for the PIL image with the captioning model
    generated_text = model.answer_question(
        image,
        "Caption the image",
        tokenizer,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.3,
    )

    # Score the (image, caption) pair and return it as a plain Python number.
    # NOTE(review): long captions may exceed CLIP's text context length (the
    # tokenizer warns about this) — confirm how the metric handles that case.
    clip_score = clip_score_metric(image_tensor, generated_text)
    return clip_score.detach().item()

2024-11-28 03:59:40.428994: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 03:59:40.460448: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-28 03:59:40.460482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-28 03:59:40.461313: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 03:59:40.466588: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [2]:
# Folder holding the PNG outputs to be scored
image_dir = "logs/upsample-featup/log-dinov2-snr10/50/output"
image_paths = [
    os.path.join(image_dir, filename)
    for filename in os.listdir(image_dir)
    if filename.endswith('.png')
]

# Score each image in turn, keeping only scores that were actually produced
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    if clip_score is None:
        continue
    clip_scores.append(clip_score)

# Report the mean score, or say so when nothing could be scored
if not clip_scores:
    print("No valid clip scores computed.")
else:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")

Token indices sequence length is longer than the specified maximum sequence length for this model (79 > 77). Running this sequence through the model will result in indexing errors


Average CLIP Score: 27.607014443296375


In [3]:
# Folder holding the PNG outputs to be scored
image_dir = "logs/upsample-featup/log-dinov2-snr30/50/output"
image_paths = [
    os.path.join(image_dir, filename)
    for filename in os.listdir(image_dir)
    if filename.endswith('.png')
]

# Score each image in turn, keeping only scores that were actually produced
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    if clip_score is None:
        continue
    clip_scores.append(clip_score)

# Report the mean score, or say so when nothing could be scored
if not clip_scores:
    print("No valid clip scores computed.")
else:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")

Average CLIP Score: 27.763186086307872


In [4]:
# Folder holding the PNG outputs to be scored
image_dir = "logs/upsample-featup/log-vit-snr10/50/output"
image_paths = [
    os.path.join(image_dir, filename)
    for filename in os.listdir(image_dir)
    if filename.endswith('.png')
]

# Score each image in turn, keeping only scores that were actually produced
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    if clip_score is None:
        continue
    clip_scores.append(clip_score)

# Report the mean score, or say so when nothing could be scored
if not clip_scores:
    print("No valid clip scores computed.")
else:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")

Average CLIP Score: 28.064116371039187


In [5]:
# Folder holding the PNG outputs to be scored
image_dir = "logs/upsample-featup/log-vit-snr30/50/output"
image_paths = [
    os.path.join(image_dir, filename)
    for filename in os.listdir(image_dir)
    if filename.endswith('.png')
]

# Score each image in turn, keeping only scores that were actually produced
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    if clip_score is None:
        continue
    clip_scores.append(clip_score)

# Report the mean score, or say so when nothing could be scored
if not clip_scores:
    print("No valid clip scores computed.")
else:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")

Average CLIP Score: 27.55332503680027


In [6]:
# Folder holding the PNG outputs to be scored
image_dir = "logs/upsample-featup/log-resnet50-snr10/50/output"
image_paths = [
    os.path.join(image_dir, filename)
    for filename in os.listdir(image_dir)
    if filename.endswith('.png')
]

# Score each image in turn, keeping only scores that were actually produced
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    if clip_score is None:
        continue
    clip_scores.append(clip_score)

# Report the mean score, or say so when nothing could be scored
if not clip_scores:
    print("No valid clip scores computed.")
else:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")

Average CLIP Score: 27.700846541289128


In [7]:
# Folder holding the PNG outputs to be scored
image_dir = "logs/upsample-featup/log-resnet50-snr30/50/output"
image_paths = [
    os.path.join(image_dir, filename)
    for filename in os.listdir(image_dir)
    if filename.endswith('.png')
]

# Score each image in turn, keeping only scores that were actually produced
clip_scores = []
for image_path in image_paths:
    clip_score = compute_clip_score(image_path)
    if clip_score is None:
        continue
    clip_scores.append(clip_score)

# Report the mean score, or say so when nothing could be scored
if not clip_scores:
    print("No valid clip scores computed.")
else:
    avg_clip_score = np.mean(clip_scores)
    print(f"Average CLIP Score: {avg_clip_score}")

Average CLIP Score: 27.90542115045316
