#### CLIP Score ####

In [1]:
pip install transformers torch torchvision tqdm


Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/a9/b6/5257d04ae327b44db31f15cce39e6020cc986333c715660b1315a9724d82/transformers-4.51.3-py3-none-any.whl.metadata
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.30.0 from https://files.pythonhosted.org/packages/93/27/1fb384a841e9661faad1c31cbfa62864f59632e876df5d795234da51c395/huggingface_hub-0.30.2-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/60/85/cebcc0aff603ea0a201667b203f13ba75d9fc8668fab917ac5b2de3967bc/regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata
  Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.w

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoModel, AutoProcessor, AutoTokenizer
import os
import os.path as osp
from tqdm import tqdm

class CLIPDataset(Dataset):
    """Dataset of (image, prompt) pairs preprocessed into CLIP model inputs.

    Args:
        image_paths: sequence of image file paths.
        text_prompts: sequence of prompts; ``text_prompts[i]`` describes
            ``image_paths[i]``.
        processor: callable used as ``processor(images=..., return_tensors="pt")``
            that returns a mapping with a ``"pixel_values"`` entry.
        tokenizer: callable used as ``tokenizer(prompt, return_tensors="pt", ...)``
            that returns a mapping of tensors with a leading batch axis of size 1.

    Raises:
        ValueError: if the two sequences differ in length.
    """

    def __init__(self, image_paths, text_prompts, processor, tokenizer):
        if len(image_paths) != len(text_prompts):
            raise ValueError(
                "image_paths and text_prompts must have the same length "
                f"(got {len(image_paths)} and {len(text_prompts)})"
            )
        self.image_paths = image_paths
        self.text_prompts = text_prompts
        self.processor = processor
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of (image, prompt) pairs."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(pixel_values, text_inputs)`` for pair ``idx``.

        ``pixel_values`` is the processor output with the batch axis removed;
        ``text_inputs`` is a dict of tokenizer tensors with the batch axis removed.
        """
        # The context manager closes the file handle; convert() yields a
        # separate, fully loaded image that outlives the `with` block.
        with Image.open(self.image_paths[idx]) as raw:
            img = raw.convert("RGB")
        prompt = self.text_prompts[idx]

        image_input = self.processor(images=img, return_tensors="pt")["pixel_values"].squeeze(0)
        # Pad every prompt to one fixed length. With padding=True a single
        # prompt is only padded to its own length, so items would have
        # different sizes and the default DataLoader collate could not stack them.
        # NOTE(review): assumes the tokenizer defines a finite model_max_length
        # (77 for the CLIP tokenizers) — confirm for other checkpoints.
        text_input = self.tokenizer(
            prompt, return_tensors="pt", padding="max_length", truncation=True
        )

        return image_input, {k: v.squeeze(0) for k, v in text_input.items()}

@torch.no_grad()
def evaluate_clip_score(image_paths, text_prompts, model_name="openai/clip-vit-base-patch32", batch_size=32, device=None):
    """Mean image/text cosine similarity over paired images and prompts.

    Each image and its prompt are embedded with the model's
    ``get_image_features`` / ``get_text_features``, both embeddings are
    L2-normalised, and their dot product is averaged over all pairs. The
    result is the raw cosine similarity (no scaling or clamping is applied).

    Args:
        image_paths: sequence of image file paths.
        text_prompts: sequence of prompts, aligned 1-to-1 with ``image_paths``.
        model_name: Hugging Face checkpoint loaded through ``AutoModel``,
            ``AutoProcessor`` and ``AutoTokenizer``.
        batch_size: number of pairs per forward pass.
        device: torch device; defaults to CUDA when available, else CPU.

    Returns:
        float: the mean cosine similarity.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    # Validate before the (expensive) model load so bad input fails fast
    # instead of surfacing later as a ZeroDivisionError or IndexError.
    num_pairs = len(image_paths)
    if num_pairs != len(text_prompts):
        raise ValueError(
            "image_paths and text_prompts must have the same length "
            f"(got {num_pairs} and {len(text_prompts)})"
        )
    if num_pairs == 0:
        raise ValueError("at least one image/prompt pair is required")

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load CLIP model
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()  # make inference mode explicit (disables dropout etc.)
    processor = AutoProcessor.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset = CLIPDataset(image_paths, text_prompts, processor, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    total_score = 0.0
    total_count = 0

    for images, texts in tqdm(dataloader):
        images = images.to(device)
        # Build a new dict rather than mutating the batch yielded by the loader.
        texts = {k: v.to(device) for k, v in texts.items()}

        image_features = model.get_image_features(pixel_values=images)
        text_features = model.get_text_features(**texts)

        # Unit-normalise so the row-wise dot product is a cosine similarity.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        scores = (image_features * text_features).sum(dim=-1)
        total_score += scores.sum().item()
        total_count += scores.size(0)

    return total_score / total_count


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Image files and their prompts, paired by position; extend both lists together.
# (A literal `...` inside a list is the Ellipsis object, not a placeholder, and
# would be handed to the image loader and tokenizer as if it were real data.)
image_paths = ["output/image1.png", "output/image2.png"]
text_prompts = ["A woman in a red dress", "A man wearing blue jeans"]

clip_score = evaluate_clip_score(image_paths, text_prompts)
print(f"Mean CLIP Score: {clip_score:.4f}")


KeyboardInterrupt: 

#### Fréchet Inception Distance (FID) ####

In [4]:
pip install pytorch-fid


Collecting pytorch-fid
  Obtaining dependency information for pytorch-fid from https://files.pythonhosted.org/packages/dd/2b/e8c875b9380f34c70d4b5d98deaaa8bcac4922388efad08a72cf129118ee/pytorch_fid-0.3.0-py3-none-any.whl.metadata
  Downloading pytorch_fid-0.3.0-py3-none-any.whl.metadata (5.3 kB)
Downloading pytorch_fid-0.3.0-py3-none-any.whl (15 kB)
Installing collected packages: pytorch-fid
Successfully installed pytorch-fid-0.3.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
from pytorch_fid import fid_score

def compute_fid(real_dir, generated_dir, batch_size=50, device=None, dims=2048):
    """
    Compute the FID between two image folders using pytorch-fid.

    real_dir: folder with real images
    generated_dir: folder with generated images
    batch_size, dims: forwarded unchanged to
        ``fid_score.calculate_fid_given_paths``
    device: torch device; when None, CUDA is used if available, else CPU

    Returns whatever ``calculate_fid_given_paths`` returns.
    """
    target_device = device
    if target_device is None:
        backend = "cuda" if torch.cuda.is_available() else "cpu"
        target_device = torch.device(backend)

    # Order matters only for readability here: real first, generated second.
    folders = [real_dir, generated_dir]
    return fid_score.calculate_fid_given_paths(
        folders,
        batch_size=batch_size,
        device=target_device,
        dims=dims,
    )


In [None]:
# Folders to compare: reference images vs. model outputs.
real_dir = "path/to/real/images"
generated_dir = "path/to/generated/images"

# Defaults are used for batch size, device and feature dimensionality.
fid = compute_fid(real_dir=real_dir, generated_dir=generated_dir)
print(f"FID: {fid:.2f}")


#### LPIPS (Learned Perceptual Image Patch Similarity) ####

In [6]:
pip install lpips

Collecting lpips
  Obtaining dependency information for lpips from https://files.pythonhosted.org/packages/9b/13/1df50c7925d9d2746702719f40e864f51ed66f307b20ad32392f1ad2bb87/lpips-0.1.4-py3-none-any.whl.metadata
  Downloading lpips-0.1.4-py3-none-any.whl.metadata (10 kB)
Downloading lpips-0.1.4-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lpips
Successfully installed lpips-0.1.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
import lpips
from torchvision import transforms
from PIL import Image

# Preprocessing transform
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # both images of a pair must share one resolution
    transforms.ToTensor(),          # PIL image -> float tensor scaled to [0, 1]
])


def _load_lpips_tensor(path, device):
    """Open ``path`` as RGB, apply ``transform`` and return a 1-image batch on ``device``."""
    # The context manager closes the file handle; convert() yields a loaded copy.
    with Image.open(path) as raw:
        rgb = raw.convert('RGB')
    return transform(rgb).unsqueeze(0).to(device)


def compute_lpips(real_paths, gen_paths, net='alex', device=None):
    """
    Mean LPIPS distance over paired real/generated images.

    real_paths, gen_paths: lists of image file paths (must match 1-to-1)
    net: backbone name passed to ``lpips.LPIPS``
    device: torch device; when None, CUDA is used if available, else CPU

    Returns the average per-pair LPIPS distance as a float.
    Raises ValueError if the lists are empty or differ in length.
    """
    # zip() would silently truncate to the shorter list while the mean still
    # divided by len(real_paths), so reject mismatched input up front.
    if len(real_paths) != len(gen_paths):
        raise ValueError(
            "real_paths and gen_paths must have the same length "
            f"(got {len(real_paths)} and {len(gen_paths)})"
        )
    if len(real_paths) == 0:
        raise ValueError("at least one image pair is required")

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    loss_fn = lpips.LPIPS(net=net).to(device)
    total_score = 0.0

    # Pure evaluation: no autograd graph is needed.
    with torch.no_grad():
        for real_path, gen_path in zip(real_paths, gen_paths):
            img0 = _load_lpips_tensor(real_path, device)
            img1 = _load_lpips_tensor(gen_path, device)
            # ToTensor yields [0, 1] inputs; LPIPS works on [-1, 1], and
            # normalize=True makes it rescale the inputs accordingly.
            score = loss_fn(img0, img1, normalize=True)
            total_score += score.item()

    return total_score / len(real_paths)


In [None]:
# Real and generated image files, paired by position; extend both lists together.
# (A literal `...` inside a list is the Ellipsis object, not a placeholder, and
# would be handed to the image loader as if it were a path.)
real_imgs = ["real/img1.png", "real/img2.png"]
gen_imgs = ["gen/img1.png", "gen/img2.png"]

lpips_score = compute_lpips(real_imgs, gen_imgs)
print(f"LPIPS (AlexNet): {lpips_score:.4f}")
