In [1]:
import os
import torch
from PIL import Image
from torch.nn import CosineSimilarity
from transformers import CLIPModel, CLIPProcessor, logging

# Setup
clip_model_id = 'openai/clip-vit-large-patch14'
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing when the model is moved to `device`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
logging.set_verbosity_error()  # Silence transformers messages below error level
torch.manual_seed(1)  # Fixed seed for torch's random number generator

# Textual inversion settings
property_name = 'grooty'  # Name of learned property
target_path = f'../data/{property_name}'  # Path to target images
samples_path = f'generated_images/samples/{property_name}'  # Path to generated images

# Hugging Face access token: first line of a local file kept out of the notebook.
# A missing file raises FileNotFoundError here, so no placeholder value is needed.
with open('hugging_face_token.txt', 'r') as secret:
    token = secret.readline().strip()

In [2]:
# Load CLIP components
# The processor is fetched first; the model is then loaded, moved onto `device`
# and put into evaluation mode in a single chained expression.
processor = CLIPProcessor.from_pretrained(clip_model_id)
model = CLIPModel.from_pretrained(clip_model_id).to(device).eval()
print('Loaded CLIP model successfully!')

Loaded CLIP model successfully!


In [3]:
# Load embeddings by passing images through CLIP
def load_embeddings(image_dir):
    """Embed every file in ``image_dir`` with the CLIP image encoder.

    Args:
        image_dir: Directory whose regular files are all opened as images.

    Returns:
        The tensor produced by ``model.get_image_features`` for the whole
        batch, with images fed in sorted path order.

    Raises:
        ValueError: If ``image_dir`` contains no regular files.
    """
    # os.listdir returns entries in arbitrary order and also lists
    # sub-directories (e.g. .ipynb_checkpoints); sort for a reproducible
    # order and keep regular files only.
    candidates = (os.path.join(image_dir, name) for name in os.listdir(image_dir))
    file_paths = sorted(path for path in candidates if os.path.isfile(path))
    if not file_paths:
        raise ValueError(f'No image files found in {image_dir!r}')

    images = []
    for path in file_paths:
        # Image.open is lazy and keeps the file handle open. copy() forces a
        # full decode into an independent image, so the handle can be closed
        # immediately instead of leaking one descriptor per image.
        with Image.open(path) as image:
            images.append(image.copy())

    # Inference only: no gradients are needed for the embeddings.
    with torch.no_grad():
        inputs = processor(images=images, return_tensors='pt').to(device)
        embeddings = model.get_image_features(**inputs)
    return embeddings

In [4]:
# Load target images CLIP embeddings
target_features = load_embeddings(target_path)

# Load sample images CLIP embeddings
sample_features = load_embeddings(samples_path)

# Measure image similarity over every (target, sample) pair
num_pairs = target_features.shape[0] * sample_features.shape[0]
if num_pairs == 0:
    raise ValueError('Need at least one target image and one sample image to compute a score')

# L2-normalise each embedding row so that a single matrix product yields the
# cosine similarity of all pairs at once (rows: targets, columns: samples).
target_unit = torch.nn.functional.normalize(target_features, dim=-1)
sample_unit = torch.nn.functional.normalize(sample_features, dim=-1)
pairwise_similarity = target_unit @ sample_unit.T

# Sum the unrounded similarities in float64; rounding each pair before
# accumulating would add avoidable error to the mean.
similarity_acc = pairwise_similarity.double().sum().item()

# Get score: mean pairwise similarity, rounded once for display
reconstruction_score = round(similarity_acc / num_pairs, 4)
print(f'Reconstruction Score (Image Similarity): {reconstruction_score}')

Reconstruction Score (Image Similarity): 0.7622
