In [None]:
import pandas as pd
from pathlib import Path

# Lookup from the background field as it appears inside a file name to the
# prompt text stored in the 'prompt_fundo' column. The lookup below uses
# `.get(raw, raw)`, so a field missing from this table is kept unchanged.
# NOTE(review): 'The city of London' maps to 'The city of London World';
# REFERENCE_IMAGES is keyed by that same longer string, so the two must be
# changed together if this is ever corrected.
# NOTE(review): the last key reads "foes" while its value reads "foxes" --
# presumably the key mirrors a typo in the file names on disk; confirm.
background_map = {
    'The city of London': 'The city of London World',
    'The Parthenon in front of the Great Pyramid': 'The Parthenon in front of the Great Pyramid',
    'A single beam of light enters the room from the ceiling The beam of light is illuminating an easel On the easel there is a Rembrandt painting of a raccoon': 'A single beam of light enters the room from the ceiling. The beam of light is illuminating an easel. On the easel, there is a Rembrandt painting of a raccoon.',
    'A sunset': 'A sunset',
    'Photograph of a wall along a city street with a watercolor mural of foes in a jazz band': 'Photograph of a wall along a city street with a watercolor mural of foxes in a jazz band.'
}

# Lookup from the clothing field as it appears inside a file name
# (underscore-separated) to the prompt text stored in the 'prompt_roupa'
# column. Every value here is its key with '_' replaced by ' ', which is also
# the fallback applied below for fields missing from this table.
clothes_map = {
    'A_scientist': 'A scientist',
    'A_photograph_of_a_knight_in_shining_armor_holding_a_basketball': 'A photograph of a knight in shining armor holding a basketball',
    'The_Mona_Lisa': 'The Mona Lisa',
    'Salvador_Dalí': 'Salvador Dalí',
    'A_person_with_arms_like_a_tree_branch': 'A person with arms like a tree branch'
}

def parse_threshold(th_str: str) -> float:
    """Turn a threshold tag taken from a file name into a number.

    The first two characters of ``th_str`` are discarded without being
    inspected; the remainder is read as a base-10 integer and divided by ten
    (for example ``'th05'`` becomes ``0.5``).

    Args:
        th_str: two-character prefix followed by integer digits.

    Returns:
        The integer part of the tag divided by 10.

    Raises:
        ValueError: if the text after the prefix is not a valid integer.
    """
    tenths = int(th_str[2:])
    return tenths / 10.0

# Column layout of the metadata frame. Declared up front so the frame keeps
# the same schema even when no image is found; later cells group by
# 'threshold' and merge on 'nome_arquivo' and would otherwise hit a KeyError.
METADATA_COLUMNS = ['nome_arquivo', 'imagem', 'threshold', 'prompt_fundo', 'prompt_roupa']

rows = []
for file in sorted(Path('experiment').glob('*.jpg')):
    name = file.name
    # The stem is split on '__'; fields 0, 2, 3 and 4 are the image name,
    # background, threshold tag and clothing. Field 1 is not used here.
    parts = file.stem.split('__')
    if len(parts) < 5:
        # Name the offending file instead of failing with a bare IndexError.
        raise ValueError(
            f"Unexpected file name {name!r}: expected at least 5 '__'-separated fields, got {len(parts)}"
        )
    img_name = parts[0]
    background_raw = parts[2]
    th_raw = parts[3]
    clothing_raw = parts[4]
    row = {
        'nome_arquivo': name,
        'imagem': img_name,
        'threshold': parse_threshold(th_raw),
        # Unknown backgrounds are kept verbatim; unknown clothing fields only
        # get their underscores turned into spaces.
        'prompt_fundo': background_map.get(background_raw, background_raw),
        'prompt_roupa': clothes_map.get(clothing_raw, clothing_raw.replace('_', ' '))
    }
    rows.append(row)

df = pd.DataFrame(rows, columns=METADATA_COLUMNS)


In [None]:

import torch
import clip
from PIL import Image
from torchvision import transforms as T
from torchvision.metrics import FrechetInceptionDistance

# Maps a prompt text to the path of its reference image. Background prompts
# and clothing prompts share this one table; the processing cell indexes it
# with both row['prompt_fundo'] and row['prompt_roupa'] using plain `[...]`
# lookups, so a prompt missing from this table raises KeyError there.
# Paths are relative -- presumably to the notebook's working directory.
REFERENCE_IMAGES = {
    'Photograph of a wall along a city street with a watercolor mural of foxes in a jazz band.': 'reference_images/fox_mural.png',
    'The city of London World': 'reference_images/london.png',
    'The Parthenon in front of the Great Pyramid': 'reference_images/partenon_great_pyramid.png',
    'A sunset': 'reference_images/sunset.png',
    'A single beam of light enters the room from the ceiling. The beam of light is illuminating an easel. On the easel, there is a Rembrandt painting of a raccoon.': 'reference_images/racoon.png',
    'A scientist': 'reference_images/scientist.png',
    'A photograph of a knight in shining armor holding a basketball': 'reference_images/knight_basketball.png',
    'The Mona Lisa': 'reference_images/monalisa.jpg',
    'Salvador Dalí': 'reference_images/salvador_dali.jpeg',
    'A person with arms like a tree branch': 'reference_images/tree_arms.png'
}

# Preprocessing pipeline applied to images before they reach the FID metric.
# Stays None until setup_fid_transform() has been called.
fid_transform = None


def setup_fid_transform():
    """Build the FID preprocessing pipeline and store it in ``fid_transform``.

    The pipeline resizes an image to 299x299 and then applies
    ``T.ToTensor()``. Calling this again simply rebuilds the pipeline.
    """
    global fid_transform
    steps = [
        T.Resize((299, 299)),
        T.ToTensor(),
    ]
    fid_transform = T.Compose(steps)

def compute_clip_score(image_path, prompt, model, preprocess, device):
    """Return the cosine similarity between an image and a text prompt.

    Args:
        image_path: path of the image file; it is opened and converted to RGB.
        prompt: text to compare the image against.
        model: object providing ``encode_image`` and ``encode_text``.
        preprocess: callable turning the RGB image into a tensor for ``model``.
        device: device the image batch and the text tokens are moved to.

    Returns:
        float: dot product of the L2-normalised image and text embeddings.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    image_batch = preprocess(rgb_image).unsqueeze(0).to(device)
    tokens = clip.tokenize([prompt]).to(device)

    # Inference only: no gradients are needed for scoring.
    with torch.no_grad():
        image_emb = model.encode_image(image_batch)
        text_emb = model.encode_text(tokens)
        # Scale both embeddings to unit length so the product below is a
        # cosine similarity.
        image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)
        text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
        similarity = image_emb @ text_emb.T

    return float(similarity.squeeze().cpu().item())

def compute_fid(image_path, ref_path, fid_metric):
    """Score one image against one reference image with ``fid_metric``.

    Both files are opened, converted to RGB and passed through the
    module-level ``fid_transform`` (built on first use), giving one
    single-image batch each. The metric is reset, fed the reference with
    ``real=True`` and the image with ``real=False``, and the result of its
    ``compute()`` is returned.

    Args:
        image_path: path of the image to evaluate.
        ref_path: path of the reference image.
        fid_metric: object exposing ``reset()``, ``update(batch, real=...)``
            and ``compute()`` -- presumably torchmetrics'
            ``FrechetInceptionDistance``; TODO confirm.

    Returns:
        float: the value produced by ``fid_metric.compute()``.
    """
    if fid_transform is None:
        setup_fid_transform()

    # When the metric does not rescale its own inputs (no truthy `normalize`
    # attribute), float images in [0, 1] coming out of ToTensor() are turned
    # into uint8 images in [0, 255].
    # NOTE(review): this matches the input contract documented for
    # torchmetrics' FID with normalize=False -- confirm against the metric
    # implementation actually in use.
    metric_rescales = bool(getattr(fid_metric, 'normalize', False))

    def _load_batch(path):
        # The context manager releases the file handle as soon as the pixel
        # data has been copied by convert().
        with Image.open(path) as handle:
            batch = fid_transform(handle.convert('RGB')).unsqueeze(0)
        if batch.is_floating_point() and not metric_rescales:
            batch = (batch * 255).round().to(torch.uint8)
        return batch

    img = _load_batch(image_path)
    ref = _load_batch(ref_path)

    # NOTE(review): each distribution receives exactly one sample here. FID
    # compares feature statistics of two sets of images, and torchmetrics is
    # believed to require at least two samples per side -- confirm that
    # compute() accepts this and that the result is meaningful.
    fid_metric.reset()
    fid_metric.update(ref, real=True)
    fid_metric.update(img, real=False)
    return float(fid_metric.compute().item())


In [None]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, preprocess = clip.load('ViT-B/32', device=device)
# NOTE(review): FrechetInceptionDistance is imported from `torchvision.metrics`
# in this notebook; its reset()/update(..., real=...)/compute() usage looks
# like the torchmetrics API -- confirm the import resolves in the target
# environment.
fid_metric = FrechetInceptionDistance(feature=64)

max_images = None  # set to an int to limit processing for tests
METRIC_COLUMNS = ['clip_score', 'fid']

# Drop metric columns left over from a previous run of this cell. Without
# this, re-running merges them in a second time and pandas renames them to
# clip_score_x / clip_score_y, which breaks the aggregation cell.
df_base = df.drop(columns=METRIC_COLUMNS, errors='ignore')
df_proc = df_base if max_images is None else df_base.head(max_images)

clip_scores = []
fid_scores = []
for _, row in df_proc.iterrows():
    img_path = Path('experiment') / row['nome_arquivo']
    # CLIP is scored against the combined "<clothing> in <background>" prompt.
    text_prompt = f"{row['prompt_roupa']} in {row['prompt_fundo']}"
    clip_score = compute_clip_score(img_path, text_prompt, model, preprocess, device)
    # Plain [...] lookups: a prompt without a reference image raises KeyError.
    back_ref = Path(REFERENCE_IMAGES[row['prompt_fundo']])
    cloth_ref = Path(REFERENCE_IMAGES[row['prompt_roupa']])
    fid_back = compute_fid(img_path, back_ref, fid_metric)
    fid_cloth = compute_fid(img_path, cloth_ref, fid_metric)
    clip_scores.append(clip_score)
    # The stored FID is the mean of the background and clothing comparisons.
    fid_scores.append((fid_back + fid_cloth) / 2.0)

df_proc = df_proc.copy()
df_proc['clip_score'] = clip_scores
df_proc['fid'] = fid_scores
# Left merge keeps every file; rows skipped through max_images get NaN metrics.
df = df_base.merge(df_proc[['nome_arquivo'] + METRIC_COLUMNS], on='nome_arquivo', how='left')

df.to_csv('experiment_metrics.csv', index=False)
df.head()


In [None]:

# Mean CLIP score and mean FID for each threshold value, one row per
# threshold, with 'threshold' kept as a regular column.
threshold_metrics = df.groupby('threshold', as_index=False)[['clip_score', 'fid']].mean()
threshold_metrics


In [None]:

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharex=True)

# One panel per metric: (column to plot, panel title, extra line styling).
panels = [
    ('clip_score', 'CLIP Score', {}),
    ('fid', 'FID', {'color': 'orange'}),
]
for ax, (column, title, line_style) in zip(axes, panels):
    ax.plot(threshold_metrics['threshold'], threshold_metrics[column], marker='o', **line_style)
    ax.set_title(title)
    ax.set_xlabel('threshold')
    ax.set_ylabel('score')

plt.tight_layout()
plt.show()
