In [None]:
import os
import json
import torch
import pyiqa
import logging
import datasets
import torchvision
import numpy as np
from PIL import Image
from tqdm.auto import tqdm

# Environment variables are set in one place, before the CUDA availability check below.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "MODELSCOPE_LOG_LEVEL": str(logging.ERROR),
})
torchvision.disable_beta_transforms_warning()

# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Seed handed to torch.manual_seed during image generation.
seed = 42
# Number of images generated (and later scored) for every prompt.
images_evaluated_per_prompt = 3

## Load Dataset

In [None]:
# Sub-directory appended to both the image output root and the evaluation output root.
result_dir = "cifar/original_prompt/"
image_gen_dir = "generated_images/" + result_dir
eval_dir = "eval/" + result_dir

# CIFAR10
classes = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]
# One prompt per class name.
prompts = [f"a photo of a {class_name}" for class_name in classes]

## Generate Images for Evaluation

In [None]:
from diffusers import DiffusionPipeline

# Checkpoint identifier passed to DiffusionPipeline.from_pretrained.
model_id = "CompVis/stable-diffusion-v1-4"

# Load in bfloat16 with no safety checker, then move the pipeline to the selected device.
pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype = torch.bfloat16, safety_checker = None)
pipeline = pipeline.to(device)

# The outer tqdm loop already reports progress, so the per-call bar is disabled.
pipeline.set_progress_bar_config(disable = True)

In [None]:
# Both output directories must exist before anything is written to them.
for output_dir in (image_gen_dir, eval_dir):
    os.makedirs(output_dir, exist_ok = True)

for prompt in tqdm(prompts):
    # The generator is re-seeded with the same seed for every prompt.
    generator = torch.manual_seed(seed)
    images = pipeline(
        prompt.strip(),
        num_images_per_prompt = images_evaluated_per_prompt,
        generator = generator,
    ).images
    # Files are named "<last word of the prompt>_<image index>.png".
    file_stem = prompt.split(' ')[-1]
    for i, image in enumerate(images):
        image.save(image_gen_dir + f"{file_stem}_{i}.png")

## Alignment Evaluation Metrics

### DSG

In [None]:
from DSG.dsg.query_utils import generate_dsg
from DSG.dsg.vqa_utils import MPLUG, calc_vqa_score
from DSG.dsg.parse_utils import parse_question_output
from transformers import AutoTokenizer, AutoModelForCausalLM

# VQA model used to answer the generated DSG questions about each image.
vqa_model = MPLUG()
vqa_model.pipeline_vqa.use_reentrant = False

# Tokenizer for the question-generating LLM; the EOS token doubles as the padding token.
llm_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
llm_tokenizer.pad_token = llm_tokenizer.eos_token

# The LLM itself, loaded in bfloat16 on the selected device.
llm = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    device_map = device,
    torch_dtype = torch.bfloat16,
)
# Keep generation padding in sync with the tokenizer's pad token.
llm.generation_config.pad_token_id = llm_tokenizer.pad_token_id

def autocomplete(prompt, max_new_tokens = 256, **kwargs):
    """Generate a continuation of `prompt` with the module-level `llm`.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        **kwargs: Extra keyword arguments forwarded to `llm.generate`.

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens are
        sliced off), with any trailing end-of-sequence token text removed.
    """
    inputs = llm_tokenizer([prompt], return_tensors = "pt", padding = True).to(device)
    output_ids = llm.generate(**inputs, generation_config = llm.generation_config, max_new_tokens = max_new_tokens, **kwargs)

    # Keep only the tokens that come after the prompt.
    prompt_length = inputs.input_ids.size(dim = 1)
    completion = llm_tokenizer.batch_decode(output_ids[:, prompt_length:])[0]

    # str.rstrip(chars) treats its argument as a *set of characters*, so it would also
    # eat legitimate trailing letters that happen to occur in the EOS token text.
    # Remove only exact trailing occurrences of the EOS token instead.
    eos_token = llm_tokenizer.eos_token
    while eos_token and completion.endswith(eos_token):
        completion = completion[:-len(eos_token)]
    return completion

In [None]:
# One input record per prompt, keyed by the prompt's position in `prompts`.
id2prompts = dict(enumerate({"input": prompt} for prompt in prompts))

# generate_dsg returns three values; only the middle one (question outputs) is kept.
_, id2question_outputs, _ = generate_dsg(
    id2prompts,
    generate_fn = autocomplete,
    verbose = False,
)

In [None]:
# Score every prompt with DSG: answer each generated question for each image with
# the VQA model, turn the answers into a per-image score, and average over images.
result = {"data": []}
for i, prompt in enumerate(tqdm(prompts)):
    # Image files are named "<last word of the prompt>_<image index>.png".
    file_stem = prompt.split(' ')[-1]
    images = [Image.open(image_gen_dir + f"{file_stem}_{j}.png") for j in range(images_evaluated_per_prompt)]
    qid2question = parse_question_output(id2question_outputs[i]["output"])
    # One {question id: lower-cased answer} mapping per image.
    qid2answers = [
        {qid: vqa_model.vqa(image, question).lower() for qid, question in qid2question.items()}
        for image in images
    ]
    dsg_scores = [calc_vqa_score(qid2answer)["average_score_without_dependency"] for qid2answer in qid2answers]
    result["data"].append({
        "Prompt": prompt,
        "VQA": {"Question": qid2question, **{f"Answer {j}": qid2answer for j, qid2answer in enumerate(qid2answers)}},
        "Score": sum(dsg_scores) / len(dsg_scores),
    })

# NOTE(review): this path is a fixed literal, whereas the other metric cells write under
# `eval_dir`; runs with a different `result_dir` will overwrite this file. Left as-is
# because the cell that reloads the results uses the same literal - confirm and align both.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("eval/cifar_prompts_dsg.json", "w") as f:
    json.dump(result, f)

In [None]:
# Reload the saved DSG results; the records are stored under the top-level "data" field.
dsg_data_files = {"eval": "eval/cifar_prompts_dsg.json"}
dsg_eval_dataset = datasets.load_dataset("json", data_files = dsg_data_files, split = "eval", field = "data")

# Show the first record, then the mean score over all prompts.
print("Sample Eval -", dsg_eval_dataset[0])
dsg_mean_score = sum(dsg_eval_dataset["Score"]) / len(dsg_eval_dataset)
print("DSG -", dsg_mean_score)

### VQAScore

In [None]:
from t2v_metrics.t2v_metrics import VQAScore

clip_flant5_score = VQAScore(model = "clip-flant5-xl")

# Score each prompt against all of its generated images and keep the mean.
result = {"data": []}
for prompt in tqdm(prompts):
    # Image files are named "<last word of the prompt>_<image index>.png".
    file_stem = prompt.split(' ')[-1]
    image_paths = [image_gen_dir + f"{file_stem}_{i}.png" for i in range(images_evaluated_per_prompt)]
    # The scorer receives file paths (not loaded images) plus the single prompt text.
    scores = clip_flant5_score(images = image_paths, texts = [prompt])
    result["data"].append({"Prompt": prompt, "Score": scores.detach().cpu().mean().item()})

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(eval_dir + "vqascore.json", "w") as f:
    json.dump(result, f)

In [None]:
# Reload the saved VQAScore results; the records are stored under the top-level "data" field.
vqascore_data_files = {"eval": eval_dir + "vqascore.json"}
vqascore_eval_dataset = datasets.load_dataset("json", data_files = vqascore_data_files, split = "eval", field = "data")

# Show the first record, then the mean score over all prompts.
print("Sample Eval -", vqascore_eval_dataset[0])
vqascore_mean_score = sum(vqascore_eval_dataset["Score"]) / len(vqascore_eval_dataset)
print("VQAScore -", vqascore_mean_score)

### CLIPScore

In [None]:
import numpy as np
from torchmetrics.multimodal.clip_score import CLIPScore

clip_score = CLIPScore(model_name_or_path = "openai/clip-vit-large-patch14").to(device)

# Score every generated image against its prompt and average per prompt.
result = {"data": []}
for prompt in tqdm(prompts):
    # Image files are named "<last word of the prompt>_<image index>.png".
    file_stem = prompt.split(' ')[-1]
    images = [Image.open(image_gen_dir + f"{file_stem}_{i}.png") for i in range(images_evaluated_per_prompt)]
    clip_scores = []
    for image in images:
        # Move the last axis of the pixel array to the front before scoring
        # (presumably height x width x channels -> channels x height x width).
        image_tensor = torch.tensor(np.asarray(image)).permute(2, 0, 1).to(device)
        clip_scores.append(clip_score(image_tensor, prompt).detach().cpu().item())
    result["data"].append({"Prompt": prompt, "Score": sum(clip_scores) / len(clip_scores)})

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(eval_dir + "clipscore.json", "w") as f:
    json.dump(result, f)

In [None]:
# Reload the saved CLIPScore results; the records are stored under the top-level "data" field.
clip_data_files = {"eval": eval_dir + "clipscore.json"}
clip_eval_dataset = datasets.load_dataset("json", data_files = clip_data_files, split = "eval", field = "data")

# Show the first record, then the mean score over all prompts.
print("Sample Eval -", clip_eval_dataset[0])
clip_mean_score = sum(clip_eval_dataset["Score"]) / len(clip_eval_dataset)
print("CLIPScore -", clip_mean_score)

## Quality Evaluation Metrics

### LIQE

In [None]:
liqe = pyiqa.create_metric("liqe", device = device)

# Score all images of a prompt as one batch and keep the batch mean.
result = {"data": []}
for prompt in tqdm(prompts):
    # Image files are named "<last word of the prompt>_<image index>.png".
    file_stem = prompt.split(' ')[-1]
    images = [Image.open(image_gen_dir + f"{file_stem}_{i}.png") for i in range(images_evaluated_per_prompt)]
    # Stack the images into one array, move the last axis to position 1 and divide the
    # pixel values by 255. A separate name keeps `images` bound to the list of PIL images.
    image_batch = torch.tensor(np.stack(images)).permute(0, 3, 1, 2).to(device) / 255
    liqe_score = liqe(image_batch).detach().cpu().mean().item()
    result["data"].append({"Prompt": prompt, "Score": liqe_score})

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(eval_dir + "liqe.json", "w") as f:
    json.dump(result, f)

In [None]:
# Reload the saved LIQE results; the records are stored under the top-level "data" field.
liqe_data_files = {"eval": eval_dir + "liqe.json"}
liqe_eval_dataset = datasets.load_dataset("json", data_files = liqe_data_files, split = "eval", field = "data")

# Show the first record, then the mean score over all prompts.
print("Sample Eval -", liqe_eval_dataset[0])
liqe_mean_score = sum(liqe_eval_dataset["Score"]) / len(liqe_eval_dataset)
print("LIQE -", liqe_mean_score)

### MUSIQ

In [None]:
musiq = pyiqa.create_metric("musiq", device = device)

# Score all images of a prompt as one batch and keep the batch mean.
result = {"data": []}
for prompt in tqdm(prompts):
    # Image files are named "<last word of the prompt>_<image index>.png".
    file_stem = prompt.split(' ')[-1]
    images = [Image.open(image_gen_dir + f"{file_stem}_{i}.png") for i in range(images_evaluated_per_prompt)]
    # Stack the images into one array, move the last axis to position 1 and divide the
    # pixel values by 255. A separate name keeps `images` bound to the list of PIL images.
    image_batch = torch.tensor(np.stack(images)).permute(0, 3, 1, 2).to(device) / 255
    musiq_score = musiq(image_batch).detach().cpu().mean().item()
    result["data"].append({"Prompt": prompt, "Score": musiq_score})

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(eval_dir + "musiq.json", "w") as f:
    json.dump(result, f)

In [None]:
# Reload the saved MUSIQ results; the records are stored under the top-level "data" field.
musiq_data_files = {"eval": eval_dir + "musiq.json"}
musiq_eval_dataset = datasets.load_dataset("json", data_files = musiq_data_files, split = "eval", field = "data")

# Show the first record, then the mean score over all prompts.
print("Sample Eval -", musiq_eval_dataset[0])
musiq_mean_score = sum(musiq_eval_dataset["Score"]) / len(musiq_eval_dataset)
print("MUSIQ -", musiq_mean_score)

### PIQE

In [None]:
piqe = pyiqa.create_metric("piqe", device = device)

# Score all images of a prompt as one batch and keep the batch mean.
result = {"data": []}
for prompt in tqdm(prompts):
    # Image files are named "<last word of the prompt>_<image index>.png".
    file_stem = prompt.split(' ')[-1]
    images = [Image.open(image_gen_dir + f"{file_stem}_{i}.png") for i in range(images_evaluated_per_prompt)]
    # Stack the images into one array, move the last axis to position 1 and divide the
    # pixel values by 255. A separate name keeps `images` bound to the list of PIL images.
    image_batch = torch.tensor(np.stack(images)).permute(0, 3, 1, 2).to(device) / 255
    piqe_score = piqe(image_batch).detach().cpu().mean().item()
    result["data"].append({"Prompt": prompt, "Score": piqe_score})

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(eval_dir + "piqe.json", "w") as f:
    json.dump(result, f)

In [None]:
# Reload the saved PIQE results; the records are stored under the top-level "data" field.
piqe_data_files = {"eval": eval_dir + "piqe.json"}
piqe_eval_dataset = datasets.load_dataset("json", data_files = piqe_data_files, split = "eval", field = "data")

# Show the first record, then the mean score over all prompts.
print("Sample Eval -", piqe_eval_dataset[0])
piqe_mean_score = sum(piqe_eval_dataset["Score"]) / len(piqe_eval_dataset)
print("PIQE -", piqe_mean_score)