In [None]:
import os
import pandas as pd
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import torchvision.transforms as transforms
from torch_fidelity import calculate_metrics
import shutil

# Turn off the tokenizers parallelism warning before any model is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Stable Diffusion pipeline: half-precision weights, placed on the GPU.
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
).to("cuda")

# CLIP model and its matching processor, used for scoring image/caption pairs.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Caption CSV files and the directories that receive their generated images.
# The two lists are paired by position.
dataset_files = [
    "captions_20to40.csv",
    "captions_40to60.csv",
    "captions_60to80.csv",
]
output_dirs = [
    "cat_gener_image_20to40",
    "cat_gener_image_40to60",
    "cat_gener_image_60to80",
]

# Define a function to calculate CLIP score
def calculate_clip_score(image, text):
    """Return the CLIP image-text logit for one image/caption pair.

    Args:
        image: The image to score (the caller passes a PIL image produced by
            the diffusion pipeline).
        text: The caption to compare the image against.

    Returns:
        The single value of the model's ``logits_per_image`` output as a
        Python float.
    """
    inputs = clip_processor(
        text=[text],
        images=image,
        return_tensors="pt",
        # Long captions would otherwise exceed the text encoder's maximum
        # sequence length and make the forward pass fail; truncate instead.
        padding=True,
        truncation=True,
    )
    # Pure inference: skip building an autograd graph for every scored image.
    with torch.no_grad():
        outputs = clip_model(**inputs)
    return outputs.logits_per_image.item()

def _inception_score(images, temp_dir="temp_images"):
    """Compute the Inception Score of a list of PIL images with torch-fidelity.

    The images are written as PNG files into ``temp_dir``, scored from that
    directory, and the directory is removed afterwards (also when scoring
    raises).

    Args:
        images: Non-empty list of PIL images.
        temp_dir: Scratch directory; any existing content is discarded.

    Returns:
        Tuple ``(inception_score_mean, inception_score_std)``.
    """
    # Start from an empty directory so files left behind by an interrupted
    # run are not scored together with the current images.
    shutil.rmtree(temp_dir, ignore_errors=True)
    os.makedirs(temp_dir)
    try:
        # Save the PIL images directly; converting them to a CUDA tensor and
        # back only costs GPU memory and can alter pixel values.
        for i, img in enumerate(images):
            img.save(os.path.join(temp_dir, f"img_{i}.png"))

        # The score is averaged over splits of the sample set. When a split
        # holds a single image the score is exactly 1.0 (the previous run
        # logged "1.0 ± 7.85e-17" for 10 samples), so cap the split count to
        # keep at least two images per split.
        num_splits = max(1, min(10, len(images) // 2))

        # NOTE(review): the run log reports the "inception-v3-compat"
        # extractor; confirm whether `input1_model` has any effect here.
        metrics = calculate_metrics(
            input1=temp_dir,
            input1_model="inception-v3",
            isc=True,
            isc_splits=num_splits,
        )
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
    return metrics["inception_score_mean"], metrics["inception_score_std"]


# Iterate over each dataset file: generate one image per caption, score it
# with CLIP, then compute an Inception Score over the whole set and write a
# per-dataset CSV report.
for dataset_path, output_dir in zip(dataset_files, output_dirs):
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    print(f"Processing dataset: {dataset_path}, saving to: {output_dir}")

    # Read the CSV file (expects 'image' and 'caption' columns)
    data = pd.read_csv(dataset_path)

    generated_images = []  # PIL images kept for the Inception Score
    image_info = []        # one report row per generated image
    clip_scores = []       # CLIP scores used for the dataset average

    # Iterate over each row in the CSV file
    for _, row in data.iterrows():
        image_name = row['image']
        caption = row['caption']

        # Generate the image using the caption
        generated_image = pipe(caption).images[0]
        generated_images.append(generated_image)

        # The generated image reuses the source image's file name
        output_file_name = image_name
        output_path = os.path.join(output_dir, output_file_name)

        # Save the generated image to the output directory
        generated_image.save(output_path)
        print(f"Saved generated image: {output_path}")

        # Calculate and print the CLIP score
        clip_score = calculate_clip_score(generated_image, caption)
        print(f"CLIP score for {output_file_name}: {clip_score}")
        clip_scores.append(clip_score)

        image_info.append({
            'image_name': output_file_name,
            'caption': caption,
            'clip_score': clip_score
        })

    # An empty caption file has nothing to average or score; skip it rather
    # than failing on the metric computation.
    if not generated_images:
        print(f"No captions found in {dataset_path}; skipping metrics.")
        continue

    # Calculate the average CLIP score
    average_clip_score = np.mean(clip_scores)
    print(f"Average CLIP score for {dataset_path}: {average_clip_score}")

    # Calculate Inception Score using torch-fidelity
    inception_score, inception_score_std = _inception_score(generated_images)
    print(f"Inception Score: {inception_score} ± {inception_score_std}")

    # The Inception Score is stored as an extra trailing row whose per-image
    # columns stay empty; this layout is kept for existing CSV consumers.
    image_info.append({
        'inception_score': inception_score,
        'inception_score_std': inception_score_std
    })

    # Save the image info to a CSV file
    image_info_df = pd.DataFrame(image_info)
    image_info_csv_path = os.path.join(output_dir, 'image_info.csv')
    image_info_df.to_csv(image_info_csv_path, index=False)
    print(f"Saved image info to: {image_info_csv_path}")

    # Print the average CLIP score
    print(f"Average CLIP score for {output_dir}: {average_clip_score}")


2024-06-14 07:09:15.953467: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


Processing dataset: captions_20to40.csv, saving to: cat_gener_image_20to40


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/15_cat.jpg
CLIP score for 15_cat.jpg: 29.760818481445312


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/13_cat.jpg
CLIP score for 13_cat.jpg: 31.52541160583496


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/8_cat.jpg
CLIP score for 8_cat.jpg: 31.817960739135742


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/14_cat.jpg
CLIP score for 14_cat.jpg: 27.86254119873047


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/20_cat.jpg
CLIP score for 20_cat.jpg: 29.704147338867188


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/11_cat.jpg
CLIP score for 11_cat.jpg: 30.879558563232422


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/18_cat.jpg
CLIP score for 18_cat.jpg: 29.525375366210938


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/2_cat.jpg
CLIP score for 2_cat.jpg: 33.00474548339844


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/12_cat.jpg
CLIP score for 12_cat.jpg: 35.862117767333984


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_20to40/16_cat.jpg
CLIP score for 16_cat.jpg: 29.44328498840332
Average CLIP score for captions_20to40.csv: 30.938596153259276


Creating feature extractor "inception-v3-compat" with features ['logits_unbiased']
Extracting features from input1
Looking for samples non-recursivelty in "temp_images" with extensions png,jpg,jpeg
Found 10 samples
Processing samples                                                      


Inception Score: 1.0 ± 7.850462293418876e-17
Saved image info to: cat_gener_image_20to40/image_info.csv
Average CLIP score for cat_gener_image_20to40: 30.938596153259276
Processing dataset: captions_40to60.csv, saving to: cat_gener_image_40to60


Inception Score: 1.0 ± 7.850462293418876e-17


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_40to60/5_cat.jpg
CLIP score for 5_cat.jpg: 32.93206024169922


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_40to60/20_cat.jpg
CLIP score for 20_cat.jpg: 30.60572052001953


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: cat_gener_image_40to60/17_cat.jpg
CLIP score for 17_cat.jpg: 33.797950744628906


  0%|          | 0/50 [00:00<?, ?it/s]