In [1]:
import os
import pandas as pd
from diffusers import StableDiffusionPipeline
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import torchvision.transforms as transforms
from torch_fidelity import calculate_metrics
import shutil

# Disable tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the Stable Diffusion pipeline
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Load the CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define the paths to your datasets and the output directory
dataset_files = ["captions_20to40.csv","captions_40to60.csv", "captions_60to80.csv"]
output_dirs = ["dog_gener_image_20to40", "dog_gener_image_40to60", "dog_gener_image_60to80"]

# Define a function to calculate CLIP score
def calculate_clip_score(image, text):
    inputs = clip_processor(text=[text], images=image, return_tensors="pt")
    outputs = clip_model(**inputs)
    logits_per_image = outputs.logits_per_image
    return logits_per_image.item()

# Iterate over each dataset file
for dataset_path, output_dir in zip(dataset_files, output_dirs):
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    print(f"Processing dataset: {dataset_path}, saving to: {output_dir}")

    # Read the CSV file
    data = pd.read_csv(dataset_path)

    # Initialize lists for storing images and captions for Inception Score calculation
    generated_images = []

    # List to store image info
    image_info = []

    # List to store CLIP scores
    clip_scores = []

    # Iterate over each row in the CSV file
    for index, row in data.iterrows():
        image_name = row['image']
        caption = row['caption']

        # Generate the image using the caption
        generated_image = pipe(caption).images[0]
        generated_images.append(generated_image)

        # Create a unique output file name
        output_file_name = image_name
        output_path = os.path.join(output_dir, output_file_name)
        
        # Save the generated image to the output directory
        generated_image.save(output_path)
        print(f"Saved generated image: {output_path}")

        # Calculate and print the CLIP score
        clip_score = calculate_clip_score(generated_image, caption)
        print(f"CLIP score for {output_file_name}: {clip_score}")

        # Store the CLIP score
        clip_scores.append(clip_score)

        # Store the information in the list
        image_info.append({
            'image_name': output_file_name,
            'caption': caption,
            'clip_score': clip_score
        })

    # Calculate the average CLIP score
    average_clip_score = np.mean(clip_scores)
    print(f"Average CLIP score for {dataset_path}: {average_clip_score}")

    # Preprocess images for Inception Score calculation
    preprocessed_images = [transforms.ToTensor()(img).unsqueeze(0) for img in generated_images]
    preprocessed_images = torch.cat(preprocessed_images, dim=0).to("cuda")

    # Save preprocessed images to a temporary directory
    temp_dir = "temp_images"
    os.makedirs(temp_dir, exist_ok=True)
    for i, img in enumerate(preprocessed_images):
        img_pil = transforms.ToPILImage()(img)
        img_pil.save(os.path.join(temp_dir, f"img_{i}.png"))

    # Calculate Inception Score using torch-fidelity
    metrics = calculate_metrics(input1=temp_dir, input1_model="inception-v3", isc=True)
    inception_score = metrics["inception_score_mean"]
    inception_score_std = metrics["inception_score_std"]
    print(f"Inception Score: {inception_score} ± {inception_score_std}")

    # Clean up temporary directory
    shutil.rmtree(temp_dir)

    # Add Inception Score to the image info list
    image_info.append({
        'inception_score': inception_score,
        'inception_score_std': inception_score_std
    })

    # Save the image info to a CSV file
    image_info_df = pd.DataFrame(image_info)
    image_info_csv_path = os.path.join(output_dir, 'image_info.csv')
    image_info_df.to_csv(image_info_csv_path, index=False)
    print(f"Saved image info to: {image_info_csv_path}")

    # Print the average CLIP score
    print(f"Average CLIP score for {output_dir}: {average_clip_score}")


2024-06-14 06:31:12.075205: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


Processing dataset: captions_20to40.csv, saving to: dog_gener_image_20to40


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/5_dog.jpg
CLIP score for 5_dog.jpg: 28.338825225830078


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/15_dog.jpg
CLIP score for 15_dog.jpg: 37.8287353515625


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/17_dog.jpg
CLIP score for 17_dog.jpg: 31.945594787597656


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/13_dog.jpg
CLIP score for 13_dog.jpg: 27.944067001342773


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/14_dog.jpg
CLIP score for 14_dog.jpg: 26.746850967407227


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/12_dog.jpg
CLIP score for 12_dog.jpg: 35.50504684448242


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/1_dog.jpg
CLIP score for 1_dog.jpg: 31.08976936340332


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/18_dog.jpg
CLIP score for 18_dog.jpg: 27.53058624267578


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/3_dog.jpg
CLIP score for 3_dog.jpg: 33.88639831542969


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_20to40/4_dog.jpg
CLIP score for 4_dog.jpg: 27.706804275512695
Average CLIP score for captions_20to40.csv: 30.852267837524415


Creating feature extractor "inception-v3-compat" with features ['logits_unbiased']
Extracting features from input1
Looking for samples non-recursivelty in "temp_images" with extensions png,jpg,jpeg
Found 10 samples
Processing samples                                                      
Inception Score: 1.0 ± 1.0532500405730103e-16


Inception Score: 1.0 ± 1.0532500405730103e-16
Saved image info to: dog_gener_image_20to40/image_info.csv
Average CLIP score for dog_gener_image_20to40: 30.852267837524415
Processing dataset: captions_40to60.csv, saving to: dog_gener_image_40to60


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/16_dog.jpg
CLIP score for 16_dog.jpg: 36.41520309448242


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/15_dog.jpg
CLIP score for 15_dog.jpg: 37.72999954223633


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/8_dog.jpg
CLIP score for 8_dog.jpg: 33.179893493652344


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/6_dog.jpg
CLIP score for 6_dog.jpg: 34.43321990966797


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/20_dog.jpg
CLIP score for 20_dog.jpg: 30.435510635375977


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/10_dog.jpg
CLIP score for 10_dog.jpg: 35.11435317993164


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/14_dog.jpg
CLIP score for 14_dog.jpg: 31.939970016479492


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/11_dog.jpg
CLIP score for 11_dog.jpg: 33.861663818359375


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/5_dog.jpg
CLIP score for 5_dog.jpg: 30.77560806274414


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_40to60/17_dog.jpg
CLIP score for 17_dog.jpg: 31.758975982666016
Average CLIP score for captions_40to60.csv: 33.56443977355957


Creating feature extractor "inception-v3-compat" with features ['logits_unbiased']
Extracting features from input1
Looking for samples non-recursivelty in "temp_images" with extensions png,jpg,jpeg
Found 10 samples
Processing samples                                                      


Inception Score: 1.0 ± 6.080941944488117e-17
Saved image info to: dog_gener_image_40to60/image_info.csv
Average CLIP score for dog_gener_image_40to60: 33.56443977355957
Processing dataset: captions_60to80.csv, saving to: dog_gener_image_60to80


Inception Score: 1.0 ± 6.080941944488117e-17


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/2_dog.jpg
CLIP score for 2_dog.jpg: 35.28153991699219


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/9_dog.jpg
CLIP score for 9_dog.jpg: 33.252323150634766


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/11_dog.jpg
CLIP score for 11_dog.jpg: 37.32475280761719


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/5_dog.jpg
CLIP score for 5_dog.jpg: 28.98691749572754


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/4_dog.jpg
CLIP score for 4_dog.jpg: 34.53257751464844


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/12_dog.jpg
CLIP score for 12_dog.jpg: 34.28617477416992


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/13_dog.jpg
CLIP score for 13_dog.jpg: 29.612104415893555


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/15_dog.jpg
CLIP score for 15_dog.jpg: 38.98825454711914


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/16_dog.jpg
CLIP score for 16_dog.jpg: 36.5889892578125


  0%|          | 0/50 [00:00<?, ?it/s]

Saved generated image: dog_gener_image_60to80/17_dog.jpg
CLIP score for 17_dog.jpg: 31.337146759033203
Average CLIP score for captions_60to80.csv: 34.01907806396484


Creating feature extractor "inception-v3-compat" with features ['logits_unbiased']
Extracting features from input1
Looking for samples non-recursivelty in "temp_images" with extensions png,jpg,jpeg
Found 10 samples
                                                                        

Inception Score: 1.0 ± 7.850462293418876e-17
Saved image info to: dog_gener_image_60to80/image_info.csv
Average CLIP score for dog_gener_image_60to80: 34.01907806396484


Processing samples
Inception Score: 1.0 ± 7.850462293418876e-17
