In [3]:
!pip install datasets
!pip install diffusers transformers torchvision

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash

In [3]:
import os
from datasets import Dataset, DatasetDict
from PIL import Image
from torchvision import transforms
import pandas as pd

# Define the base path for your datasets
base_image_folder = 'dataset'
base_caption_folder = 'dataset'

# Define categories
categories = ['cat', 'dog', 'swan']

def load_custom_dataset(base_image_folder, base_caption_folder, category):
    image_folder = os.path.join(base_image_folder, category, 'selected_images')
    caption_file = os.path.join(base_caption_folder, category, 'selected_images_captions', 'captions.csv')
    
    data = pd.read_csv(caption_file)
    images = [os.path.join(image_folder, img) for img in data['image']]
    captions = data['caption'].tolist()
    
    return images, captions  # Return images and captions as lists

# Load datasets for each category
train_datasets = {}
test_datasets = {}
for category in categories:
    images, captions = load_custom_dataset(base_image_folder, base_caption_folder, category)
    
    # Split into train and test (assuming 30 train, 10 test for each category)
    train_datasets[category] = Dataset.from_dict({"image": images[:30], "caption": captions[:30]})
    test_datasets[category] = Dataset.from_dict({"image": images[30:], "caption": captions[30:]})

# Concatenate train datasets into one
train_dataset = DatasetDict({"train": train_datasets['cat'].concatenate(train_datasets['dog'], train_datasets['swan'])})

# Concatenate test datasets into one
test_dataset = DatasetDict({"test": test_datasets['cat'].concatenate(test_datasets['dog'], test_datasets['swan'])})

# Preprocess function
preprocess = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

def transform(examples):
    images = [preprocess(Image.open(image).convert("RGB")) for image in examples["image"]]
    return {"image": images, "caption": examples["caption"]}

# Apply the transform
train_dataset = train_dataset.map(transform, batched=True)
test_dataset = test_dataset.map(transform, batched=True)



AttributeError: 'Dataset' object has no attribute 'concatenate'

In [None]:
from diffusers import StableDiffusionPipeline
from transformers import AdamW
from torch.utils.data import DataLoader

# Load the pre-trained model
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Setup optimizer
optimizer = AdamW(pipe.unet.parameters(), lr=5e-6)

# Create DataLoader
train_dataloader = DataLoader(train_dataset["train"], batch_size=4, shuffle=True)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    for step, batch in enumerate(train_dataloader):
        images, captions = batch["image"], batch["caption"]
        images = images.to("cuda")

        # Forward pass
        outputs = pipe(images, captions)
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if step % 10 == 0:
            print(f"Epoch {epoch} Step {step} Loss {loss.item()}")

# Save the fine-tuned model
pipe.save_pretrained("fine-tuned-stable-diffusion")


In [None]:
# Load the fine-tuned model
pipe = StableDiffusionPipeline.from_pretrained("fine-tuned-stable-diffusion", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Generate images from the test captions for each category
generated_images = []
test_captions = []
for i, category in enumerate(categories):
    start_idx = i * 10
    end_idx = start_idx + 10
    category_test_dataset = test_dataset["test"].select(range(start_idx, end_idx))
    
    for j, caption in enumerate(category_test_dataset["caption"]):
        image = pipe(caption).images[0]
        image_path = f"{category}_generated_image_{j}.png"
        image.save(image_path)
        generated_images.append(image_path)
        test_captions.append(caption)


In [None]:
import clip
import torch
from PIL import Image

# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def calculate_clip_score(image_paths, captions):
    clip_scores = []
    
    for image_path, caption in zip(image_paths, captions):
        image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
        text = clip.tokenize([caption]).to(device)

        # Calculate feature vectors
        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text)

        # Calculate cosine similarity
        cosine_similarity = torch.nn.functional.cosine_similarity(image_features, text_features).item()
        clip_scores.append(cosine_similarity)
    
    return clip_scores

# Calculate CLIP scores
clip_scores = calculate_clip_score(generated_images, test_captions)
print("CLIP Scores:", clip_scores)


In [None]:
import torch
import torchvision.transforms as transforms
from torchvision.models import inception_v3
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
from scipy.stats import entropy
from PIL import Image

# Custom dataset for loading generated images
class GeneratedImageDataset(Dataset):
    def __init__(self, image_paths):
        self.image_paths = image_paths
        self.transform = transforms.Compose([
            transforms.Resize((299, 299)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")
        return self.transform(image)

def inception_score(image_paths, batch_size=32, splits=10):
    dataset = GeneratedImageDataset(image_paths)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Load the InceptionV3 model
    inception_model = inception_v3(pretrained=True, transform_input=False).to(device)
    inception_model.eval()

    preds = []
    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            pred = inception_model(batch)
            preds.append(F.softmax(pred, dim=1).cpu().numpy())
    
    preds = np.concatenate(preds, axis=0)

    # Compute the Inception Score
    split_scores = []
    N = preds.shape[0]
    for k in range(splits):
        part = preds[k * (N // splits): (k + 1) * (N // splits), :]
        py = np.mean(part, axis=0)
        scores = []
        for i in range(part.shape[0]):
            pyx = part[i, :]
            scores.append(entropy(pyx, py))
        split_scores.append(np.exp(np.mean(scores)))

    return np.mean(split_scores), np.std(split_scores)

# Calculate Inception Score
is_mean, is_std = inception_score(generated_images)
print("Inception Score: Mean =", is_mean, "Std =", is_std)
