In [1]:
# Install required libraries
!pip install diffusers transformers datasets accelerate torch torchvision tensorboard

# Import necessary modules
import os
import pandas as pd
from datasets import Dataset
from PIL import Image
import torch
from torch.utils.data import DataLoader
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, AutoencoderKL, DDPMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from accelerate import Accelerator
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from torch.optim.lr_scheduler import CosineAnnealingLR


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [3]:
# Define file paths
image_folder = "/content/images"  # Path to your image folder
csv_file = "/content/test_cleaned_object_title_img_id.csv"  # Path to your CSV file

# Load and process dataset
data = pd.read_csv(csv_file)

# Function to process data
def process_data(row):
    image_path = os.path.join(image_folder, f"{row['Image ID']}.jpg")
    return {
        "image": Image.open(image_path).convert("RGB"),
        "text": row["Title"]
    }

# Create dataset
dataset = Dataset.from_pandas(data)
dataset = dataset.map(process_data, remove_columns=["Object Number", "Image ID", "Title"])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [4]:
# Define image preprocessing and text tokenization
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
image_transforms = Compose([
    Resize(512, interpolation=Image.BICUBIC),
    CenterCrop(512),
    ToTensor(),
    Normalize([0.5], [0.5])
])

# Preprocess dataset
def preprocess_data(example):
    example["pixel_values"] = image_transforms(example["image"])
    example["input_ids"] = tokenizer(example["text"], truncation=True, max_length=77, padding="max_length")["input_ids"]
    return example

dataset = dataset.map(preprocess_data, remove_columns=["image"])
dataset.set_format(type="torch", columns=["pixel_values", "input_ids"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [5]:
# Load pretrained Stable Diffusion components
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
text_encoder = CLIPTextModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="text_encoder")
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
scheduler = DDPMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")

# Define training parameters
training_args = {
    "output_dir": "/content/fine_tuned_sd",
    "num_train_epochs": 10,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-5,
    "lr_scheduler_type": "cosine",
    "save_steps": 500,
    "save_total_limit": 2,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "push_to_hub": False,
    "report_to": "tensorboard",
    "mixed_precision": "fp16",
}

# Set up dataloader and optimizer
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
optimizer = torch.optim.AdamW(unet.parameters(), lr=training_args["learning_rate"])
scheduler_cosine = CosineAnnealingLR(optimizer, T_max=len(dataloader) * training_args["num_train_epochs"])

# Accelerator setup
accelerator = Accelerator(mixed_precision=training_args["mixed_precision"])
unet, optimizer, dataloader = accelerator.prepare(unet, optimizer, dataloader)
text_encoder = text_encoder.to(accelerator.device)

vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [6]:
# Move VAE to the same device as the rest of the pipeline
vae = vae.to(accelerator.device)

# Training loop
for epoch in range(training_args["num_train_epochs"]):
    unet.train()
    for step, batch in enumerate(dataloader):
        pixel_values = batch["pixel_values"].to(accelerator.device)  # Move pixel_values to the correct device
        input_ids = batch["input_ids"].to(accelerator.device)  # Move input_ids to the correct device

        # Normalize pixel values to [-1, 1]
        pixel_values = pixel_values * 2.0 - 1.0

        # Encode pixel values into latent space
        latents = vae.encode(pixel_values).latent_dist.sample()  # Now on the same device
        latents = latents * 0.18215

        # Add noise to latents
        noise = torch.randn_like(latents).to(accelerator.device)
        timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (latents.size(0),), device=accelerator.device).long()
        noisy_latents = scheduler.add_noise(latents, noise, timesteps)

        # Generate text encoder hidden states
        encoder_hidden_states = text_encoder(input_ids).last_hidden_state  # Now on the correct device

        # Get predicted noise from UNet
        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states=encoder_hidden_states).sample

        # Compute the loss
        loss = torch.nn.functional.mse_loss(model_pred, noise)

        # Backpropagation
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

        if step % training_args["logging_steps"] == 0:
            print(f"Epoch {epoch}, Step {step}, Loss: {loss.item()}")

    # Step the learning rate scheduler
    scheduler_cosine.step()

    # Save model state after each epoch
    unet.save_pretrained(f"{training_args['output_dir']}/epoch_{epoch}")

# Save the final model
unet.save_pretrained(training_args["output_dir"])


Epoch 0, Step 0, Loss: 0.115119569003582
Epoch 0, Step 10, Loss: 0.07886326313018799
Epoch 0, Step 20, Loss: 0.0027358378283679485
Epoch 0, Step 30, Loss: 0.008272948674857616
Epoch 0, Step 40, Loss: 0.14874137938022614
Epoch 0, Step 50, Loss: 0.014814546331763268
Epoch 0, Step 60, Loss: 0.1888720542192459
Epoch 0, Step 70, Loss: 0.3650081753730774
Epoch 0, Step 80, Loss: 0.03545592725276947
Epoch 0, Step 90, Loss: 0.004483832977712154
Epoch 1, Step 0, Loss: 0.08628613501787186
Epoch 1, Step 10, Loss: 0.018793253228068352
Epoch 1, Step 20, Loss: 0.07949702441692352
Epoch 1, Step 30, Loss: 0.10758841782808304
Epoch 1, Step 40, Loss: 0.23152479529380798
Epoch 1, Step 50, Loss: 0.18028989434242249
Epoch 1, Step 60, Loss: 0.165002703666687
Epoch 1, Step 70, Loss: 0.17273013293743134
Epoch 1, Step 80, Loss: 0.19940492510795593
Epoch 1, Step 90, Loss: 0.0017771918792277575
Epoch 2, Step 0, Loss: 0.0803208202123642
Epoch 2, Step 10, Loss: 0.28991860151290894
Epoch 2, Step 20, Loss: 0.00556933

In [11]:
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, AutoencoderKL, DDPMScheduler
from transformers import CLIPTextModel, CLIPTokenizer, CLIPFeatureExtractor
import torch

# Define the path to the fine-tuned UNet directory
fine_tuned_unet_path = "/content/fine_tuned_sd"

# Load pre-trained components
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae").to("cuda")
text_encoder = CLIPTextModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="text_encoder").to("cuda")
scheduler = DDPMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")

# Load the fine-tuned UNet
unet = UNet2DConditionModel.from_pretrained(fine_tuned_unet_path).to("cuda")

# Create a StableDiffusionPipeline using the fine-tuned UNet
fine_tuned_pipeline = StableDiffusionPipeline(
    vae=vae,
    text_encoder=text_encoder,
    unet=unet,
    scheduler=scheduler,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    safety_checker=None  # Disable safety checker if not needed
).to("cuda")

# Generate an image
description = "Drinking-cup; glass; green and red; covered with various scenes representing the death of King Lycurgus; rim mounted with silver-gilt band of leaf ornament, plus silver-gilt foot with open-work vine leaves."
generated_image = fine_tuned_pipeline(description).images[0]

# Save and display the generated image
generated_image.save("/content/generated_artifact_image.jpg")
print("Image saved at /content/generated_artifact_image.jpg")


You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/50 [00:00<?, ?it/s]

Image saved at /content/generated_artifact_image.jpg
