# Fine-Tune Stable Diffusion with Flickr8k using LoRA

In [3]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import CLIPTokenizer
from diffusers import StableDiffusionPipeline, DDPMScheduler
from peft import get_peft_model, LoraConfig
from datasets import Dataset as HFDataset
from tqdm import tqdm

In [4]:
import torch

# Check if GPU is available
print("CUDA Available:", torch.cuda.is_available())

# Show GPU name if available
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("Total GPU Memory (MB):", torch.cuda.get_device_properties(0).total_memory / (1024**2))
else:
    print("No GPU found.")


CUDA Available: True
GPU Name: NVIDIA GeForce GTX 1050 Ti
Total GPU Memory (MB): 4095.875


In [5]:
# === Config ===
image_dir = "C:/Users/molavade.s/Latent_Diffusion_model/flickr8k/Images"
captions_file = "C:/Users/molavade.s/Latent_Diffusion_model/flickr8k/captions.txt"
pretrained_model = "CompVis/stable-diffusion-v1-4"
output_dir = "./sd-fine-tuned-lora"
image_size = 512
batch_size = 2
num_epochs = 3
lr = 1e-5
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
# === Load Captions ===
def load_captions(captions_path):
    pairs = []
    with open(captions_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("image,caption"):
                continue
            try:
                img, caption = line.split(',', 1)
                img = img.split('#')[0]
                full_img_path = os.path.join(image_dir, img)
                if os.path.exists(full_img_path):
                    pairs.append({'image': full_img_path, 'caption': caption.strip()})
                else:
                    print(f"Image file not found: {full_img_path}")
            except Exception as e:
                print(f"Error processing line: {line} – {e}")
    return pairs


# Load and convert to HuggingFace Dataset
pairs = load_captions(captions_file)[:100]  # Limit to first 100
from datasets import Dataset as HFDataset
hf_dataset = HFDataset.from_list(pairs)



In [7]:
# === Tokenizer and Transform ===
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

In [8]:
# === Custom Dataset ===
class FlickrDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        example = self.data[idx]
        image = Image.open(example['image']).convert('RGB')
        pixel_values = transform(image)
        text_inputs = tokenizer(example['caption'], padding='max_length', truncation=True, max_length=77, return_tensors='pt')
        return {
            'pixel_values': pixel_values,
            'input_ids': text_inputs.input_ids.squeeze(0),
            'attention_mask': text_inputs.attention_mask.squeeze(0)
        }

dataset = FlickrDataset(hf_dataset)

In [9]:
# === Load Pipeline and Freeze ===
pipe = StableDiffusionPipeline.from_pretrained(pretrained_model, torch_dtype=torch.float16 if device=="cuda" else torch.float32)
pipe.to(device)
pipe.vae.requires_grad_(False)
pipe.text_encoder.requires_grad_(False)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), ep

In [10]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["to_q", "to_k", "to_v", "to_out.0"],
    bias="none"
)

pipe.unet = get_peft_model(pipe.unet, lora_config)


In [11]:
# === Training ===
optimizer = torch.optim.Adam(pipe.unet.parameters(), lr=lr)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)

for epoch in range(num_epochs):
    pipe.unet.train()
    for batch in tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        images = batch['pixel_values'].to(device, dtype=torch.float16)
        latents = pipe.vae.encode(images).latent_dist.sample() * 0.18215
        input_ids = batch['input_ids'].to(device)
        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, pipe.scheduler.config.num_train_timesteps, (latents.shape[0],), device=device).long()
        noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)
        with torch.no_grad():
            encoder_hidden_states = pipe.text_encoder(input_ids)[0].to(dtype=torch.float16)
        model_pred = pipe.unet(noisy_latents, timesteps, encoder_hidden_states).sample
        loss = torch.nn.functional.mse_loss(model_pred, noise)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}: Loss = {loss.item()}')

Epoch 1/3: 100%|██████████| 50/50 [49:14<00:00, 59.10s/it]


Epoch 1: Loss = 0.1844482421875


Epoch 2/3: 100%|██████████| 50/50 [48:35<00:00, 58.31s/it]


Epoch 2: Loss = 0.018707275390625


Epoch 3/3: 100%|██████████| 50/50 [49:04<00:00, 58.90s/it]


Epoch 3: Loss = 0.1700439453125


In [12]:
# === Save final LoRA fine-tuned UNet ===
pipe.unet.save_pretrained(output_dir)
print(f"Fine-tuned U-Net saved to: {output_dir}")


Fine-tuned U-Net saved to: ./sd-fine-tuned-lora


In [1]:
from diffusers import StableDiffusionPipeline
import torch
from peft import PeftModel, LoraConfig

# Load original SD pipeline
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
pipe.to("cuda")

# Load your LoRA fine-tuned U-Net
from peft import PeftModel
pipe.unet = PeftModel.from_pretrained(pipe.unet, "./sd-fine-tuned-lora")
pipe.unet.eval()

# Enable faster generation
pipe.enable_attention_slicing()


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [2]:
# Prompt to test
prompt = "A magical forest with glowing mushrooms and a small waterfall"

# Generate
with torch.autocast("cuda"):
    image = pipe(prompt=prompt, guidance_scale=7.5).images[0]

# Show image (Jupyter)
image.show()

# Optionally save
image.save("generated_output.png")

  0%|          | 0/50 [00:00<?, ?it/s]