In [1]:
from diffusers import StableDiffusionPipeline
import torch

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float32  # Metal backend prefers float32
)
pipe = pipe.to("mps")  # for Mac

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [3]:
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler, DDPMScheduler
from diffusers import UNet2DConditionModel
from transformers import CLIPTokenizer, CLIPTextModel
from peft import get_peft_model, LoraConfig, TaskType
from PIL import Image
import torch
import os
from torchvision import transforms

# Set device
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load pretrained SD 1.4 base
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float32,
)
pipe = pipe.to(device)

# Access model parts
unet = pipe.unet
text_encoder = pipe.text_encoder
tokenizer = pipe.tokenizer

# Freeze base model
unet.requires_grad_(False)
text_encoder.requires_grad_(False)

# Apply LoRA
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["to_q", "to_k", "to_v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION,
)

unet = get_peft_model(unet, lora_config)

# Dummy text prompt and image transform
prompt = "photo of a data analyst"
image_folder = "./Training dataset"
transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

# Training loop (simplified single-batch example)
optimizer = torch.optim.Adam(unet.parameters(), lr=5e-5)

for epoch in range(10):
    for image_name in os.listdir(image_folder):
        image_path = os.path.join(image_folder, image_name)
        image = Image.open(image_path).convert("RGB")
        pixel_values = transform(image).unsqueeze(0).to(device)

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True, max_length=77)
        input_ids = inputs.input_ids.to(device)
        encoder_hidden_states = text_encoder(input_ids)[0]

        # Get noise + timesteps
        noise = torch.randn_like(pixel_values)
        timesteps = torch.randint(0, 1000, (1,), device=device).long()
        scheduler = DDPMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
        noisy_images = scheduler.add_noise(pixel_values, noise, timesteps)

        # Predict noise
        noise_pred = unet(noisy_images, timesteps, encoder_hidden_states).sample

        loss = torch.nn.functional.mse_loss(noise_pred, noise)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch}, Loss: {loss.item()}")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

TypeError: UNet2DConditionModel.forward() got an unexpected keyword argument 'input_ids'