In [None]:
import torch

# Report whether a CUDA device is visible and, if so, which one.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Only query the device name when CUDA is present: asking for device 0
    # on a CPU-only runtime fails instead of returning a name.
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; switch the runtime to a GPU before training.")


True
Tesla T4


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so files stored there can be
# reached under the mount point below.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

from diffusers import StableDiffusionPipeline, DDPMScheduler
from transformers import CLIPTokenizer, CLIPTextModel
from peft import LoraConfig, get_peft_model

# --- Run configuration ------------------------------------------------------
MODEL_ID = "runwayml/stable-diffusion-v1-5"  # checkpoint id passed to from_pretrained
DATASET_DIR = "/content/drive/MyDrive/resized_images_dataset"  # folder scanned for images
CAPTION = "handwritten text"  # the single prompt paired with every training image
# NOTE(review): relative path, so it resolves against the notebook's working
# directory; on Colab that is presumably ephemeral storage -- consider pointing
# this at the mounted Drive so the adapter survives the session.
OUTPUT_DIR = "handwriting_lora"

EPOCHS = 5
BATCH_SIZE = 2
LR = 1e-4
SEED = 42  # fixed seed so shuffling, sampled noise and timesteps are repeatable

# Seed torch's global RNGs before anything random is drawn.
torch.manual_seed(SEED)

# Prefer the GPU when present; half precision is only selected on CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.float16 if device.type == "cuda" else torch.float32

os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Using device:", device)

class HandwritingDataset(Dataset):
    """Image-only dataset over the PNG/JPEG files found directly inside a folder.

    Files are matched case-insensitively by extension and stored in sorted
    order so that a given index always refers to the same file, independent
    of the order in which the filesystem lists the directory.

    Each item is the image converted to RGB, resized to 512x512, turned into a
    tensor and normalised with mean 0.5 / std 0.5 (i.e. ``(x - 0.5) / 0.5``).
    """

    # Extensions (lower-case) that count as training images.
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")

    def __init__(self, folder):
        """Collect the image paths in ``folder`` and build the transform.

        Args:
            folder: Directory to scan (non-recursively) for image files.

        Raises:
            ValueError: If ``folder`` contains no file with a supported
                image extension.
        """
        # Sort: os.listdir returns entries in arbitrary order.
        self.files = sorted(
            os.path.join(folder, name)
            for name in os.listdir(folder)
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        )

        if not self.files:
            raise ValueError("❌ Dataset folder is empty")

        self.transform = transforms.Compose([
            transforms.Resize((512, 512)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])

    def __len__(self):
        """Return the number of image files found."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load, convert and transform the image at position ``idx``."""
        # The context manager closes the underlying file as soon as the RGB
        # copy has been transformed, instead of leaving that to the GC.
        with Image.open(self.files[idx]) as image:
            return self.transform(image.convert("RGB"))

# Build the dataset and a shuffling loader over it.
dataset = HandwritingDataset(DATASET_DIR)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    # Pinned host memory is only useful for host-to-GPU copies, so do not
    # request it when the script has fallen back to the CPU.
    pin_memory=(device.type == "cuda")
)

# Load the whole Stable Diffusion pipeline once; every component used for
# training is taken from it.
pipe = StableDiffusionPipeline.from_pretrained(
    MODEL_ID,
    safety_checker=None,
    torch_dtype=dtype
).to(device)

pipe.enable_attention_slicing()
pipe.enable_vae_slicing()

vae = pipe.vae
unet = pipe.unet

# Reuse the tokenizer and text encoder the pipeline already loaded (and moved
# to `device` in `dtype`) rather than loading a second copy of the same
# weights onto the GPU.
tokenizer = pipe.tokenizer
text_encoder = pipe.text_encoder

# Training-time noise scheduler built from the pipeline scheduler's config.
scheduler = DDPMScheduler.from_config(pipe.scheduler.config)

# Only the LoRA adapter is trained; the VAE and text encoder stay frozen.
vae.requires_grad_(False)
text_encoder.requires_grad_(False)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["to_q", "to_v"],
    lora_dropout=0.05,
    bias="none"
)

unet = get_peft_model(unet, lora_config)

# Keep the trainable weights in float32 even when the base model is float16,
# so optimizer updates are not performed in half precision. This is a no-op
# when the parameters are already float32.
trainable_params = [p for p in unet.parameters() if p.requires_grad]
for param in trainable_params:
    param.data = param.data.to(torch.float32)

unet.train()

# Hand the optimizer only the parameters that will actually receive gradients.
optimizer = torch.optim.AdamW(trainable_params, lr=LR)

print("🚀 Starting LoRA fine-tuning...")

# The caption is the same for every image, so tokenise and encode it once
# here instead of on every training step. Result has a leading batch dim of 1.
with torch.no_grad():
    caption_ids = tokenizer(
        [CAPTION],
        padding="max_length",
        truncation=True,
        max_length=77,
        return_tensors="pt"
    ).input_ids.to(device)
    caption_embedding = text_encoder(caption_ids).last_hidden_state

# Read the latent scaling factor from the VAE's own config; fall back to the
# previously hard-coded value if the config does not expose one.
latent_scale = getattr(vae.config, "scaling_factor", 0.18215)

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")

    # Accumulate the loss over the epoch so the reported number is the epoch
    # mean rather than whatever the final (random-timestep) batch produced.
    loss_sum = 0.0
    step_count = 0

    for images in tqdm(dataloader):
        images = images.to(device, dtype=dtype)
        batch_size = images.shape[0]  # the last batch may be smaller

        # Encode images to latents; no gradients flow into the frozen VAE.
        with torch.no_grad():
            latents = vae.encode(images).latent_dist.sample()
            latents = latents * latent_scale

        # Sample the noise and one random timestep per example.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0,
            scheduler.config.num_train_timesteps,
            (batch_size,),
            device=device
        )

        noisy_latents = scheduler.add_noise(latents, noise, timesteps)

        # Repeat the cached caption embedding to match the current batch.
        encoder_hidden_states = caption_embedding.repeat(batch_size, 1, 1)

        noise_pred = unet(
            noisy_latents,
            timesteps,
            encoder_hidden_states
        ).sample

        # Compute the loss in float32 so the reduction is not done in half
        # precision when the model runs in float16.
        loss = torch.nn.functional.mse_loss(noise_pred.float(), noise.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        step_count += 1

    print(f"Mean loss: {loss_sum / max(step_count, 1):.4f}")

# Write the adapter via the PEFT-wrapped model's save_pretrained.
unet.save_pretrained(OUTPUT_DIR)

print("\n✅ LoRA fine-tuning complete")
print(f"📁 Saved to: {OUTPUT_DIR}")
print("🎨 Ready for handwriting generation!")


Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .
  deprecate(


🚀 Starting LoRA fine-tuning...

Epoch 1/5


100%|██████████| 770/770 [09:18<00:00,  1.38it/s]


Loss: 0.0030

Epoch 2/5


100%|██████████| 770/770 [09:03<00:00,  1.42it/s]


Loss: 0.0038

Epoch 3/5


100%|██████████| 770/770 [09:02<00:00,  1.42it/s]


Loss: 0.0572

Epoch 4/5


100%|██████████| 770/770 [09:03<00:00,  1.42it/s]


Loss: 0.0044

Epoch 5/5


100%|██████████| 770/770 [09:03<00:00,  1.42it/s]


Loss: 0.0928

✅ LoRA fine-tuning complete
📁 Saved to: handwriting_lora
🎨 Ready for handwriting generation!
