In [None]:
import gc
import json
import locale
from pathlib import Path

import torch
from PIL import Image
from diffusers import DiffusionPipeline, AutoencoderKL
from huggingface_hub import interpreter_login, snapshot_download, whoami, upload_folder, create_repo
from transformers import AutoProcessor, BlipForConditionalGeneration

from lora_utils import create_image_grid
from train_dreambooth_lora_sdxl import save_model_card

In [None]:
# Hugging Face dataset repository holding the training photos, and the local
# folder the notebook works in.
HUGGINGFACE_DATASET = "mtailanian/tio"
DATA_DIR = "./tio/"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Make locale.getpreferredencoding() report UTF-8 for the rest of the session.
# NOTE(review): presumably a workaround for hosted runtimes whose default locale
# is not UTF-8 -- confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

# Download data

In [None]:
# Download the dataset repository into DATA_DIR, skipping the .gitattributes file.
snapshot_download(
    repo_id=HUGGINGFACE_DATASET,
    repo_type="dataset",
    local_dir=DATA_DIR,
    ignore_patterns=".gitattributes",
)

## Show the data

In [None]:
# Accepted image suffixes. They are compared against the lower-cased file suffix,
# so ".JPG", ".Png", etc. are picked up without listing every spelling.
image_extensions = [".png", ".jpg", ".jpeg"]

# Sorted so the preview and the captions file come out in the same order on every
# run; iterating the folder once guarantees each file is listed exactly once.
images_paths = sorted(
    str(path)
    for path in Path(DATA_DIR).iterdir()
    if path.is_file() and path.suffix.lower() in image_extensions
)
assert images_paths, f"No images found in {DATA_DIR!r}; run the download cell first."
images = [Image.open(path) for path in images_paths]

# Preview at most five images, and never ask for more columns than there are images.
num_images_to_preview = min(5, len(images))
create_image_grid(images[:num_images_to_preview], 1, num_images_to_preview)

# Add captions for each image (automatically or not)

Al Tío le vamos a asignar un Token especial.
Idealmente sería uno que se use muy poco en el modelo actual.
Además es mejor que sea corto. Las palabras largas se dividen en pedazos más chicos al tokenizarse, y cada pedacito puede tener algún significado asociado.

Y queremos agregar un concepto nuevo. El concepto Tío. Sin prejuicios.

Vamos a crear un archivo con una descripción de cada imagen. Cada descripción tendrá un prefijo que incluya este token, seguido de un texto autogenerado con un modelo de descripción de imágenes: BLIP.

## Crear captions automáticos

In [None]:
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
# Half precision only when running on the GPU; on the CPU fallback use float32.
# NOTE(review): float16 inference on CPU is assumed to be unsupported or very slow
# for this model -- confirm against the installed torch version.
BLIP_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

blip_processor = AutoProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(
    BLIP_CHECKPOINT, torch_dtype=BLIP_DTYPE
).to(DEVICE)


def caption_image(input_image, max_length=50):
    """Generate a caption for one image with the BLIP model loaded above.

    Args:
        input_image: Image passed to the BLIP processor (the notebook passes
            PIL images).
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        The first decoded caption, with special tokens stripped.
    """
    # The processor output is moved to the model's device and dtype so the pixel
    # values match the weights.
    inputs = blip_processor(images=input_image, return_tensors="pt").to(DEVICE, BLIP_DTYPE)
    generated_ids = blip_model.generate(pixel_values=inputs.pixel_values, max_length=max_length)
    return blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

## Crear archivo de captions, incluyendo el Token elegido para representar al Tío

In [None]:
# Token that stands for the subject in every caption. The training and inference
# cells spell the same token out literally, so keep them in sync when changing it.
TIO_TOKEN = "TOK"

caption_prefix = f"A photo of {TIO_TOKEN}. "
metadata_path = Path(DATA_DIR) / "metadata.jsonl"

# One JSON object per line: {"file_name": ..., "prompt": ...}.
with open(metadata_path, "w", encoding="utf-8") as outfile:
    for path, img in zip(images_paths, images):
        # Keep only the first line of the generated caption.
        caption = caption_prefix + caption_image(img).split("\n")[0]
        # Path(...).name extracts the file name with either path separator.
        entry = {"file_name": Path(path).name, "prompt": caption}
        json.dump(entry, outfile)
        outfile.write("\n")

### Check images and descriptions

In [None]:
# Read the captions file back: one JSON object per line.
with (Path(DATA_DIR) / "metadata.jsonl").open("r") as metadata_file:
    data = [json.loads(line) for line in metadata_file.read().splitlines()]

# For the first five entries, print the file name and prompt, then show a
# 256x256 thumbnail of the image.
for record in data[:5]:
    thumbnail = Image.open(Path(DATA_DIR) / record["file_name"]).resize((256, 256))
    print(record["file_name"], record["prompt"])
    display(thumbnail)

# Clean up memory and train the LoRA

In [None]:
# Drop the captioning model and processor before training. Popping the names from
# the namespace (rather than `del`) keeps this cell re-runnable: a second run
# finds nothing to remove instead of raising NameError.
for _name in ("blip_processor", "blip_model"):
    globals().pop(_name, None)
gc.collect()
torch.cuda.empty_cache()

In [None]:
%%bash
# Set up Accelerate with its default configuration before launching training.
# NOTE(review): presumably writes the default config file non-interactively -- confirm.
accelerate config default

In [None]:
%%bash
# Train the SDXL DreamBooth LoRA on the captioned photos.
# - --dataset_name must point at the folder the data was downloaded into (DATA_DIR).
# - --caption_column must match the caption key written to metadata.jsonl ("prompt").
# - --instance_prompt must use the same token as TIO_TOKEN ("TOK").
# - --output_dir must match model_name in the upload cells.
# NOTE(review): --checkpointing_steps (717) is larger than --max_train_steps (500),
# so presumably no intermediate checkpoint is written -- confirm this is intended.
accelerate launch train_dreambooth_lora_sdxl.py \
  --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \
  --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \
  --dataset_name="tio" \
  --output_dir="output" \
  --caption_column="prompt" \
  --mixed_precision="fp16" \
  --instance_prompt="a photo of TOK" \
  --resolution=256 \
  --train_batch_size=1 \
  --gradient_accumulation_steps=3 \
  --gradient_checkpointing \
  --learning_rate=1e-4 \
  --snr_gamma=5.0 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --use_8bit_adam \
  --max_train_steps=500 \
  --checkpointing_steps=717 \
  --seed="0"

# Login to HuggingFace to save the model

In [None]:
# Log in to the Hugging Face Hub; the upload cells below need an authenticated session.
# NOTE(review): presumably prompts for an access token in the cell output -- the token
# needs write permission for the upload to succeed; confirm.
interpreter_login()

In [None]:
# Local folder to upload; must match --output_dir of the training command.
model_name = "output"
# whoami() without arguments uses the credentials stored by the login step, so no
# machine-specific cache path has to be passed.
username = whoami()["name"]
# Target repository on the Hub: "<user>/<model_name>".
repo_id = f"{username}/{model_name}"

# Push to the hub 🔥

In [None]:
# Create the repository if it does not exist yet (exist_ok=True makes re-runs safe)
# and keep the repo id reported back by the Hub.
repo_id = create_repo(repo_id, exist_ok=True).repo_id

# Write the model card into the local output folder.
# Change the params below according to your training arguments: base model, VAE and
# instance prompt must match the values passed to the training command.
save_model_card(
    repo_id = repo_id,
    images=[],
    base_model="stabilityai/stable-diffusion-xl-base-1.0",
    train_text_encoder=False,
    instance_prompt="a photo of TOK",
    validation_prompt=None,
    repo_folder=model_name,
    vae_path="madebyollin/sdxl-vae-fp16-fix",
    use_dora=False,
)

# Upload the output folder, leaving out anything matching "step_*" or "epoch_*".
upload_folder(
    repo_id=repo_id,
    folder_path=model_name,
    commit_message="End of training",
    ignore_patterns=["step_*", "epoch_*"],
)

# Inference

In [None]:
# Load the same VAE that was passed to training, in half precision.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
# Base SDXL pipeline (fp16 weight variant, safetensors) with that VAE plugged in.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    vae=vae,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True
)
# Apply the trained LoRA weights from the Hub repository; `repo_id` comes from the
# upload cells above, so those must have been run first.
pipe.load_lora_weights(repo_id)
# Move the pipeline to the GPU; assigning to `_` keeps the pipeline repr out of the
# cell output.
_ = pipe.to("cuda")

In [None]:
# The prompt must contain the token used during training ("TOK").
prompt = "a photo of TOK with a hat in Paris with the Eiffel tower"

# Seeded generator so re-running the cell reproduces the same image; change the
# seed to sample a different one.
generator = torch.Generator(device="cpu").manual_seed(0)
image = pipe(prompt=prompt, num_inference_steps=25, generator=generator).images[0]

display(image)