# Dreambooth training 
### We recommend the [official Diffusers tutorial](https://huggingface.co/docs/diffusers/training/dreambooth), which helps you get familiar with this type of fine-tuning and explains the environment setup and the parameters used in detail.
### Our fine-tuned models and training data are available on OneDrive.

## Resize selected images (shorter side to 256 px) and save random 512x512 crops for augmentation.

In [2]:
from PIL import Image
import os
from torchvision.transforms import functional as F
from torchvision.transforms import InterpolationMode
from torchvision.transforms import transforms
import random
import json

In [3]:
# Source folder (one sub-folder per image set) and destination folder for the processed images.
folder = "/home/ethan/DiffusionResearch/Sim2RealDiffusion/training/solid_pushblock_training_data_45"
folder_cropped = "/home/ethan/DiffusionResearch/Sim2RealDiffusion/training/solid_256"
os.makedirs(folder_cropped, exist_ok=True)

set_num = 1  # number of sub-folders (sets) to process
# An extra random crop is saved for every N-th image of a set (data augmentation):
#   every 1 for rope
#   every 25 for tissue pushblock
#   every 5 for solid pushblock
random_crop_every = 100
image_extensions = ('.jpeg', '.png', '.jpg')
subdir_count = 0
# NOTE: RandomCrop raises if an image is smaller than 512 px in either dimension.
transform = transforms.RandomCrop(512)

# os.listdir returns entries in arbitrary order; sorting makes the selected sets and
# the images that receive an augmented crop reproducible between runs.
for subdir in sorted(os.listdir(folder)):
    subdir_path = os.path.join(folder, subdir)
    if not os.path.isdir(subdir_path):
        # Stray files next to the set folders are neither sets nor listable.
        continue
    subdir_count += 1
    if subdir_count > set_num:
        break
    file_count = 0
    for filename in sorted(os.listdir(subdir_path)):
        # Case-insensitive so that e.g. ".PNG" / ".JPG" files are not silently skipped.
        if not filename.lower().endswith(image_extensions):
            continue

        # Open once, release the file handle, and keep the full-size RGB copy for both outputs.
        with Image.open(os.path.join(subdir_path, filename)) as raw_image:
            source_image = raw_image.convert("RGB")

        # Resize so the shorter side is 256 px (aspect ratio preserved).
        # Alternative: a center crop keeps the main changes between the images,
        # in this case the robot motion:
        # image = F.center_crop(source_image, 512)
        image = F.resize(source_image, 256, InterpolationMode.LANCZOS)
        # Upper center crop for rope task
        # width, height = image.size
        # top = 0  # Start from the top
        # left = max(0, (width - 512) // 2)
        # right = left + 512
        # bottom = top + 512
        # image = image.crop((left, top, right, bottom))
        image.save(os.path.join(folder_cropped, filename))

        # Save random crops of the full-size image for data augmentation.
        if file_count % random_crop_every == 0:
            # splitext keeps the real extension even when the name contains extra dots
            # ("frame.0001.png" -> "frame.0001_1.png").
            stem, extension = os.path.splitext(filename)
            random_crop = transform(source_image)
            random_crop.save(os.path.join(folder_cropped, f"{stem}_1{extension}"))
        file_count += 1

In [7]:
# Optional: load a single input image and resize it to exactly
# resolution x resolution pixels (the aspect ratio is not preserved),
# then save it next to the original with the resolution appended to its name.
resolution = 512
source_image_path = "/home/ethan/DiffusionResearch/Sim2RealDiffusion/inference/test_images/simsolid_interim.png"
image = F.resize(
    Image.open(source_image_path).convert("RGB"),
    [resolution, resolution],
    InterpolationMode.LANCZOS,
)
image.save(f"/home/ethan/DiffusionResearch/Sim2RealDiffusion/inference/test_images/simsolid_interim_{resolution}.png")

## Creating json concept list

In [3]:
# A single concept: the instance prompt, an empty class prompt, and the folder
# of processed training images (folder_cropped is defined in the cropping cell).
pushblock_concept = {
    "instance_prompt": "pushblock",
    "class_prompt": "",
    "instance_data_dir": f"{folder_cropped}",
}
concepts_list = [pushblock_concept]

# Serialize the list to the JSON file passed to the training script via --concepts_list.
with open("./concepts_list.json", "w") as concepts_file:
    concepts_file.write(json.dumps(concepts_list, indent=4))

## DREAMBOOTH TRAINING
### Run the following commands in a terminal. Remember to activate the `diffusers_venv` virtual environment first!

In [None]:
# Shell setup kept as a plain string, so running this cell executes nothing.
# Copy the lines into a terminal: they enter the training folder, activate the
# virtual environment, and export the MODEL_NAME / OUTPUT_DIR variables that the
# training commands below read.
'''
cd training
source diffusers_venv/bin/activate
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
export OUTPUT_DIR="/home/ethan/DiffusionResearch/Sim2RealDiffusion/training/checkpoints"
'''

### Template for Dreambooth training command

In [None]:
# Template only: the command is kept as a plain string, so running this cell
# executes nothing. It expects MODEL_NAME and OUTPUT_DIR to be exported and
# reads the concepts from concepts_list.json.
# The leading `!` is IPython shell-escape syntax; drop it when pasting into a terminal.
# NOTE(review): --save_sample_prompt still holds what looks like a placeholder
# prompt ("photo of zwx dog") — replace it with your own instance prompt.
'''
!accelerate launch train_dreambooth.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --pretrained_vae_name_or_path="stabilityai/sd-vae-ft-mse" \
  --output_dir=$OUTPUT_DIR \
  --revision="fp16" \
  --with_prior_preservation --prior_loss_weight=1.0 \
  --seed=1337 \
  --resolution=512 \
  --train_batch_size=1 \
  --train_text_encoder \
  --mixed_precision="fp16" \
  --use_8bit_adam \
  --gradient_accumulation_steps=1 \
  --learning_rate=1e-6 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --num_class_images=50 \
  --sample_batch_size=4 \
  --max_train_steps=800 \
  --save_interval=10000 \
  --save_sample_prompt="photo of zwx dog" \
  --concepts_list="concepts_list.json"
'''

### The actual command (with proper parameters) we used for training all styles - run in Terminal

In [None]:
# The command actually used for training, kept as a plain string so running this
# cell executes nothing; paste it into a terminal after exporting MODEL_NAME and
# OUTPUT_DIR. CUDA_VISIBLE_DEVICES=1 exposes only the GPU with index 1 to the run.
# Compared with the template, this command omits --with_prior_preservation,
# the separate VAE, --revision, 8-bit Adam and the sample prompt, and uses a
# larger batch size (4), more steps (3000) and a checkpoint every 500 steps.
# NOTE(review): --num_class_images is still passed although prior preservation
# is not enabled here — presumably it has no effect; confirm against the script.
'''
CUDA_VISIBLE_DEVICES=1 accelerate launch train_dreambooth.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--output_dir=$OUTPUT_DIR \
--concepts_list="concepts_list.json" \
--train_text_encoder \
--seed=1337 \
--resolution=512 \
--train_batch_size=4 \
--mixed_precision="fp16" \
--gradient_accumulation_steps=1 \
--learning_rate=1e-6 \
--lr_warmup_steps=0 \
--num_class_images=50 \
--save_interval=500 \
--max_train_steps=3000
'''

### Quick inference for sanity check using diffusers pipeline

In [None]:
# #Quick inference for sanity check
# from diffusers import StableDiffusionPipeline
# import torch

# model_id = "/path/to/save/checkpoints"
# pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

# prompt = "pushblock"  # use the instance_prompt configured in concepts_list.json
# image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]

# image

### If you want to upload the fine-tuned model to WebUI, convert diffusers format to original SD format and save it to proper WebUI models folder

In [4]:
# Convert a diffusers-format checkpoint folder (--model_path) into a single
# checkpoint file (--checkpoint_path) placed in the WebUI models folder.
# NOTE(review): both paths refer to "cholect_vid52_56_v2" checkpoints, which do
# not match the OUTPUT_DIR used for training above — presumably leftovers from
# another project; point --model_path at the checkpoint step you want to export.
# NOTE(review): --half / --use_safetensors presumably select fp16 weights and the
# .safetensors format — confirm against convert_diffusers_to_sd.py.
!python convert_diffusers_to_sd.py --model_path ./cholect_vid52_56_v2_ckpts/2000 \
    --checkpoint_path ../stable-diffusion-webui/models/Stable-diffusion/cholect_vid52_56_v2_2000.safetensors --half --use_safetensors


Reshaping encoder.mid.attn_1.q.weight for SD format
Reshaping encoder.mid.attn_1.k.weight for SD format
Reshaping encoder.mid.attn_1.v.weight for SD format
Reshaping encoder.mid.attn_1.proj_out.weight for SD format
Reshaping decoder.mid.attn_1.q.weight for SD format
Reshaping decoder.mid.attn_1.k.weight for SD format
Reshaping decoder.mid.attn_1.v.weight for SD format
Reshaping decoder.mid.attn_1.proj_out.weight for SD format
