In [7]:
import os
from PIL import Image

import cv2
import torch
from basicsr.utils import tensor2img
from pytorch_lightning import seed_everything
from torch import autocast

from ldm.inference_base import (diffusion_inference, get_adapters, get_base_argument_parser, get_sd_models)
from ldm.modules.extra_condition import api
from ldm.modules.extra_condition.api import (ExtraCondition, get_adapter_feature, get_cond_model)

torch.set_grad_enabled(False)
torch.cuda.set_device('cuda:3')
import glob
import numpy as np 

In [6]:
model_edges = Image.open('/export/data/ffeiden/ResultsControlNetXS/T2I/t2i_canny/steps-50/caption-2/000000_depht.jpg')
print(np.array(model_edges))

[[  0   0 255 ...   3   0   0]
 [254 253   1 ...   1 255 255]
 [  0 255 251 ...   0   0 254]
 ...
 [  0   0   0 ...   0   0   0]
 [  2   0   0 ...   0   0   0]
 [  0   1   0 ...   0   0   0]]


In [8]:
image_root = '/export/data/ffeiden/PaperControlnetXS/512_images/'
size = 512


paths = glob.glob(image_root+'*_depth*.png')

prompts = []
for im_path in paths: 
    images = {}
    name = im_path.split('/')[-1]
    name, alternative = name.split('_depth')[0], name.split('_depth')[1]

    prompts.append(name)
    

In [14]:


class deafult_settings(): 
    def __init__(self):
        self.outdir = '/export/home/ffeiden/Projects/T2I-Adapter/outputs/test_gen' # str
        self.prompt = prompts # str
        self.neg_prompt = '' # str longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
                             # 'fewer digits, cropped, worst quality, low quality
        self.cond_path = paths # str: condition image path
        self.cond_inp_type = 'depth' # str: the type of the input condition image, take depth T2I as example, the input can be raw image, '
                                     # 'which depth will be calculated, or the input can be a directly a depth map image
        self.sampler = 'ddim' # str: ddim, plms
        self.steps = 50 # int: numper of sampling steps
        self.sd_ckpt = '/export/data/vislearn/rother_subgroup/dzavadsk/models/pretrained_originals/StableDiffusion/v1-5-pruned.ckpt'
                        # str: path to sd ckpt or safetensor
        self.vae_ckpt = None # str: VAE checkpoint 
        self.adapter_ckpt = '/export/data/vislearn/rother_subgroup/feiden/models/pretrained/T2I_Adapter/t2iadapter_depth_sd15v2.pth'
                            # str: Adapter ckpt
        self.config = 'configs/stable-diffusion/sd-v1-inference.yaml' # str: path to config
        self.max_resolution = 512 * 512 # float, max image hight * width
        self.resize_short_edge = None # int: resize short edge of the input image, if set max_res not used
        self.C = 4 # int: latent channels
        self.f = 8 # int: downsampling factor
        self.scale = 7.5 # float: unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))
        self.cond_tau = 1.0 # float: timestamp parameter that determines until which step adapter is applied
        self.style_cond_tau = 1.0 # timestamp parameter that determines until which step the adapter is applied
        self.cond_weight = 1.0 # float: the adapter features are multiplied with this (control strength)
        self.seed = 42 # int
        self.n_samples = 1 # int: # of samples to generate
        self.which_cond = 'depth' #str:  sketch keypose seg depth canny style color openpose
        self.device = 'cuda'

opt = deafult_settings()

In [19]:

sd_model, sampler = get_sd_models(opt)
which_cond = opt.which_cond
adapter = get_adapters(opt, getattr(ExtraCondition, which_cond))
cond_model = None
if opt.cond_inp_type == 'image':
    cond_model = get_cond_model(opt, getattr(ExtraCondition, which_cond))

process_cond_module = getattr(api, f'get_cond_{which_cond}')


def give_params(model):
    total_params = sum(
                    param.numel() for param in model.parameters()
                      )
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    trainable_params = sum([np.prod(p.size()) for p in model_parameters])
    return total_params, trainable_params
    


Loading model from /export/data/vislearn/rother_subgroup/dzavadsk/models/pretrained_originals/StableDiffusion/v1-5-pruned.ckpt
LatentDiffusion: Running in eps-prediction mode
DiffusionWrapper has 859.52 M params.
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 64, 64) = 16384 dimensions.
making attention of type 'vanilla' with 512 in_channels


In [24]:
total, train = give_params(sd_model)    
print(f'sd_model: trainable: {total:_} ; trainable: {train:_}')
ad_total, ad_train = give_params(adapter['model'])
print(f'adapter_model: trainable: {ad_total:_} ; trainable: {ad_train:_}')

print(f'Trainable_params total: {train + ad_train:_}')

sd_model: trainable: 1_066_235_307 ; trainable: 859_520_964
adapter_model: trainable: 77_369_280 ; trainable: 77_369_280
Trainable_params total: 936_890_244


In [5]:
def run_depth(opt):
    which_cond = opt.which_cond
    if opt.outdir is None:
        opt.outdir = f'outputs/test-{which_cond}'
    os.makedirs(opt.outdir, exist_ok=True)
    if opt.resize_short_edge is None:
        print(f"you don't specify the resize_shot_edge, so the maximum resolution is set to {opt.max_resolution}")
    opt.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # support two test mode: single image test, and batch test (through a txt file)
    if type(opt.prompt) is not list:
        if opt.prompt.endswith('.txt'):
            assert opt.prompt.endswith('.txt')
            image_paths = []
            prompts = []
            with open(opt.prompt, 'r') as f:
                lines = f.readlines()
                for line in lines:
                    line = line.strip()
                    image_paths.append(line.split('; ')[0])
                    prompts.append(line.split('; ')[1])
    else:
        image_paths = opt.cond_path
        prompts = opt.prompt
    print(image_paths)

    # prepare models
    sd_model, sampler = get_sd_models(opt)
    adapter = get_adapters(opt, getattr(ExtraCondition, which_cond))
    cond_model = None
    if opt.cond_inp_type == 'image':
        cond_model = get_cond_model(opt, getattr(ExtraCondition, which_cond))

    process_cond_module = getattr(api, f'get_cond_{which_cond}')

    # inference
    with torch.inference_mode(), \
            sd_model.ema_scope(), \
            autocast('cuda'):
        for i in range(len(image_paths)):
            cond_path, prompt = image_paths[i], prompts[i]
            seed_everything(opt.seed)
            for v_idx in range(opt.n_samples):
                # seed_everything(opt.seed+v_idx+test_idx)
                cond = process_cond_module(opt, cond_path, opt.cond_inp_type, cond_model)

                base_count = len(os.listdir(opt.outdir)) // 2

                adapter_features, append_to_context = get_adapter_feature(cond, adapter)
                opt.prompt = prompt
                result = diffusion_inference(opt, sd_model, sampler, adapter_features, append_to_context)
                cv2.imwrite(os.path.join(opt.outdir, f'{i:05}_result.png'), tensor2img(result))




In [7]:
run_depth(opt)

you don't specify the resize_shot_edge, so the maximum resolution is set to 262144
['/export/data/ffeiden/PaperControlnetXS/512_images/render of a lavender cube on red background_depth.png', '/export/data/ffeiden/PaperControlnetXS/512_images/Image of mickey mouse standing and smiling_depth.png', '/export/data/ffeiden/PaperControlnetXS/512_images/Photo of an empty street with cars parked on both sides_depth.png', '/export/data/ffeiden/PaperControlnetXS/512_images/render of a green cube on NavyBlue background_depth.png', '/export/data/ffeiden/PaperControlnetXS/512_images/close up image of a face, manga style, blue hair_depth.png', '/export/data/ffeiden/PaperControlnetXS/512_images/Photo of a woman wearing a summer dress and a hat_depth.png', '/export/data/ffeiden/PaperControlnetXS/512_images/Portrait of a thoughtful young woman, photography, 4k_depth.png', '/export/data/ffeiden/PaperControlnetXS/512_images/Photo of a big house with stores at the first floor, cars parked, 4k_depth.png', '

Global seed set to 42


torch.float32
Data shape for DDIM sampling is (1.0, 4, 64, 64), eta 0.0


TypeError: randn() received an invalid combination of arguments - got (tuple, device=torch.device), but expected one of:
 * (tuple of ints size, *, torch.Generator generator, tuple of names names, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.Generator generator, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, tuple of names names, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
