In [1]:
!pip install -Uq diffusers transformers fastcore

[0m

In [2]:
# Imports
from pathlib import Path
import torch
from huggingface_hub import notebook_login
import logging
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
from tqdm.auto import tqdm
from PIL import Image
from torchvision import transforms
from transformers import CLIPTextModel, CLIPTokenizer

###

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [3]:
logging.disable(logging.WARNING)

# Log into hugging face
if not (Path.home()/'.cache/huggingface'/'token').exists(): notebook_login()

# Scheduler Parameters
beta_start, beta_end = 0.00085, 0.012

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Import VAE, UNET, Scheduler, Tokenizer and Text Encoder
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema", torch_dtype=torch.float16).to("cuda")
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet", torch_dtype=torch.float16).to("cuda")
scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear", num_train_timesteps=1000)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16).to("cuda")

Downloading (…)lve/main/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Downloading (…)ain/unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

In [5]:
# Model prompt and Hyperparameters
prompt = ["a photograph of an astronaut riding a horse"]

height = 512
width = 512
num_inference_steps = 70
guidance_scale = 7.5
batch_size = 1

In [6]:
def text_enc(prompts, maxlen=None):
    if maxlen is None: maxlen = tokenizer.model_max_length
    inp = tokenizer(prompts, padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
    return text_encoder(inp.input_ids.to("cuda"))[0].half()

def mk_img(t):
    image = (t/2+0.5).clamp(0,1).detach().cpu().permute(1, 2, 0).numpy()
    return Image.fromarray((image*255).round().astype("uint8"))

In [7]:
def mk_samples(prompts, neg_prompts=None, init_image=None, g=7.5, seed=100, steps=70):
    bs = len(prompts)
    text = text_enc(prompts)
    if neg_prompts is None:
        neg_prompts = text_enc([""] * bs, text.shape[1])
    else:
        neg_prompts = text_enc(neg_prompts)
    emb = torch.cat([neg_prompts, text])
    if seed: torch.manual_seed(seed)

    
    latents = None
    # I need to change the latents to be the latents of (init_image + noise)
    if init_image is None:
        latents = torch.randn((bs, unet.in_channels, height//8, width//8))
    else:
        transform = transforms.Compose([
            transforms.Resize((height, width)),
            transforms.ToTensor()
        ])
        
        init_tensor_image = transform(init_image)
        init_tensor_image = init_tensor_image.unsqueeze(0).to("cuda").half()  
        # print(f"Actual: {init_tensor_image.shape}")
        latents = vae.encode(init_tensor_image)
        # expected_latents = torch.randn((bs, unet.in_channels, height//8, width//8))
        # print(f"Actual: {latents.shape}")
        # print(f"Expected: {expected_latents.shape}")
    
    
    
    
    scheduler.set_timesteps(steps)
    latents = latents.to("cuda").half() * scheduler.init_noise_sigma

    for i,ts in enumerate(tqdm(scheduler.timesteps)):
        inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
        with torch.no_grad(): u,t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
        pred = u + g*(t-u)
        latents = scheduler.step(pred, ts, latents).prev_sample

    with torch.no_grad(): return vae.decode(1 / 0.18215 * latents).sample

In [8]:
prompts = ['volcano with dinosaurs']
neg_prompts = ['blue']
init_image = Image.open('Children-draw.jpg')

In [9]:
# With initial image
images = mk_samples(prompts, init_image = init_image)
for img in images: display(mk_img(img))

AttributeError: 'AutoencoderKLOutput' object has no attribute 'to'

In [None]:
# With initial image
images = mk_samples(prompts, init_image = init_image)
for img in images: display(mk_img(img))

In [None]:
# Working mk_samples with neg_prompts

def mk_samples(prompts, neg_prompts=None, g=7.5, seed=100, steps=70):
    bs = len(prompts)
    text = text_enc(prompts)
    if neg_prompts is None:
        neg_prompts = text_enc([""] * bs, text.shape[1])
    else:
        neg_prompts = text_enc(neg_prompts)
    emb = torch.cat([neg_prompts, text])
    if seed: torch.manual_seed(seed)

    latents = torch.randn((bs, unet.in_channels, height//8, width//8))
    scheduler.set_timesteps(steps)
    latents = latents.to("cuda").half() * scheduler.init_noise_sigma

    for i,ts in enumerate(tqdm(scheduler.timesteps)):
        inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
        with torch.no_grad(): u,t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
        pred = u + g*(t-u)
        latents = scheduler.step(pred, ts, latents).prev_sample

    with torch.no_grad(): return vae.decode(1 / 0.18215 * latents).sample

In [None]:
# Without initial image
images = mk_samples(prompts, seed=44)
for img in images: display(mk_img(img))