# Donde viven los monstruos
## Maquinito Mensual 04
Basado muy fuertemente en [Grokking Stable Diffusion](https://colab.research.google.com/drive/1dlgggNa5Mz8sEAGU0wFCHhGLFooW_pf1?usp=sharing) de [Jonathan Whitaker](https://github.com/johnowhitaker) y en [Stable Diffusion with 🧨 diffusers](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) de [🤗 Hugging Face](https://github.com/huggingface/diffusers)

In [None]:
#@markdown #Comprobar GPU
# Sanity check before loading any model: ask the NVIDIA driver tool which
# GPU(s) this runtime has (the cell title reads "Check GPU").
!nvidia-smi -L

In [None]:
#@markdown #Login en HuggingFace
# Installs the Hugging Face hub client and opens its interactive login prompt.
# Later cells load weights with `use_auth_token=True`, so this presumably has
# to run first on a fresh runtime -- confirm.
from IPython.display import clear_output
!pip install huggingface_hub
from huggingface_hub import notebook_login
# Wipe the pip installation log so only the login prompt stays visible.
clear_output()
notebook_login()

In [None]:
#@markdown #Librerías + Modelos + Funciones de ayuda

!pip install diffusers==0.3.0
!pip install transformers scipy ftfy

from google.colab import files
from transformers import CLIPTextModel, CLIPTokenizer
from tqdm.auto import tqdm
from torch import autocast
from PIL import Image
from huggingface_hub import notebook_login
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
import torch
clear_output()


# Hugging Face repositories the pretrained weights are pulled from.
SD_REPO = "CompVis/stable-diffusion-v1-4"
CLIP_REPO = "openai/clip-vit-large-patch14"

# Run on the GPU when one is available, otherwise fall back to the CPU.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

# Text side: CLIP tokenizer and text encoder.
tokenizer = CLIPTokenizer.from_pretrained(CLIP_REPO)
text_encoder = CLIPTextModel.from_pretrained(CLIP_REPO).to(torch_device)

# Image side: VAE and UNet, each loaded from its own subfolder of the
# Stable Diffusion repository using the stored Hugging Face token.
vae = AutoencoderKL.from_pretrained(
    SD_REPO, subfolder="vae", use_auth_token=True).to(torch_device)
unet = UNet2DConditionModel.from_pretrained(
    SD_REPO, subfolder="unet", use_auth_token=True).to(torch_device)

# Scheduler that drives the denoising loop in render().
scheduler = LMSDiscreteScheduler(
    beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)

# Hide the download progress output.
clear_output()


def latents_to_pil(latents):
    """Decode a batch of latents with the VAE and return them as PIL images.

    Args:
        latents: tensor accepted by ``vae.decode``. The decoded result is
            treated as channels-first (it is permuted with ``(0, 2, 3, 1)``
            before being handed to PIL).

    Returns:
        A list with one ``PIL.Image`` per batch element.
    """
    # Undo the 0.18215 factor before decoding (presumably the Stable Diffusion
    # latent scaling constant -- confirm).
    rescaled = latents * (1 / 0.18215)
    with torch.no_grad():
        decoded = vae.decode(rescaled).sample

    # Shift/scale by 0.5 and clip to [0, 1] (assumes decoder output is roughly
    # in [-1, 1] -- confirm).
    unit_range = (decoded / 2 + 0.5).clamp(0, 1)
    # Move to CPU and put channels last, the layout Image.fromarray receives.
    channels_last = unit_range.detach().cpu().permute(0, 2, 3, 1).numpy()
    as_bytes = (channels_last * 255).round().astype("uint8")

    return list(map(Image.fromarray, as_bytes))


def image_grid(imgs, cols):
    """Paste equally sized images into a single grid image, row by row.

    Args:
        imgs: non-empty list of PIL images; the size of the first image is
            used as the cell size for every tile.
        cols: maximum number of images per row.

    Returns:
        A new RGB ``PIL.Image`` that is ``min(cols, len(imgs))`` tiles wide and
        ``ceil(len(imgs) / cols)`` tiles tall.

    Raises:
        ValueError: if ``imgs`` is empty.
    """
    if len(imgs) == 0:
        raise ValueError("image_grid() needs at least one image")

    grid_w = min(cols, len(imgs))
    # Ceiling division: `len(imgs)//cols + 1` produced an extra, empty row
    # whenever the image count was an exact multiple of `cols`.
    grid_h = (len(imgs) + cols - 1) // cols
    w, h = imgs[0].size
    grid = Image.new('RGB', size=(grid_w * w, grid_h * h))
    for i, img in enumerate(imgs):
        # Column = i % cols, row = i // cols.
        grid.paste(img, box=(i % cols * w, i // cols * h))

    return grid


def render(steps, scale, seed, embeddings, batch_size):
    """Run the guided denoising loop and decode the result to PIL images.

    Args:
        steps: number of scheduler steps to run.
        scale: guidance scale applied to the difference between the prompt
            and the unconditional noise predictions.
        seed: seed for the initial noise; ``0`` means "draw a random seed".
        embeddings: text-encoder hidden states passed to the UNet. The UNet
            output is split in two with the unconditional half first, so the
            unconditional rows must come first (callers build it with
            ``2 * batch_size`` rows -- assumed, confirm for batch_size > 1).
        batch_size: number of images to generate.

    Returns:
        ``(images, seed)``: the list of PIL images from ``latents_to_pil`` and
        the seed that was actually used.
    """
    image_side = 512
    latent_side = image_side // 8

    # A seed of 0 is the "random" sentinel: pick one and report it back.
    if seed == 0:
        seed = torch.randint(2**32, (1, 1))[0, 0].item()

    scheduler.set_timesteps(steps)
    rng = torch.manual_seed(seed)

    # Initial noise, scaled by the scheduler's first sigma.
    noise = torch.randn(
        (batch_size, unet.in_channels, latent_side, latent_side),
        generator=rng,
    )
    latents = noise.to(torch_device) * scheduler.sigmas[0]

    with autocast("cuda"):
        for step_index, timestep in tqdm(enumerate(scheduler.timesteps), total=steps):
            sigma = scheduler.sigmas[step_index]

            # Duplicate the latents so one UNet pass yields both the
            # unconditional and the prompt-conditioned prediction.
            model_input = torch.cat([latents, latents]) / ((sigma**2 + 1) ** 0.5)

            with torch.no_grad():
                prediction = unet(model_input, timestep,
                                  encoder_hidden_states=embeddings).sample

            uncond_pred, prompt_pred = prediction.chunk(2)
            guided_pred = uncond_pred + scale * (prompt_pred - uncond_pred)

            # NOTE(review): step() receives the loop index, not the timestep
            # value -- presumably what the pinned diffusers==0.3.0 scheduler
            # expects; confirm before upgrading diffusers.
            latents = scheduler.step(guided_pred, step_index, latents).prev_sample

    return latents_to_pil(latents), seed


# Token id used as the end-of-sequence marker: cell 1 searches for it in the
# tokenized prompt to find where the prompt tokens stop.
token_EOS_value = 49407


In [None]:
#@markdown #1. El legado de Loab

# Generates images from a single prompt after multiplying the prompt-token
# embeddings by `hiperescala`.

num_images = 1

prompt = "teenager face"  # @param {type:"string"}
semilla = 2323883001  # @param {type:"number"}
# NOTE(review): the "-@param" below looks like a deliberately disabled form
# field -- confirm before "fixing" it.
guia = 7.5  # -@param {type:"slider", min:4.5, max:21, step:1.5}
hiperescala = 1.9  # @param {type:"slider", min:-2, max:2, step:0.05}
pasos = 200  # @param {type:"slider", min:50, max:250, step:25}


# First `num_images` rows are unconditional (empty) prompts, the rest are
# copies of the actual prompt.
prompts = [""] * num_images  # Unconditional
prompts.extend([prompt] * num_images)  # Prompts


inputs = tokenizer(prompts, padding="max_length", max_length=tokenizer.model_max_length,
                   truncation=True, return_tensors="pt")  # [num_images * 2, 77]

with torch.no_grad():
  text_embeddings = text_encoder(inputs.input_ids.to(torch_device))[
      0]  # [num_images * 2, 77, 768]

# Row `num_images` is the first real prompt row. The previous hard-coded row 1
# is an empty unconditional prompt whenever num_images > 1, which made the
# slice below empty and silently disabled the rescaling.
token_list = inputs.input_ids[num_images].tolist()
# Rescale the prompt embeddings: positions from 1 up to (not including) the
# first end-of-sequence token, for the prompt rows only.
text_embeddings[num_images:, 1:token_list.index(
    token_EOS_value)] *= hiperescala


images, semilla = render(pasos, guia, semilla, text_embeddings, num_images)
filename = f"1_{prompt}_{semilla}_{hiperescala:0.02f}_{pasos}.jpg"

print(f"Semilla: {semilla}")
print(f"Fichero: {filename}\n")

# Save the result, trigger a browser download, and display it as cell output.
grid = image_grid(images, cols=3)
grid.save(filename, quality=100, subsampling=0)
files.download(filename)
grid


In [None]:
#@markdown #2.1 Criptozoología

# Blends the text embeddings of two prompts by linear interpolation and
# renders the result.

num_images = 1
pasos = 75
guia = 7.5

prompt1 = "seal"  # @param {type:"string"}
prompt2 = "greyhound"  # @param {type:"string"}
semilla = 171347836  # @param {type:"number"}
mezcla = 53  # @param {type:"slider", min:0, max:100, step:1}
# The slider is a percentage; convert it to a 0..1 fraction of prompt2.
mezcla /= 100


# Row 0 is the unconditional (empty) prompt, rows 1 and 2 the two subjects.
inputs = tokenizer(["", prompt1, prompt2], padding="max_length",
                   max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")  # [3,77]

with torch.no_grad():
  embeddings = text_encoder(inputs.input_ids.to(torch_device))[0]  # [3,77,768]

# Unconditional row first, then the weighted mix of both prompt embeddings.
text_embeddings = torch.stack(
    [embeddings[0], embeddings[1]*(1-mezcla) + embeddings[2]*mezcla])  # [2,77,768]


images, semilla = render(pasos, guia, semilla, text_embeddings, num_images)
filename = f"21_{prompt1}_{prompt2}_{semilla}_{mezcla:0.02f}.jpg"

print(f"Semilla: {semilla}")
print(f"Fichero: {filename}\n")

# Save the result, trigger a browser download, and display it as cell output.
grid = image_grid(images, cols=3)
grid.save(filename, quality=100, subsampling=0)
files.download(filename)
grid

In [None]:
#@markdown #2.2 Criptozoología MULTI

# Builds one prompt per subject by substituting it for '@' in `prompt_body`,
# then renders from the plain average of all those prompt embeddings.

num_images = 1

prompt_body = "@, natgeo"  # @param {type:"string"}
prompt_subjects = "seal greyhound"  # @param {type:"string"}
semilla = 0  # @param {type:"number"}
pasos = 75  # @param {type:"slider", min:50, max:250, step:25}
guia = 10.5  # @param {type:"slider", min:4.5, max:21, step:1.5}

# split() without an argument collapses repeated/leading/trailing whitespace;
# split(" ") produced empty subjects there, which added a bogus prompt to the
# average and inflated the divisor.
subject_list = prompt_subjects.split()
if not subject_list:
  raise ValueError("prompt_subjects must contain at least one subject")

prompts = [prompt_body.replace('@', ps) for ps in subject_list]
# Row 0 is the unconditional (empty) prompt.
prompts.insert(0, "")


inputs = tokenizer(prompts, padding="max_length", max_length=tokenizer.model_max_length,
                   truncation=True, return_tensors="pt")  # [1+len(subject_list),77]

with torch.no_grad():
  embeddings = text_encoder(inputs.input_ids.to(torch_device))[
      0]  # [1+len(subject_list),77,768]

# Unconditional row first, then the mean of every subject's embedding.
text_embeddings = torch.stack([embeddings[0], torch.sum(
    embeddings[1:], axis=0)/len(subject_list)])  # [2,77,768]


images, semilla = render(pasos, guia, semilla, text_embeddings, num_images)
filename = f"22_{prompt_body}_{prompt_subjects}_{semilla}_{pasos}_{guia}.jpg"

print(f"Semilla: {semilla}")
print(f"Fichero: {filename}\n")

# Save the result, trigger a browser download, and display it as cell output.
grid = image_grid(images, cols=3)
grid.save(filename, quality=100, subsampling=0)
files.download(filename)
grid

In [None]:
#@markdown #2.3 Criptozoología NEG
# Like 2.2, but every subject carries its own weight ("name:weight") and the
# prompt embeddings are combined as a weighted average.
num_images = 1

prompt_body = "a photograph of a plush @ stuffed toy" # @param {type:"string"}
prompt_subjects = "seal:3 greyhound:1 man:2"  # @param {type:"string"}
semilla = 0  # @param {type:"number"}
pasos = 75  # @param {type:"slider", min:50, max:250, step:25}
guia = 7.5  # @param {type:"slider", min:4.5, max:21, step:1.5}

weights = []
prompts = [""]  # Row 0 is the unconditional (empty) prompt.
weight_sum = 0
# split() without an argument ignores repeated/leading/trailing whitespace.
for subject in prompt_subjects.split():
  # "name:weight"; a missing or empty weight defaults to 1.
  values = (subject.split(":", 1) + ["1"])[:2]
  # float() accepts everything int() did, plus fractional weights.
  values[1] = float(values[1] or 1)
  weights.append(values[1])
  prompts.append(prompt_body.replace('@', values[0].strip()))
  weight_sum += (values[1])

if not weights:
  raise ValueError("prompt_subjects must contain at least one subject")
if weight_sum == 0:
  # Reachable with negative weights; fail clearly instead of dividing by zero.
  raise ValueError("subject weights must not add up to zero")


inputs = tokenizer(prompts, padding="max_length", max_length=tokenizer.model_max_length,
                   truncation=True, return_tensors="pt")  # [1+len(subject_list),77]

with torch.no_grad():
  embeddings = text_encoder(inputs.input_ids.to(torch_device))[
      0]  # [1+len(subject_list),77,768]

# Scale each subject row (rows 1..n) by its normalized weight, in place.
for i in range(embeddings.shape[0]-1):
  embeddings[i+1] *= weights[i]/weight_sum

# Unconditional row first, then the weighted sum of the subject rows.
text_embeddings = torch.stack(
    [embeddings[0], torch.sum(embeddings[1:], axis=0)])  # [2,77,768]


images, semilla = render(pasos, guia, semilla, text_embeddings, num_images)
# Prefix "23_" matches this cell's number; it used to be "22_", copied from
# cell 2.2, so outputs of both cells were labelled alike.
filename = f"23_{prompt_body}_{prompt_subjects}_{semilla}_{pasos}_{guia}.jpg"

print(f"Semilla: {semilla}")
print(filename)

# Save the result, trigger a browser download, and display it as cell output.
grid = image_grid(images, cols=3)
grid.save(filename, quality=100, subsampling=0)
files.download(filename)
grid

In [None]:
#@markdown #¡Ayuda! - Cuentatokens
# Helper cell: tokenizes the two prompts below and prints each token list with
# its length, so different phrasings can be compared.
prompt1 = "greyhound natgeo"  # @param {type:"string"}
prompt2 = "greyhound, natgeo"  # @param {type:"string"}


def tokens_to_list(tokens):
  """Map token ids to the tokenizer's vocabulary strings, skipping two ids.

  Ids 49406 and 49407 are dropped before decoding (presumably CLIP's
  start/end-of-text markers -- 49407 equals ``token_EOS_value``; confirm).
  Ids missing from ``tokenizer.decoder`` come back as ``None``.
  """
  skipped_ids = (49406, 49407)
  return [
      tokenizer.decoder.get(token_id)
      for token_id in tokens
      if token_id not in skipped_ids
  ]


# Tokenize both prompts together (padded to a common length), decode each row
# to its vocabulary strings and print every list followed by its length.
encoded_ids = tokenizer([prompt1, prompt2], return_tensors="np",
                        padding=True).input_ids
tokens = [tokens_to_list(row) for row in (encoded_ids[0], encoded_ids[1])]
for decoded in tokens:
  print(f"{decoded}: {len(decoded)}")

### Para cuando todos tengamos máquinas súper potentes

In [None]:
# #@markdown #BACKUP

# num_images = 1

# prompt1 = "a cat"  # @param {type:"string"}
# prompt2 = "a bird"  # @param {type:"string"}

# semilla = 25115  # @param {type:"number"}
# seed = semilla
# if seed == -1:
#   seed = torch.randint(2**32, (1, 1))[0, 0].item()

# mezcla = 0.5  # @param {type:"slider", min:0, max:1, step:0.1}
# mix_factor = hiperescala

# height = 512
# width = 512
# num_inference_steps = 50
# guidance_scale = 7.5

# generator = torch.manual_seed(32)


# text_input1 = tokenizer([prompt1], padding="max_length",
#                         max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
# with torch.no_grad():
#   text_embeddings1 = text_encoder(text_input1.input_ids.to(torch_device))[0]

# text_input2 = tokenizer([prompt2], padding="max_length",
#                         max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
# with torch.no_grad():
#   text_embeddings2 = text_encoder(text_input2.input_ids.to(torch_device))[0]

# # Take the average
# # text_embeddings = text_embeddings1*mix_factor


# # And the uncond. input as before:
# uncond_input = tokenizer(
#     [""] * batch_size, padding="max_length", max_length=77, return_tensors="pt"
# )
# with torch.no_grad():
#   uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
# text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

# # Prep Scheduler
# scheduler.set_timesteps(num_inference_steps)

# # Prep latents
# latents_cpu = torch.randn(
#     (batch_size, unet.in_channels, height // 8, width // 8),
#     generator=generator,
# )


# # Loop
# for f, mix_factor in enumerate(numpy.linspace(1, 2, 101)):

#   text_embeddings = text_embeddings1*mix_factor
#   text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

#   latents = latents_cpu.to(torch_device)
#   latents = latents * scheduler.sigmas[0]  # Need to scale to match k

#   with autocast("cuda"):
#     for i, t in tqdm(enumerate(scheduler.timesteps)):
#       # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
#       latent_model_input = torch.cat([latents] * 2)
#       sigma = scheduler.sigmas[i]
#       latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

#       # predict the noise residual
#       with torch.no_grad():
#         noise_pred = unet(latent_model_input, t,
#                           encoder_hidden_states=text_embeddings)["sample"]

#       # perform guidance
#       noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
#       noise_pred = noise_pred_uncond + guidance_scale * \
#           (noise_pred_text - noise_pred_uncond)

#       # compute the previous noisy sample x_t -> x_t-1
#       latents = scheduler.step(noise_pred, i, latents)["prev_sample"]

#   latents_to_pil(latents)[0].save(f"lale_{f:05d}.png")
