# Donde viven los monstruos - Helpers
## Maquinito Mensual 04

In [None]:
#@markdown #Comprobar GPU
# Shell command (IPython `!` escape): list the GPUs visible to this runtime.
!nvidia-smi -L

In [1]:
#@markdown #Login en HuggingFace
from IPython.display import clear_output
# Shell command: install the Hugging Face Hub client (version not pinned).
!pip install huggingface_hub
from huggingface_hub import notebook_login
# Wipe the pip log from the cell output before the login prompt is shown.
clear_output()
# Interactive login. The model-loading cell passes use_auth_token=True,
# so this cell has to be run first.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [2]:
#@markdown #Librerías + Modelos + Funciones de ayuda

# Shell commands: diffusers is pinned to 0.3.0, the remaining packages are
# installed at whatever version pip resolves.
# NOTE(review): consider pinning transformers/scipy/ftfy as well so the
# notebook keeps working against this diffusers release.
!pip install diffusers==0.3.0
!pip install transformers scipy ftfy

from google.colab import files
from transformers import CLIPTextModel, CLIPTokenizer
from tqdm.auto import tqdm
from torch import autocast
from PIL import Image
from huggingface_hub import notebook_login
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
from torchvision import transforms as tfms
import torch
import numpy as np

# Drop the pip/import chatter printed so far by this cell.
# (clear_output is imported in the login cell, which must have run already.)
clear_output()


# Stable Diffusion v1-4 pieces: the VAE and the UNet are fetched with the
# stored Hugging Face token; the CLIP tokenizer/text encoder are fetched
# without one.
vae = AutoencoderKL.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="vae",
    use_auth_token=True,
)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    subfolder="unet",
    use_auth_token=True,
)
scheduler = LMSDiscreteScheduler(
    num_train_timesteps=1000,
    beta_schedule="scaled_linear",
    beta_start=0.00085,
    beta_end=0.012,
)
# Hide the download progress output as well.
clear_output()

# Use the GPU when torch can see one, otherwise stay on the CPU, and move
# the three networks there (same order as before: vae, text_encoder, unet).
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
vae, text_encoder, unet = (
    model.to(torch_device) for model in (vae, text_encoder, unet)
)

# Shared PIL -> tensor converter used by pil_to_latent.
to_tensor_tfm = tfms.ToTensor()


def pil_to_latent(input_im):
  """Encode one image into scaled VAE latents.

  The image goes through `to_tensor_tfm`, gains a leading batch axis, is
  moved to `torch_device` and mapped with x*2-1 (which takes values in
  [0, 1] to [-1, 1]). A sample is then drawn from the VAE's latent
  distribution and multiplied by 0.18215, the same factor that
  `latents_to_pil` divides back out.

  Args:
    input_im: image accepted by `to_tensor_tfm` (a PIL image in this
      notebook).

  Returns:
    Detached latent tensor with a batch axis of size 1.
  """
  with torch.no_grad():
    batch = to_tensor_tfm(input_im).unsqueeze(0).to(torch_device)
    batch = batch * 2 - 1
    posterior = vae.encode(batch).latent_dist
    scaled = posterior.sample().detach() * 0.18215
  return scaled


def latents_to_pil(latents):
    """Decode a batch of scaled latents into a list of PIL images.

    Undoes the 0.18215 scaling applied by `pil_to_latent`, decodes with the
    VAE, maps the result with x/2+0.5, clamps it to [0, 1] and converts it
    to channel-last uint8 arrays.

    Args:
        latents: latent tensor with a leading batch axis.

    Returns:
        A list with one `PIL.Image` per batch entry.
    """
    rescaled = (1 / 0.18215) * latents
    with torch.no_grad():
        decoded = vae.decode(rescaled).sample

    # Map to [0, 1], then go channel-last on the CPU for PIL.
    pixels = (decoded / 2 + 0.5).clamp(0, 1)
    pixels = pixels.detach().cpu().permute(0, 2, 3, 1).numpy()
    frames = (pixels * 255).round().astype("uint8")

    return list(map(Image.fromarray, frames))


def image_grid(imgs, cols):
    """Paste images left-to-right, top-to-bottom into one RGB contact sheet.

    Every tile is placed on a grid whose cell size is taken from the first
    image, so all images are expected to share that size.

    Args:
        imgs: non-empty sequence of PIL images.
        cols: maximum number of images per row (>= 1).

    Returns:
        A new RGB `PIL.Image` holding every input image.

    Raises:
        ValueError: if `imgs` is empty or `cols` is smaller than 1.
    """
    if len(imgs) == 0:
        raise ValueError("image_grid needs at least one image")
    if cols < 1:
        raise ValueError("cols must be >= 1")

    grid_w = min(cols, len(imgs))
    # Ceiling division: the previous `len(imgs)//cols + 1` added an empty
    # black row whenever the image count was an exact multiple of `cols`.
    grid_h = -(-len(imgs) // cols)
    w, h = imgs[0].size
    grid = Image.new('RGB', size=(grid_w*w, grid_h*h))
    for i, img in enumerate(imgs):
        row, col = divmod(i, cols)
        grid.paste(img, box=(col*w, row*h))

    return grid


def render(steps, scale, seed, embeddings, batch_size):
    """Run the guided denoising loop and decode the result to PIL images.

    Uses the module-level `scheduler`, `unet` and `torch_device`, and always
    renders at 512x512 pixels.

    Args:
        steps: number of scheduler timesteps to run.
        scale: guidance scale applied to (text - unconditional) predictions.
        seed: RNG seed; 0 means "draw a random seed".
        embeddings: conditioning passed to the UNet as
            `encoder_hidden_states`. The prediction is split in two halves
            and the first half is treated as unconditional, so this is
            presumably [unconditional, text] stacked along the batch axis
            — confirm with the caller.
        batch_size: number of latents (and therefore images) to generate.

    Returns:
        Tuple `(images, seed)`: the list returned by `latents_to_pil` and
        the seed that was actually used.
    """
    # Fixed 512x512 output; the latent grid is 8x smaller on each side.
    latent_h = 512 // 8
    latent_w = 512 // 8

    if seed == 0:
        seed = torch.randint(2**32, (1, 1))[0, 0].item()

    scheduler.set_timesteps(steps)
    rng = torch.manual_seed(seed)

    latent_shape = (batch_size, unet.in_channels, latent_h, latent_w)
    latents = torch.randn(latent_shape, generator=rng).to(torch_device)
    # Start from noise scaled by the scheduler's first sigma.
    latents = latents * scheduler.sigmas[0]

    with autocast("cuda"):
        progress = tqdm(enumerate(scheduler.timesteps), total=steps)
        for step_index, timestep in progress:
            sigma = scheduler.sigmas[step_index]

            # Duplicate the batch so both guidance halves share one UNet
            # call, then apply the per-step sigma input scaling.
            doubled = torch.cat((latents, latents))
            unet_input = doubled / ((sigma**2 + 1) ** 0.5)

            with torch.no_grad():
                prediction = unet(
                    unet_input,
                    timestep,
                    encoder_hidden_states=embeddings,
                ).sample

            uncond_part, text_part = prediction.chunk(2)
            guided = uncond_part + scale * (text_part - uncond_part)

            # The scheduler is stepped with the loop index, not the
            # timestep value.
            latents = scheduler.step(guided, step_index, latents).prev_sample

    return latents_to_pil(latents), seed


# Same id (49407) that the tokenizer cells filter out together with 49406;
# presumably the CLIP end-of-text token — TODO confirm. Not referenced by
# any of the cells visible in this notebook.
token_EOS_value = 49407



In [None]:
#@markdown #Imagen: Espacio latente
# Shell command: download the sample picture next to the notebook.
!curl --output peluche.jpg 'https://both.rocks/maquimensu04/peluche.jpg'
input_image = Image.open('peluche.jpg')

# Round trip through the VAE: image -> latents -> image.
encoded = pil_to_latent(input_image)
decoded = latents_to_pil(encoded)[0]

# Bare last expression so the notebook displays the reconstructed image.
decoded

In [None]:
#@markdown #Imagen: Sustituir por ruido
# Work on a copy so `encoded` from the previous cell stays untouched.
encoded_random = encoded.clone()

# One 16-column band per latent channel is overwritten with uniform random
# values: channel 0 gets the first band, channel 3 the last (up to the edge).
noise_bands = (slice(None, 16), slice(16, 32), slice(32, 48), slice(48, None))
for channel, band in enumerate(noise_bands):
    encoded_random[0, channel, :, band] = torch.rand((1, 64, 16))

decoded = latents_to_pil(encoded_random)[0]
decoded

In [43]:
#@markdown #Texto: Cuentatokens
# Colab form fields: the two prompts that are tokenized and counted below.
prompt1 = "elefante"  # @param {type:"string"}
prompt2 = "\"p\xE1jaro\""  # @param {type:"string"}


def tokens_to_list(tokens):
  """Turn a sequence of token ids into their vocabulary strings.

  Ids 49406 and 49407 are dropped first (presumably the tokenizer's
  start/end-of-text markers — TODO confirm); every remaining id is looked
  up with `tokenizer.decoder.get`.

  Args:
    tokens: iterable of integer token ids.

  Returns:
    List with one looked-up entry per kept id, in the original order.
  """
  kept_ids = [
      token_id for token_id in tokens
      if token_id != 49406 and token_id != 49407
  ]
  return [tokenizer.decoder.get(token_id) for token_id in kept_ids]


# Tokenize both prompts in one padded batch, then keep only the readable
# token strings of each row.
tokens = [
    tokens_to_list(ids)
    for ids in tokenizer(
        [prompt1, prompt2],
        padding=True,
        return_tensors="np",
    ).input_ids
]

# One line per prompt: its token strings followed by how many there are.
for pieces in tokens:
  print(f"{pieces}: {len(pieces)}")

['ele', 'fan', 'te</w>']: 3
['p', 'Ã¡', 'jaro</w>']: 3


In [33]:
#@markdown #Texto: Interpolación de tokens (inglés)
def clean_tokens(tokens):
  """Return the ids in `tokens`, in order, without 49406 and 49407.

  Those two ids are presumably the tokenizer's start/end-of-text markers
  (TODO confirm); the end marker also shows up as padding in this cell.
  """
  return [
      token_id for token_id in tokens
      if token_id != 49406 and token_id != 49407
  ]

start_prompt, end_prompt = "cat", "dog"

# Both prompts are padded/truncated to the tokenizer's maximum length and
# returned as one 2-row numpy array of ids.
tokens = tokenizer(
    [start_prompt, end_prompt],
    padding="max_length",
    truncation=True,
    return_tensors="np",
).input_ids

start_token, end_token = map(clean_tokens, tokens)

# Walk linearly between the first id of each prompt in 6 evenly spaced
# values (both ends included) and print the vocabulary entry found at each
# truncated id.
for t in np.linspace(start_token[0], end_token[0], num=6):
  token_id = int(t)
  print(f"{token_id}:{tokenizer.decoder.get(token_id)}")

2368:cat</w>
2280:took</w>
2192:mid
2104:suppor
2016:movie</w>
1929:dog</w>


In [41]:
#@markdown #Texto: Interpolación de tokens (español)
def clean_tokens(tokens):
  """Return the ids in `tokens`, in order, without 49406 and 49407.

  Those two ids are presumably the tokenizer's start/end-of-text markers
  (TODO confirm); the end marker also shows up as padding in this cell.
  """
  return [
      token_id for token_id in tokens
      if token_id != 49406 and token_id != 49407
  ]

start_prompt, end_prompt = "elefante", "pájaro"

# Both prompts are padded/truncated to the tokenizer's maximum length and
# returned as one 2-row numpy array of ids.
tokens = tokenizer(
    [start_prompt, end_prompt],
    padding="max_length",
    truncation=True,
    return_tensors="np",
).input_ids

start_token, end_token = map(clean_tokens, tokens)

# Show the cleaned id list of each prompt.
for label, ids in ((start_prompt, start_token), (end_prompt, end_token)):
  print(f"{label}: {ids}")

# Walk linearly between the first id of each prompt in 6 evenly spaced
# values (both ends included) and print the vocabulary entry found at each
# truncated id.
for t in np.linspace(start_token[0], end_token[0], num=6):
  token_id = int(t)
  print(f"{token_id}:{tokenizer.decoder.get(token_id)}")

elefante: [2084, 1675, 756]
pájaro: [79, 7261, 35505]
2084:ele
1683:hope</w>
1282:pic</w>
881:ster</w>
480:Ĥ</w>
79:p


### Para cuando todos tengamos máquinas súper potentes

In [None]:
# #@markdown #BACKUP

# num_images = 1

# prompt1 = "a cat"  # @param {type:"string"}
# prompt2 = "a bird"  # @param {type:"string"}

# semilla = 25115  # @param {type:"number"}
# seed = semilla
# if seed == -1:
#   seed = torch.randint(2**32, (1, 1))[0, 0].item()

# mezcla = 0.5  # @param {type:"slider", min:0, max:1, step:0.1}
# mix_factor = hiperescala

# height = 512
# width = 512
# num_inference_steps = 50
# guidance_scale = 7.5

# generator = torch.manual_seed(32)


# text_input1 = tokenizer([prompt1], padding="max_length",
#                         max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
# with torch.no_grad():
#   text_embeddings1 = text_encoder(text_input1.input_ids.to(torch_device))[0]

# text_input2 = tokenizer([prompt2], padding="max_length",
#                         max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
# with torch.no_grad():
#   text_embeddings2 = text_encoder(text_input2.input_ids.to(torch_device))[0]

# # Take the average
# # text_embeddings = text_embeddings1*mix_factor


# # And the uncond. input as before:
# uncond_input = tokenizer(
#     [""] * batch_size, padding="max_length", max_length=77, return_tensors="pt"
# )
# with torch.no_grad():
#   uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
# text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

# # Prep Scheduler
# scheduler.set_timesteps(num_inference_steps)

# # Prep latents
# latents_cpu = torch.randn(
#     (batch_size, unet.in_channels, height // 8, width // 8),
#     generator=generator,
# )


# # Loop
# for f, mix_factor in enumerate(numpy.linspace(1, 2, 101)):

#   text_embeddings = text_embeddings1*mix_factor
#   text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

#   latents = latents_cpu.to(torch_device)
#   latents = latents * scheduler.sigmas[0]  # Need to scale to match k

#   with autocast("cuda"):
#     for i, t in tqdm(enumerate(scheduler.timesteps)):
#       # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
#       latent_model_input = torch.cat([latents] * 2)
#       sigma = scheduler.sigmas[i]
#       latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

#       # predict the noise residual
#       with torch.no_grad():
#         noise_pred = unet(latent_model_input, t,
#                           encoder_hidden_states=text_embeddings)["sample"]

#       # perform guidance
#       noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
#       noise_pred = noise_pred_uncond + guidance_scale * \
#           (noise_pred_text - noise_pred_uncond)

#       # compute the previous noisy sample x_t -> x_t-1
#       latents = scheduler.step(noise_pred, i, latents)["prev_sample"]

#   latents_to_pil(latents)[0].save(f"lale_{f:05d}.png")
