In [2]:
import os
from PIL import Image, ImageDraw
import cv2
import numpy as np
from IPython.display import HTML
from base64 import b64encode

import torch
from torch import autocast
from torch.nn import functional as F
from diffusers import StableDiffusionPipeline, AutoencoderKL
from diffusers import UNet2DConditionModel, PNDMScheduler, LMSDiscreteScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from tqdm.auto import tqdm
from huggingface_hub import notebook_login

device = "cuda" if torch.cuda.is_available() else "cpu"

notebook_login()

Login successful
Your token has been saved to /Users/tonimo/.huggingface/token


In [2]:
# make sure you're logged in with `huggingface-cli login
pipe = StableDiffusionPipeline.from_pretrained(
    'CompVis/stable-diffusion-v1-4', revision='fp16',
    torch_dtype=torch.float16, use_auth_token=True)
pipe = pipe.to(device)

Downloading:   0%|          | 0.00/71.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/167M [00:00<?, ?B/s]

In [3]:
prompt = "lucid air driving into sunset"
with autocast(device):
  image = pipe(prompt)["sample"][0]
image

RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'

In [None]:
def image_grid(imgs):
    n = len(imgs)
    if n % 2:
      rows, cols = n // 2, n // 2 + 1
    else:
      rows = cols = n // 2

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols*w, rows*h))
    grid_w, grid_h = grid.size
    
    for i, img in enumerate(imgs):
        grid.paste(img, box=(i%cols*w, i//cols*h))
    return grid

def prompify():
  n_images = int(input("Enter number of images to generate: "))
  prompt = [str(input("Enter text prompt: ").strip())]
  prompts = prompt * n_images
  return prompts

def generate(prompts):
  device = "cuda" if torch.cuda.is_available() else "cpu"
  with autocast(device):
    images = pipe(prompts)["sample"]
    return images
  

In [None]:
prompts = prompify()
images = generate(prompts)
image_grid(images)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# loading variational autoencoder (turns text to image)
vae = AutoencoderKL.from_pretrained(
    'CompVis/stable-diffusion-v1-4', subfolder='vae', use_auth_token=True)
vae = vae.to(device)

# tokenizer to tokenize text, and encoder to encode tokenized sequences (Clip model)
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-large-patch14')
text_encoder = CLIPTextModel.from_pretrained('openai/clip-vit-large-patch14')
text_encoder = text_encoder.to(device)

# compression and denoising
unet = UNet2DConditionModel.from_pretrained(
    'CompVis/stable-diffusion-v1-4', subfolder='unet', use_auth_token=True)
unet = unet.to(device)

# need scheduler for inference steps (for denoising)
scheduler = LMSDiscreteScheduler(
    beta_start=0.00085, beta_end=0.012,
    beta_schedule='scaled_linear', num_train_timesteps=1000)

In [None]:
# take in text embeddings and convert into 
def generate_latets(text_embeddings, resolution=(512, 512), num_inference_steps=50, 
                     guidance_scale=7.5, latents=None):
  device = "cuda" if torch.cuda.is_available() else "cpu"

  height, width = resolution
  assert(height % 8 == 0 and width % 8 == 0)

  # start with random noisy latents to start denoise (can pass custom latents)
  if latents is None:
    latents = torch.randn((text_embeddings.shape[0] // 2, unet.in_channels, \
                           height // 8, width // 8))
  latents = latents.to(device) # move to gpu

  # handling how to take denoise prediction & apply to latence optimally
  scheduler.set_timesteps(num_inference_steps)
  latents = latents * scheduler.sigmas[0] # scaling

  with autocast('cuda'):
    
    for i, t in tqdm(enumerate(scheduler.timesteps)):
      # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
      latent_model_input = torch.cat([latents] * 2)
      sigma = scheduler.sigmas[i]
      latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

      # predict the noise residual
      with torch.no_grad():
        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']

      # perform guidance
      noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
      noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

      # compute the previous noisy sample x_t -> x_t-1 (undoing noise)
      latents = scheduler.step(noise_pred, i, latents)['prev_sample']
  
  return latents

In [None]:
embeddings = get_text_embeds(prompify())

In [None]:
latents = generate_latets(embeddings)