In [None]:
!pip install diffusers transformers accelerate

In [None]:
from diffusers import StableDiffusionPipeline

# Identifier of the Stable Diffusion v1.4 weights handed to from_pretrained.
sd_model_id = "CompVis/stable-diffusion-v1-4"
pipe = StableDiffusionPipeline.from_pretrained(sd_model_id)

In [None]:
from IPython.display import display

# Reference run with the diffusers pipeline, moved to the CUDA device first.
pipe.to("cuda")

# Alternative prompt to try: "a photograph of an astronaut riding a horse"
# (it used to be assigned here and then immediately overwritten, i.e. a dead store).
prompt = "a horse sized cat eating a bagel"
num_steps = 20

# Request a 512x512 result and keep the first entry of the returned `.images`.
image = pipe(prompt, num_inference_steps=num_steps, height=512, width=512).images[0]
display(image)

In [None]:
!pip install git+https://github.com/tinygrad/tinygrad.git
!pip install git+https://github.com/Fatlonder/tinyfusers.git

In [1]:
from IPython.display import display
from tqdm import tqdm
from pathlib import Path
from PIL import Image
import numpy as np
import tempfile
from tinygrad import Device, GlobalCounters, dtypes, Tensor, TinyJit
from tinygrad.nn.state import torch_load, load_state_dict, get_state_dict
from tinygrad.helpers import Timing, Context, getenv, fetch, colored
from tinyfusers.variants.sd import StableDiffusion
from tinyfusers.tokenizer.clip import ClipTokenizer

In [2]:
# Run configuration for the tinygrad/tinyfusers pipeline, gathered in one dict.
default_prompt = "a horse sized cat eating a bagel"
args = {
  "prompt": default_prompt,   # text to render
  "steps": 20,                # controls the stride of the sampling schedule
  "fp16": True,
  "out": "rendered.png",      # file the final image is saved to
  "noshow": False,
  "timing": False,            # enables per-step timing output
  "guidance": 7.5,            # value passed to the model at every step
  "seed": 42,                 # RNG seed; None leaves the RNG unseeded
}

# Inference only: switch off gradient tracking before building the model.
Tensor.no_grad = True
model = StableDiffusion()

In [None]:
# Where the v1.4 checkpoint comes from, and the name passed to fetch() for it.
ckpt_url = 'https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt'
ckpt_name = 'sd-v1-4.ckpt'

# Fetch the checkpoint, read it with torch_load and copy its 'state_dict' entry
# into the model; strict=False is passed through to load_state_dict.
load_state_dict(model, torch_load(fetch(ckpt_url, ckpt_name))['state_dict'], strict=False)

In [4]:
# Convert every tensor in the model's state dict to float16, but only when the
# "fp16" option asks for it (the flag was previously defined yet never checked).
if args['fp16']:
  for weight in get_state_dict(model).values():
    # replace() swaps the tensor's contents in place with the realized fp16 copy
    weight.replace(weight.cast(dtypes.float16).realize())

In [None]:
# run through CLIP to get context
tokenizer = ClipTokenizer()

def _clip_context(text):
  """Tokenize `text` and return the realized output of the CLIP text model for it."""
  tokens = Tensor([tokenizer.encode(text)])
  return model.cond_stage_model.transformer.text_model(tokens).realize()

# Conditioning for the requested prompt.
context = _clip_context(args['prompt'])
print("got CLIP context", context.shape)

# Conditioning for the empty prompt.
unconditional_context = _clip_context("")
print("got unconditional CLIP context", unconditional_context.shape)

# done with clip model
# NOTE: after this delete the cell cannot be re-run without rebuilding `model`.
del model.cond_stage_model

In [None]:
# Sampling schedule: start at 1 and step by 1000//steps while below 1000.
timesteps = list(range(1, 1000, 1000//args['steps']))
print(f"running for {timesteps} timesteps")

# Cumulative alphas picked out at the scheduled timesteps, plus the same
# sequence shifted by one position with 1.0 prepended.
alphas = model.alphas_cumprod[Tensor(timesteps)]
alphas_prev = Tensor([1.0]).cat(alphas[:-1])

# start with random noise
# Seed through the public API instead of writing the private Tensor._seed
# attribute, so any additional RNG state tinygrad keeps is reset as well.
if args['seed'] is not None: Tensor.manual_seed(args['seed'])
latent = Tensor.randn(1,4,64,64)

@TinyJit
def run(model, *x):
  """Call `model` on the positional inputs `x` and return the realized result."""
  out = model(*x)
  return out.realize()

# this is diffusion
def _memory_note(_):
  """Suffix for the Timing message: GlobalCounters.mem_used expressed in GB."""
  return f", using {GlobalCounters.mem_used/1e9:.2f} GB"

# (index, timestep) pairs, visited from the last scheduled entry back to the first
reverse_schedule = list(enumerate(timesteps))[::-1]

with Context(BEAM=getenv("LATEBEAM")):
  progress = tqdm(reverse_schedule)
  for index, timestep in progress:
    GlobalCounters.reset()
    progress.set_description("%3d %3d" % (index, timestep))
    with Timing("step in ", enabled=args['timing'], on_exit=_memory_note):
      # one-element index tensor used to pick this step's alpha values
      step_idx = Tensor([index])
      latent = run(
        model, unconditional_context, context, latent,
        Tensor([timestep]), alphas[step_idx], alphas_prev[step_idx],
        Tensor([args['guidance']]),
      )
      # when timing, wait for the device so the measured step time is complete
      if args['timing']: Device[Device.DEFAULT].synchronize()
  # drop the jitted function once sampling is finished
  del run

# upsample latent space to image with autoencoder
x = model.decode(latent)
print(x.shape)

# Convert to a uint8 array (copy=False avoids a copy when no conversion is
# needed), wrap it in a PIL image and write it to the configured output path.
im = Image.fromarray(x.numpy().astype(np.uint8, copy=False))
im.save(args['out'])

# Honor the "noshow" option, which was previously defined but ignored.
if not args['noshow']: display(im)