In [1]:
# Scene description used as the text prompt for image generation in this notebook.
# Written as adjacent string literals; they concatenate into one single-line string.
prompt = (
    "A green clock stands tall on a brick sidewalk, surrounded by parked bicycles and a white truck. "
    "Two men converse on the sidewalk, one wearing a red shirt and the other a grey hoodie. "
    "Trees with sparse foliage line the street, and a white brick building looms in the background."
)

In [None]:
from diffusers import DiffusionPipeline
from diffusers.utils import pt_to_pil, make_image_grid
import torch
import os

# Model files are cached in a "cache" folder under the current working directory.
parent_path = os.getcwd()
cache_dir = os.path.join(parent_path, "cache")

# stage 1: DeepFloyd IF-I-XL, fp16 variant
stage_1 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-I-XL-v1.0",
    variant="fp16",
    torch_dtype=torch.float16,
    cache_dir=cache_dir,
    use_safetensors=True,
)
stage_1.enable_model_cpu_offload()

# stage 2: DeepFloyd IF-II-L, fp16 variant.
# text_encoder=None: no text encoder is passed in here; the generation cell
# hands this stage the prompt embeddings computed by stage 1 instead.
stage_2 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0",
    text_encoder=None,
    variant="fp16",
    torch_dtype=torch.float16,
    cache_dir=cache_dir,
    use_safetensors=True,
)
stage_2.enable_model_cpu_offload()

# stage 3: x4 upscaler, wired up with the feature extractor, safety checker and
# watermarker objects taken from stage 1.
safety_modules = dict(
    feature_extractor=stage_1.feature_extractor,
    safety_checker=stage_1.safety_checker,
    watermarker=stage_1.watermarker,
)
stage_3 = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-x4-upscaler",
    **safety_modules,
    torch_dtype=torch.float16,
    cache_dir=cache_dir,
    use_safetensors=True,
)
stage_3.enable_model_cpu_offload()


In [None]:
# Alternative prompt to try:
# prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'

# Fixed seed so that re-running this cell (e.g. Restart & Run All) reproduces the
# same images. The same generator is handed to every stage below.
generator = torch.manual_seed(1)

# text embeds: encoded once by stage 1 and reused by stage 2
prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt)

# stage 1: output_type="pt" keeps the result as tensors so it can be fed to stage 2
stage_1_output = stage_1(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    generator=generator,
    output_type="pt",
).images
# Optional: pt_to_pil(stage_1_output)[0].save("./if_stage_I.png")

# stage 2: conditioned on the stage 1 images and the same embeddings
stage_2_output = stage_2(
    image=stage_1_output,
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    generator=generator,
    output_type="pt",
).images
# Optional: pt_to_pil(stage_2_output)[0].save("./if_stage_II.png")

# stage 3: takes the raw text prompt (not the embeddings) plus the stage 2 images
stage_3_output = stage_3(
    prompt=prompt,
    image=stage_2_output,
    noise_level=100,
    generator=generator,
).images
# Optional: stage_3_output[0].save("./if_stage_III.png")

# Side-by-side comparison of the first image from each stage.
# resize=1024 presumably rescales every tile to a common size so the differently
# sized stage outputs fit one grid -- confirm against diffusers' make_image_grid.
images = make_image_grid(
    [pt_to_pil(stage_1_output)[0], pt_to_pil(stage_2_output)[0], stage_3_output[0]],
    rows=1,
    cols=3,
    resize=1024,
)

In [None]:
# Display the first stage 2 image inline (converted from the "pt" output via pt_to_pil).
pt_to_pil(stage_2_output)[0]

In [None]:
# Display the 1x3 comparison grid assigned to `images` by make_image_grid.
images

In [None]:
import generator as gen
import os

# Model cache folder: "<current working directory>/cache"; printed for reference.
parent_path = os.getcwd()
cache_dir = os.path.join(parent_path, "cache")
print(cache_dir)

# Model repo ids, listed in the same order as the stage1/stage2/stage3 unpacking below.
repo_ids = [
    "DeepFloyd/IF-I-XL-v1.0",
    "DeepFloyd/IF-II-L-v1.0",
    "stabilityai/stable-diffusion-x4-upscaler",
]
# NOTE(review): presumably the index of the GPU to load onto -- confirm against
# generator.get_deepfloydif_pipeline.
gpu_id = 0

stage1, stage2, stage3 = gen.get_deepfloydif_pipeline(repo_ids, cache_dir, gpu_id)

In [None]:
# Submit the same prompt twice as a two-item batch, with seeding enabled (seed 8).
prompts = [prompt] * 2
images = gen.run_deepfloydif_t2i(
    stage1,
    stage2,
    stage3,
    prompts,
    manual_seed=True,
    seed=8,
)

In [5]:
# Inspect the batch result: in the recorded output this is a list of two entries,
# each a [256x256, 1024x1024] pair of RGB PIL images.
images

[[<PIL.Image.Image image mode=RGB size=256x256>,
  <PIL.Image.Image image mode=RGB size=1024x1024>],
 [<PIL.Image.Image image mode=RGB size=256x256>,
  <PIL.Image.Image image mode=RGB size=1024x1024>]]