In [1]:
import subprocess
import os

# Source the host's proxy helper script in a bash subshell and capture every
# environment entry whose line contains "proxy".
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout

# Mirror each NAME=value pair into this process's environment so that later
# downloads in this kernel see the same proxy settings. Lines without "=" are
# skipped; only the first "=" separates the name from the value.
os.environ.update(
    entry.split('=', 1) for entry in output.splitlines() if '=' in entry
)

In [None]:
import torch
import math
import matplotlib.pyplot as plt

from PIL import Image
from diffusers import FluxPipeline
from torch import Tensor
from torchvision import transforms

# Working precision for the pipeline; also read by encode_imgs and
# forward_denoise defined in later cells.
DTYPE = torch.bfloat16
# Load the pretrained FLUX.1-dev pipeline with its weights in DTYPE.
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=DTYPE)
# Move the pipeline to the GPU; forward_denoise creates its noise on "cuda" as well.
pipe.to("cuda")

In [3]:
@torch.inference_mode()
def decode_imgs(latents, pipeline):
    """Decode VAE latents into PIL images.

    Args:
        latents: Latents in the normalised space produced by ``encode_imgs``.
        pipeline: Object exposing ``vae`` (with ``config.scaling_factor``,
            ``config.shift_factor`` and ``decode``) and ``image_processor``.

    Returns:
        Whatever ``pipeline.image_processor.postprocess`` returns for
        ``output_type="pil"``.
    """
    vae = pipeline.vae
    # Undo the latent normalisation: divide by the scale, then add the shift.
    raw_latents = latents / vae.config.scaling_factor + vae.config.shift_factor
    decoded = vae.decode(raw_latents)[0]
    return pipeline.image_processor.postprocess(decoded, output_type="pil")

@torch.inference_mode()
def encode_imgs(imgs, pipeline):
    """Encode image tensors into normalised VAE latents.

    Args:
        imgs: Image batch accepted by ``pipeline.vae.encode``.
        pipeline: Object exposing ``vae`` with ``encode`` and a ``config``
            carrying ``shift_factor`` and ``scaling_factor``.

    Returns:
        A latent tensor sampled from the encoder's posterior, shifted and
        scaled, then cast to the notebook-level ``DTYPE``.
    """
    vae = pipeline.vae
    # Sampling (rather than taking the mode) makes this call stochastic.
    sampled = vae.encode(imgs).latent_dist.sample()
    # Inverse of the transform applied in decode_imgs: subtract shift, then scale.
    normalised = (sampled - vae.config.shift_factor) * vae.config.scaling_factor
    return normalised.to(dtype=DTYPE)

def get_noise(
    num_samples: int,
    height: int,
    width: int,
    device: torch.device,
    dtype: torch.dtype,
    seed: int,
):
    """Draw seeded Gaussian noise shaped like the VAE latents of an image.

    Args:
        num_samples: Batch size.
        height: Image height in pixels.
        width: Image width in pixels.
        device: Device for both the generator and the returned tensor.
        dtype: Dtype of the returned tensor.
        seed: Seed for the random generator.

    Returns:
        Tensor of shape ``[num_samples, 16, 2 * ceil(height / 16),
        2 * ceil(width / 16)]``.
    """
    rng = torch.Generator(device=device)
    rng.manual_seed(seed)

    # Each spatial size is rounded up to an even number of latent cells.
    latent_h = 2 * math.ceil(height / 16)
    latent_w = 2 * math.ceil(width / 16)
    shape = (num_samples, 16, latent_h, latent_w)

    return torch.randn(shape, generator=rng, device=device, dtype=dtype)

In [None]:
def time_shift(mu: float, sigma: float, t: Tensor):
    """Warp timesteps ``t`` with the shift ``exp(mu) / (exp(mu) + (1/t - 1) ** sigma)``.

    Args:
        mu: Shift strength; enters the formula as ``exp(mu)``.
        sigma: Exponent applied to ``(1 / t - 1)``.
        t: Timesteps to warp.

    Returns:
        The warped timesteps, same type/shape as the arithmetic on ``t`` yields.
    """
    shift_scale = math.exp(mu)
    inverse_odds = (1 / t - 1) ** sigma
    return shift_scale / (shift_scale + inverse_odds)

def get_lin_function(
    x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
):
    """Return the straight line through ``(x1, y1)`` and ``(x2, y2)`` as a callable.

    Args:
        x1, y1: First point on the line.
        x2, y2: Second point on the line.

    Returns:
        A one-argument function mapping ``x`` to ``slope * x + intercept``.
    """
    slope = (y2 - y1) / (x2 - x1)
    intercept = y1 - slope * x1

    def line(x):
        return slope * x + intercept

    return line

def get_schedule(
    num_steps: int,
    image_seq_len: int,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
    shift: bool = True,
) -> list[float]:
    """Build the sampling schedule from t=1 down to t=0.

    Args:
        num_steps: Number of integration steps; the schedule has
            ``num_steps + 1`` entries (both endpoints included).
        image_seq_len: Sequence length used to pick the shift strength.
        base_shift: Shift value the linear map takes at its first anchor
            (``get_lin_function``'s ``y1``).
        max_shift: Shift value at the second anchor (``y2``).
        shift: When True, warp the uniform grid with ``time_shift``.

    Returns:
        The schedule as a list of Python floats, in decreasing order.
    """
    uniform_grid = torch.linspace(1, 0, num_steps + 1)
    if not shift:
        return uniform_grid.tolist()

    # The shift strength is interpolated linearly from the sequence length.
    mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
    return time_shift(mu, 1.0, uniform_grid).tolist()

# Preview the schedule for a 4-step run at 1024x1024. The list has
# num_steps + 1 entries because both endpoints are included.
timesteps = get_schedule(
    num_steps=4,
    image_seq_len=(1024 // 16) * (1024 // 16),  # one token per 16x16 pixel patch
    shift=True,  # True for Flux-dev, False for Flux-schnell
)

print(timesteps)

In [5]:
@torch.inference_mode()
def forward_denoise(pipeline, num_steps, prompt, height, width, guidance_scale=3.5, seed=0):
    """Generate image latents by Euler-integrating the Flux transformer from noise.

    Starting from seeded Gaussian noise at t=1, the transformer's prediction is
    integrated along the schedule from ``get_schedule`` down to t=0.

    Args:
        pipeline: Flux pipeline providing ``encode_prompt``, ``transformer``,
            ``progress_bar`` and ``vae_scale_factor``.
        num_steps: Number of Euler steps.
        prompt: Text prompt; used for both text encoders.
        height: Image height in pixels.
        width: Image width in pixels.
        guidance_scale: Guidance value passed to the transformer for every sample.
        seed: Seed for the initial noise.

    Returns:
        The unpacked image latents, ready for ``decode_imgs``.
    """
    timesteps = get_schedule( # list of num_steps + 1 values, from 1.0 down to 0.0
            num_steps=num_steps,
            image_seq_len=(height // 16) * (width // 16), # vae_scale_factor = 16
            shift=True,  # Set True for Flux-dev, False for Flux-schnell
        )

    prompt_embeds, pooled_prompt_embeds, text_ids = pipeline.encode_prompt(prompt=prompt, prompt_2=prompt)

    noise = get_noise( # shape [num_samples, 16, 2 * ceil(height / 16), 2 * ceil(width / 16)]
        num_samples=1,
        height=height,
        width=width,
        device="cuda",
        dtype=DTYPE,
        seed=seed,
    )

    latent_image_ids = FluxPipeline._prepare_latent_image_ids(
        noise.shape[0],
        noise.shape[2],
        noise.shape[3],
        noise.device,
        DTYPE,
    )

    packed_latents = FluxPipeline._pack_latents( # shape [num_samples, (resolution // 16 * resolution // 16), 16 * 2 * 2]
        noise,
        batch_size=noise.shape[0],
        num_channels_latents=noise.shape[1],
        height=noise.shape[2],
        width=noise.shape[3],
    )

    # Denoising loop in latent space: walk consecutive schedule pairs from t=1 to t=0.
    with pipeline.progress_bar(total=len(timesteps)-1) as progress_bar:
        for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
            t_vec = torch.full((packed_latents.shape[0],), t_curr, dtype=packed_latents.dtype, device=packed_latents.device)
            guidance_vec = torch.full((packed_latents.shape[0],), guidance_scale, device=packed_latents.device, dtype=packed_latents.dtype)
            print(f"time step: {t_vec[0]}")
            pred = pipeline.transformer(
                    hidden_states=packed_latents, # shape: [batch_size, seq_len, num_channels_latents], e.g. [1, 4096, 64] for 1024x1024
                    timestep=t_vec,        # range: [0, 1]
                    guidance=guidance_vec, # scalar guidance values for each sample in the batch
                    pooled_projections=pooled_prompt_embeds, # CLIP text embedding
                    encoder_hidden_states=prompt_embeds,     # T5 text embedding
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    joint_attention_kwargs=None,
                    # Request a plain tuple so that [0] is the prediction tensor.
                    # (The pipeline object was previously passed here by mistake
                    # and only worked because it is truthy.)
                    return_dict=False,
                )[0]
            # Euler step; (t_prev - t_curr) is negative because the schedule decreases.
            packed_latents = packed_latents + (t_prev - t_curr) * pred
            progress_bar.update()

    img_latents = FluxPipeline._unpack_latents( # save, shape [num_samples, 16, resolution//8, resolution//8]
            packed_latents,
            height=height,
            width=width,
            vae_scale_factor=pipeline.vae_scale_factor,
    )
    return img_latents

In [None]:
# prompt = 'A high resolution photo of a scientist, white background, photo-realistic, high-detail'

# prompt = 'A vibrant, starry night sky illuminates a lively street café, with warm golden lights spilling from its windows. The café is nestled on a narrow cobblestone street, surrounded by rustic buildings with swirling, textured brushstrokes. Bold, dynamic colors—deep blues and glowing yellows—fill the scene. People are seated at small round tables, sipping coffee, and chatting. The atmosphere is cozy and inviting, yet full of movement and energy, capturing the timeless essence of a Van Gogh painting.'

# prompt = 'An exquisite gothic queen vampiress with dark blue hair and crimson red eyes: Her sensuous white skin gleams in the atmospheric, dense fog, creating an epic and dramatic mood. This hyper-realistic portrait is filled with morbid beauty, from her gothicattire to the intense lighting that highlights every intricate detail. The scene combines glamour with dark, mysterious elements, blending fantasy and horror in a visually stunning way.'

# prompt = 'Jewelry design, a ring with bright rose-cut blue diamonds, surrounded by small lily-of-the-valley flower-shaped diamonds, golden stems form the ring of the ring. The center of the base is a beautiful rose gold, with a detachable black ring on both sides'

# prompt = "Sci-fi entity with a mix of organic and mechanical elements: This oil painting-style portrait features a figure with a heavily brush-stroked texture, focusing on the upper body. The entity's gaze is locked, evoking a sense of horror tied to technology. The black and chrome color scheme, inspired by Tsutomu Nihei’s dystopian environments, creates a chaotic, hyper-detailed composition filled with raw, ultra-realistic elements.",

# prompt =  "Two cats dressed as samurais engaging in a duel, inspired by Akira Kurosawa's movie style. The photorealistic artwork is rich in high-detail textures, capturing the intensity and elegance of their battle. The scene is both humorous and masterful, blending realism with fantasy.",

# prompt =  "A red race car rendered in the style of Sam Spratt, blending historical illustrations with old masters' monochromatic realism. Influences from Genndy Tartakovsky and Masaccio give the car a soggy, gritty texture, evoking a sense of timeless speed and power."


# prompt = "An Asian girl wearing an elegant top or dress, set against a vibrant, neon-infused night. The style draws from iconic album covers with soft-focus portraits, combining light purple and amber hues. The scene feels both modern and timeless, with chicano-inspired elements adding depth to the image.", # 24

# prompt = "Close-up of a red rose breaking through a cube of cracked ice. The frosted surface glows with a cinematic light, with blood dripping from the petals, creating a stark contrast. The melting ice enhances the Valentine’s Day theme, with sharp focus and intricate, dramatic details." # 123

# Active prompt for the generation loop at the end of this cell; the other
# candidates around it are commented out. The trailing number looks like the
# seed paired with this prompt (it matches seed=123 in the loop) -- TODO confirm.
prompt = "A painting of a beautiful woman in an abstract, non-representational style. The image uses bold colors and shapes to express emotions and feelings. The imaginative composition features stunning details, blending artistic elements into a visually captivating piece." # 123

# prompt = "Minimal home office design with warm sunlight and artificial lighting. The soft atmosphere is enhanced by carefully placed objets, creating a clean, inviting, and warm workspace with a focus on simplicity and functionality." # 8

# prompt = "Front view of a Mediterranean terrace captured at sunset. Terracotta tiles, wrought iron details, and lush plantings create a warm, inviting atmosphere. Shot with a Panasonic Lumix S1R, 50mm f/1.4 lens, capturing the tranquil ambiance perfectly." # 256

# prompt = "A steampunk city with towering skyscrapers and intricate clockwork mechanisms. Steam billows from chimneys, and airships navigate skylanes above. The city is alive with gears and pistons, all rendered in 32K UHD with dynamic angles and highly detailed professional photography." # 213

# prompt = "A hummingbird flying near a flower in a forest. The masterpiece captures the dynamic motion of the bird, with natural light casting a soft glow on its feathers. The photorealistic scene is hyper-detailed, from the bird’s delicate wings to the vibrant surroundings, evoking a sense of wonder in wildlife photography." # 9

# prompt = "A hyper-detailed resin ring, featuring an intricate sci-fi city inside, illuminated by glowing LEDs. The ring is set against a cyberpunk background with vivid colors and futuristic elements, blending jewelry design with imaginative, sci-fi architecture." # 284

# prompt = "A cinematic shot of a cyberpunk industrial city, featuring tall futuristic buildings illuminated by neon lights. The scene is set at sunset, with warm colors and high detail. The high-resolution image captures the striking contrast between modern architecture and the glowing futuristic skyline." # 375

# prompt = "Photorealistic black semi-glossy retail shop facade, captured with a Canon EOS 5D. The geometric outline of the building contrasts with the motion-blurred crowd and car tail lights. Dramatic lighting effects highlight the sleek, award-winning design in a rainy, ultra-realistic scene." # 5

# prompt = "An 80s synthwave-inspired purple cat wearing bright orange sunglasses. The scene features a vivid sunset in the background, with neon colors and high-detail design, creating a nostalgic, futuristic vibe with a playful, retro twist." # 39

# prompt = "3D rendering of a Gundam face from the front, with a tactile paper-work style. The vibrant color palette includes burgundy, yellow, and blue. Emphasis on rough-textured paper, creating a handmade, masterpiece-like effect." # 326

# prompt = "A poster titled 'A Family Veterinarian,' featuring a watercolor portrait of a West Highland Terrier. The barnboard background adds warmth, with the tagline 'Caring is our duty' below, evoking trust and compassion in this heartwarming design." # 13

# prompt = "A lion painted in whimsical watercolors, featuring a mix of dark yellow, green, and dark purple tones. The characterful pen and ink style, combined with light orange and navy colors, creates a dynamic, bold animal portrait with a surreal twist." # 283

# Step counts to compare; one image is generated and saved per entry.
steps_list = [4, 6, 8, 12, 16, 28, 50]

# steps_list = [4]

# Create the output directory up front so that `out.save` does not fail with
# FileNotFoundError on a fresh machine.
output_dir = "./aqua_imgs"
os.makedirs(output_dir, exist_ok=True)

# The 2x4 comparison grid is disabled together with the plotting code below;
# creating it unconditionally only produced an empty figure that stayed open.
# fig, axes = plt.subplots(2, 4, figsize=(14, 7))

# `i` is only needed by the optional plotting code inside the loop.
for i, num_steps in enumerate(steps_list):
    img_latents = forward_denoise(pipe, num_steps=num_steps, prompt=prompt,
                                  height=1024, width=1024, guidance_scale=3.5, seed=123) # 66
    out = decode_imgs(img_latents, pipe)[0]

    # ax = axes[i // 4, i % 4]
    # ax.imshow(out)
    # ax.set_title(f"{num_steps} Steps")
    # ax.axis('off')

    # out.save(f"./aqua_imgs/lion_{num_steps}_steps.png")
    out.save(f"{output_dir}/woman_paint_reflow_{num_steps}_steps.png")

# fig.suptitle("schewed t training, schewed t inference, 3rf", fontsize=16)
# plt.tight_layout()
# plt.show()

In [54]:
# prompt = "A AQUACOLTOK watercolor painting of a small, cozy café with outdoor seating, surrounded by flowering plants. The scene captures the charm of a quiet morning with the sun shining on the cobblestone streets. High quality, detailed architecture."

# prompt = "A AQUACOLTOK watercolor painting of UT campus"

# prompt = "A photo of UT campus"

# prompt = "A photo of a sks Corgi puppy swimming hardly underwater, best quality, high resolution"  # 123

# prompt = "A photo of sks Corgi puppy wearing sunglasses, smiling at the camera, close-up look, highly detailed, ultra-realistic." # 33

# prompt = "A photo of sks Corgi puppy riding in a basket on a bicycle through a European village, high resolution, bright and cheerful." # 33

# prompt = "A photo of sks Corgi puppy in a futuristic city, standing on a hoverboard, cyberpunk style, highly detailed, vibrant colors." # 12

# NOTE(review): the "sks" token presumably relies on the DreamBooth LoRA that is
# loaded in a separate cell; make sure that cell has run first -- TODO confirm.
prompt = "A sks Corgi puppy wearing a superhero costume, flying through the city, dynamic pose, ultra-realistic, high resolution." # 12

# Step counts to compare; one image is generated and saved per entry.
steps_list = [4, 6, 8, 12, 16, 28, 50]

# steps_list = [4]

# Create the output directory up front so that `out.save` does not fail with
# FileNotFoundError on a fresh machine.
output_dir = "./sksdog"
os.makedirs(output_dir, exist_ok=True)

for num_steps in steps_list:
    img_latents = forward_denoise(pipe, num_steps=num_steps, prompt=prompt,
                                  height=1024, width=1024, guidance_scale=3.5, seed=12) # 66
    out = decode_imgs(img_latents, pipe)[0]

    # out.save(f"./aqua_imgs/lion_{num_steps}_steps.png")
    out.save(f"{output_dir}/sksdog_hero_{num_steps}step.png")

  0%|          | 0/4 [00:00<?, ?it/s]

time step: 1.0
time step: 0.90625
time step: 0.7578125
time step: 0.51171875


  0%|          | 0/6 [00:00<?, ?it/s]

time step: 1.0
time step: 0.94140625
time step: 0.86328125
time step: 0.7578125
time step: 0.61328125
time step: 0.38671875


  0%|          | 0/8 [00:00<?, ?it/s]

time step: 1.0
time step: 0.95703125
time step: 0.90625
time step: 0.83984375
time step: 0.7578125
time step: 0.65625
time step: 0.51171875
time step: 0.310546875


  0%|          | 0/12 [00:00<?, ?it/s]

time step: 1.0
time step: 0.97265625
time step: 0.94140625
time step: 0.90625
time step: 0.86328125
time step: 0.81640625
time step: 0.7578125
time step: 0.69140625
time step: 0.61328125
time step: 0.51171875
time step: 0.38671875
time step: 0.22265625


  0%|          | 0/16 [00:00<?, ?it/s]

time step: 1.0
time step: 0.98046875
time step: 0.95703125
time step: 0.93359375
time step: 0.90625
time step: 0.875
time step: 0.83984375
time step: 0.80078125
time step: 0.7578125
time step: 0.7109375
time step: 0.65625
time step: 0.58984375
time step: 0.51171875
time step: 0.421875
time step: 0.310546875
time step: 0.173828125


  0%|          | 0/28 [00:00<?, ?it/s]

time step: 1.0
time step: 0.98828125
time step: 0.9765625
time step: 0.96484375
time step: 0.94921875
time step: 0.9375
time step: 0.921875
time step: 0.90625
time step: 0.88671875
time step: 0.87109375
time step: 0.8515625
time step: 0.828125
time step: 0.80859375
time step: 0.78515625
time step: 0.7578125
time step: 0.73046875
time step: 0.703125
time step: 0.671875
time step: 0.63671875
time step: 0.59765625
time step: 0.55859375
time step: 0.51171875
time step: 0.462890625
time step: 0.40625
time step: 0.345703125
time step: 0.275390625
time step: 0.1953125
time step: 0.1044921875


  0%|          | 0/50 [00:00<?, ?it/s]

time step: 1.0
time step: 0.9921875
time step: 0.98828125
time step: 0.98046875
time step: 0.97265625
time step: 0.96484375
time step: 0.95703125
time step: 0.94921875
time step: 0.94140625


In [44]:
# Remove any LoRA weights currently loaded into the pipeline before loading the new adapter.
pipe.unload_lora_weights()

# Load the sksdog reflow checkpoint and register it under the adapter name "reflow".
pipe.load_lora_weights(
    "/root/autodl-tmp/flux-lora-dreambooth/sksdog_reflow_3000.safetensors",
    adapter_name="reflow",
)

In [25]:
# Remove any LoRA weights currently loaded into the pipeline.
pipe.unload_lora_weights()

# Optional experiment (disabled): load a style LoRA and an accelerator LoRA side by side.
# pipe.load_lora_weights("/root/autodl-tmp/flux-lora-dreambooth/ghibsky-illustration.safetensors", adapter_name="style")
# pipe.load_lora_weights("/root/autodl-tmp/flux-lora-dreambooth/2rf-skewed_t-accelerator.safetensors", adapter_name="acclerator")

# Activate both adapters with equal weight.
# pipe.set_adapters(["style", "acclerator"], adapter_weights=[1., 1.])

# NOTE(review): the adapter names below ("accelerate", "water") do not match the
# names registered above ("style", "acclerator"); reconcile them before uncommenting.
# pipe.fuse_lora(adapter_names=["accelerate", "water"], lora_scale=1.0)

In [None]:
# prompt = "GHIBSKY style, a cat on a windowsill gazing out at a starry night sky and distant city lights" # seed = 123

# prompt = "GHIBSKY style, an orange Lamborghini driving down a hill road at night with a beautiful ocean view in the background, side view, no text" # 123

# prompt = "GHIBSKY style, cozy mountain cabin covered in snow, with smoke curling from the chimney and a warm, inviting light spilling through the windows" # 836

# Step counts to compare; one panel is drawn per entry.
steps_list = [4, 8, 16, 32]

# steps_list = [2, 32]

# 32

# Single-row layout with one panel per step count. squeeze=False keeps the
# returned grid two-dimensional even when steps_list has a single entry
# (plt.subplots would otherwise return a bare Axes and axes[i] would fail);
# taking row 0 gives a 1-D sequence of Axes in every case.
fig, axes_grid = plt.subplots(1, len(steps_list), figsize=(10, 4), dpi=300, squeeze=False)
axes = axes_grid[0]

# NOTE: `prompt` is not assigned in this cell (every candidate above is
# commented out), so the value left by a previously executed cell is used.
for i, num_steps in enumerate(steps_list):
    img_latents = forward_denoise(pipe, num_steps=num_steps, prompt=prompt,
                                  height=1024, width=768, guidance_scale=3.5, seed=8)
    out = decode_imgs(img_latents, pipe)[0]

    ax = axes[i]
    ax.imshow(out)
    ax.set_title(f"{num_steps} Steps")
    ax.axis('off')

fig.suptitle("TurboRender-LoRA")
# fig.suptitle("Flux-dev baseline")
plt.tight_layout()
plt.show()
