In [1]:
# Scripts Installation Path: 'C:\Users\Talha\AppData\Local\Programs\Python\Python312\Scripts'

# Libraries installed: 
# 1. diffusers
# 2. transformers
# 3. accelerate
# 4. hf_xet

import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt


import torch, transformers, shutil, os

from diffusers                              import AutoencoderKL, UNet2DConditionModel, UNet2DModel, LMSDiscreteScheduler
from transformers                           import CLIPTextModel, CLIPTokenizer
from torchinfo                              import summary
from PIL                                    import Image
from torchvision                            import transforms as tfms
from tqdm.auto                              import tqdm
from IPython.display                        import display, clear_output
from config                                 import *
from architectures                          import *
from pathlib                                import Path

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
unet = UNet2DModel(**UNET_PARAMS).to(device)


In [None]:
inp = torch.randn(4, 8, 32, 32).to(dtype = torch.float16, device = device)
mask = torch.randn(4, 8, 32, 32).to(dtype = torch.float16, device = device)
t   = torch.randint(0, 1000, (4,), device=device).long()

summary(unet, input_data = [inp, t])

In [5]:
## Initiating tokenizer and encoder.
tokenizer    = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16).to("cpu")

## Initiating the VAE
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", torch_dtype=torch.float16).to(device)

## Initializing a scheduler and Setting number of sampling steps
scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
scheduler.set_timesteps(50)

## Initializing the U-Net model
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet", torch_dtype=torch.float16).to(device)

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# Steps to run VAE (83.65 Million Params)

# 1) Make sure input is dtype torch.float16 and on the same device as the model.
# 2) The forward pass consists of 3 steps
#    i)   posterior  = vae.encode(inputs).latent_dist --> gives a DiagnolGaussianDistribution Object which has the mean, logvar etc as its self members
#    ii)  pos_sample = posterior.sample() --> simply returns a latent representation sample by x = self.mean + self.std * epsilon (of same shape as self.mean) with 
#         latent_dim = H/8, W /8
#    iii) recon      = vae.decode(pos_sample).sample --> returns the reconstruction which is logits I believe.

# Example Usage
# >>> inputs = torch.randn(1, 3, 256, 256).to(torch.float16).to('cuda')
# >>> recon  = vae(inputs)

In [3]:
inputs = torch.randn(1, 3, 256, 256).to(torch.float16).to('cuda')
posterior = vae.encode(inputs).latent_dist

In [8]:
posterior

<diffusers.models.autoencoders.vae.DiagonalGaussianDistribution at 0x7f02942f2850>

In [10]:
## Helper functions
def load_image(p):
    '''
    Function to load images from a defined path
    '''
    return Image.open(p).convert('RGB').resize((512,512))

def pil_to_latents(image):
    '''
    Function to convert image to latents
    '''
    init_image = tfms.ToTensor()(image).unsqueeze(0) * 2.0 - 1.0
    init_image = init_image.to(device="cuda", dtype=torch.float16) 
    init_latent_dist = vae.encode(init_image).latent_dist.sample() * 0.18215
    return init_latent_dist

def latents_to_pil(latents):
    '''
    Function to convert latents to images
    '''
    latents = (1 / 0.18215) * latents
    with torch.no_grad():
        image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(image) for image in images]
    return pil_images


def text_enc(prompts, maxlen=None):
    '''
    A function to take a texual promt and convert it into embeddings
    '''
    if maxlen is None: maxlen = tokenizer.model_max_length
    inp = tokenizer(prompts, padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt") 
    return text_encoder(inp.input_ids.to("cpu"))[0].half()


In [12]:
prompts = 'A dog wearing a hat'
# bs      = len(prompts) # 19
# text    = text_enc(prompts) # (1, 77, 768) 
# uncond  = text_enc([''] * bs, text.shape[1]) # (19, 77, 768)
# emb     = torch.cat([uncond, text])          # (20, 77, 768) 

# emb_filtered     = emb[:2, :, :]

# print(f'text_encoding shape: {text.shape}, uncond_text.shape: {uncond.shape}, emb.shape: {emb.shape}, emb_filtered {emb_filtered.shape}')  


In [27]:
# Instanting random noise which will be noised and feeded to unet along with prompt. Note see 'https://huggingface.co/CompVis/stable-diffusion-v1-4/blob/main/unet/config.json' for the config for unet in SD 1.4
dim = 512
latents = torch.randn((bs, unet.config.in_channels, dim//8, dim//8))# Initiating random noise

# Setting number of steps in scheduler
scheduler.set_timesteps(70)

# Adding noise to the latents 
latents = latents.to("cuda").half() * scheduler.init_noise_sigma
latents.shape

torch.Size([23, 4, 64, 64])

In [None]:
def prompt_2_img(prompts, g=7.5, seed=100, steps=70, dim=512, save_int=True):
    """
    Diffusion process to convert prompt to image
    """
    
    # Defining batch size
    bs = len(prompts) 
    
    # Converting textual prompts to embedding
    text = text_enc(prompts) 
    
    # Adding an unconditional prompt , helps in the generation process
    uncond =  text_enc([""] * bs, text.shape[1])
    emb    = torch.cat([uncond, text])
    
    # Setting the seed
    if seed: torch.manual_seed(seed)
    
    # Initiating random noise
    latents = torch.randn((bs, unet.config.in_channels, dim//8, dim//8))
    
    # Setting number of steps in scheduler
    scheduler.set_timesteps(steps)
    
    # Adding noise to the latents 
    latents = latents.to("cuda").half() * scheduler.init_noise_sigma

    print("Processing text prompts:", prompts)
    # Just before the loop starts:
    print("Visualizing initial latents...")
    latents_norm = torch.norm(latents.view(latents.shape[0], -1), dim=1).mean().item()
    print(f"Initial Latents Norm: {latents_norm}")

    # Iterating through defined steps
    for i,ts in enumerate(tqdm(scheduler.timesteps)):
        # We need to scale the i/p latents to match the variance
        inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts) # [2 * B, 4, 32, 32]
        
        # Predicting noise residual using U-Net
        print(f'ts: {ts}, inp.shape {inp.shape}, emb.shape {emb.shape}')
        with torch.no_grad(): u, t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
            
        # Performing Guidance
        pred = u + g*(t-u)
        
        # Conditioning  the latents
        latents = scheduler.step(pred, ts, latents).prev_sample
        
        # Inside your loop, after `latents` have been updated:
        latents_norm = torch.norm(latents.view(latents.shape[0], -1), dim=1).mean().item()
        print(f"Step {i+1}/{steps} Latents Norm: {latents_norm}")
        
        from IPython.display import display, clear_output
        if   save_int and i%10==0: 
                image_path = f'steps2/la_{i:04d}.jpeg'
                latents_to_pil(latents)[0].save(image_path)
                display(latents_to_pil(latents)[0])  # Display the new image

    return latents_to_pil(latents)

In [14]:
images = prompt_2_img([prompts], save_int=True)
for img in images:display(img)


AssertionError: Torch not compiled with CUDA enabled