In [1]:
from pathlib import Path 
print(Path.cwd().parents[0]) 

/home/amzad/Desktop/stable_diffusion


In [2]:
import torch
from torch import nn
import math

class SelfAttention(nn.Module):
    def __init__(self, n_heads, d_embed, in_proj_bias=True, out_proj_bias=True):
        super().__init__()
        self.in_proj = nn.Linear(d_embed, 3 * d_embed, bias=in_proj_bias)
        self.out_proj = nn.Linear(d_embed, d_embed, bias=out_proj_bias)
        self.n_heads = n_heads
        self.d_head = d_embed // n_heads

    def forward(self, x, causal_mask=False):
        input_shape = x.shape 
        batch_size, sequence_length, d_embed = input_shape 
        interim_shape = (batch_size, sequence_length, self.n_heads, self.d_head)
        q, k, v = self.in_proj(x).chunk(3, dim=-1)
        q = q.view(interim_shape).transpose(1, 2)
        k = k.view(interim_shape).transpose(1, 2)
        v = v.view(interim_shape).transpose(1, 2)
        weight = q @ k.transpose(-1, -2)
        if causal_mask:
            mask = torch.ones_like(weight, dtype=torch.bool).triu(1)
            weight.masked_fill_(mask, -torch.inf)
        weight /= math.sqrt(self.d_head)
        weight = nn.functional.softmax(weight, dim=-1)
        output = weight @ v
        output = output.transpose(1, 2)
        output = output.reshape(input_shape)
        output = self.out_proj(output)
        return output


class VAE_attention(nn.Module):
    """
    VAE_attention is an attention mechanism used in Variational Autoencoders (VAEs).
    It applies Group Normalization followed by a multi-head attention mechanism.
    """

    def __init__(self, channels, num_heads):
        """
        Initializes the VAE_attention.

        Parameters:
        channels (int): Number of input and output channels.
        num_heads (int): Number of attention heads.
        """
        super(VAE_attention, self).__init__()
        self.groupnorm = nn.GroupNorm(32, channels)
        self.in_proj = nn.Linear(channels, 3 * channels, bias=True)
        self.out_proj = nn.Linear(channels, channels, bias=False)
        self.attention = nn.MultiheadAttention(channels, num_heads)

    def forward(self, x):
        """
        Forward pass of the VAE_attention.

        Parameters:
        x (torch.Tensor): Input tensor with shape (Batch_Size, Features, Height, Width).

        Returns:
        torch.Tensor: Output tensor after applying attention mechanism.
        """
        x = self.groupnorm(x)  # Apply Group Normalization

        n, c, h, w = x.shape  # Get the shape of the input tensor
        key, query, value = self.in_proj(x).chunk(3, dim=-1)  
        print("key.shape:", key.shape )
        print("query.shape:", query.shape )
        print("value.shape:", value.shape )

        key = key.view(n, h * w, c).transpose(1, 2)  # Reshape key tensor
        query = query.view(n, h * w, c).transpose(1, 2)
        value = value.view(n, h * w, c).transpose(1, 2)
        out, _ = self.attention(query, key, value)  
        out = out.transpose(1, 2).view(n, c, h, w)  
        out = self.out_proj(out)  
        return out 

def test_vae_attention():
    torch.manual_seed(0)
    batch_size = 2
    channels = 64
    height = 64
    width = 4
    num_heads = 2
    x = torch.randn(batch_size, channels, height, width)

    vae_attn = VAE_attention(channels, num_heads)
    output = vae_attn(x)

    print("Output:\n", output)

    assert output.shape == x.shape, "Output shape is incorrect!"

if __name__ == "__main__":
    test_vae_attention()



RuntimeError: mat1 and mat2 shapes cannot be multiplied (8192x4 and 64x192)

In [3]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

In [5]:
cpli_text_encoder = model.text_encoder
print(cpli_text_encoder)

AttributeError: 'CLIPModel' object has no attribute 'text_encoder'

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
checkpoint = "HuggingFaceTB/SmolLM-1.7B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for fp16 use `torch_dtype=torch.float16` instead
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.bfloat16)
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to("cuda")
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

tokenizer_config.json:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [1]:
# pip install transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM-360M"
device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))
vae_prei

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


OutOfMemoryError: CUDA out of memory. Tried to allocate 180.00 MiB. GPU 