In [1]:
p_config = {
    "auto_map": {
        "AutoImageProcessor": "image_processing_moonvit.MoonViTImageProcessor"
    },
    "in_token_limit": 4096,
    "patch_size": 14,
    "num_pooled_tokens": 1024,
    "image_mean": [
        0.5,
        0.5,
        0.5
    ],
    "image_std": [
        0.5,
        0.5,
        0.5
    ],
    "pad_input": True
}

In [2]:
from transformers.configuration_utils import PretrainedConfig


class MoonViTConfig(PretrainedConfig):
    model_type = "moonvit"

    def __init__(
        self,
        patch_size: int = 16,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 12,
        num_hidden_layers: int = 8,
        hidden_size: int = 768,
        intermediate_size: int = 3072,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        # Positional embedding config
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer config
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        # Patch merger config
        self.merge_kernel_size = merge_kernel_size


In [3]:
import math
import numpy as np
from PIL import Image
from typing import Optional, Union

import torch
from torchvision.transforms import functional as TF
from transformers.image_utils import ImageInput, make_list_of_images, valid_images
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.utils import TensorType


OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)


class MoonViTImageProcessor(BaseImageProcessor):
    model_type = "moonvit"

    def __init__(
        self,
        image_mean: tuple[float, float, float] = OPENAI_DATASET_MEAN,
        image_std: tuple[float, float, float] = OPENAI_DATASET_STD,
        in_token_limit: int = 4096,
        patch_size: int = 16,
        pad_input: bool = True,
        merge_kernel_size: list[int, int] = [2, 2],
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.in_token_limit = in_token_limit
        self.patch_size = patch_size
        self.pad_input = pad_input
        self.image_mean = image_mean
        self.image_std = image_std
        self.merge_kernel_size = merge_kernel_size

    def rescale(
        self, image: Image.Image, merge_kernel_size: list[int, int] = [2, 2]
    ) -> Image.Image:
        w, h = image.size
        patch_size = self.patch_size

        if (w // patch_size) * (h // patch_size) > self.in_token_limit:
            scale = math.sqrt(self.in_token_limit / ((w // patch_size) * (h // patch_size)))
            new_w, new_h = int(w * scale), int(h * scale)
            image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
        if self.pad_input:
            new_w, new_h = image.size
            pad_size_h = merge_kernel_size[0] * patch_size
            pad_size_w = merge_kernel_size[1] * patch_size

            pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
            pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w

            image = TF.pad(image, (0, 0, pad_w, pad_h))
        else:
            new_w, new_h = image.size
            new_w = new_w - new_w % patch_size
            new_h = new_h - new_h % patch_size
            image = TF.center_crop(image, (new_h, new_w))

        w, h = image.size
        if w // patch_size >= 512 or h // patch_size >= 512:
            raise ValueError("Exceed pos emb")

        return image

    def to_tensor(self, image: Image.Image) -> torch.Tensor:
        return TF.to_tensor(image.convert("RGB"))

    def normalize(self, image: torch.Tensor) -> torch.Tensor:
        return TF.normalize(image, self.image_mean, self.image_std)

    def patchify(self, image: torch.Tensor) -> tuple[torch.Tensor, list[int, int]]:
        patch_size = self.patch_size
        C, H, W = image.shape
        patches = image.reshape(C, H // patch_size, patch_size, W // patch_size, patch_size)
        patches = patches.permute(1, 3, 0, 2, 4)
        patches = patches.contiguous().view(-1, C, patch_size, patch_size)
        grid_hw = (H // patch_size, W // patch_size)
        return patches, grid_hw

    def _preprocess(self, image: ImageInput) -> tuple[torch.Tensor, list[int, int]]:
        """
        Preprocess image and patchify it.
        Args:
            image (`ImageInput`):
                Image to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
        Returns:
            patches: torch.Tensor
            grid_hw: list[int, int]
        """
        image = self.rescale(image, self.merge_kernel_size)
        image = self.to_tensor(image)
        image = self.normalize(image)
        patches, grid_hw = self.patchify(image)
        return patches, grid_hw

    def preprocess(
        self,
        images: ImageInput,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        pixel_values, image_grid_hws = [], []
        for image in images:
            patches, image_grid_hw = self._preprocess(image)
            pixel_values.append(patches)
            image_grid_hws.append(image_grid_hw)
        pixel_values = torch.concat(pixel_values, dim=0)
        image_grid_hws = np.array(image_grid_hws)
        data = {"pixel_values": pixel_values, "image_grid_hws": image_grid_hws}

        return BatchFeature(data=data, tensor_type=return_tensors)



2025-07-20 08:25:50.120084: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752999950.350649      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752999950.420045      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
img_processor = MoonViTImageProcessor()

In [5]:
from torchvision import transforms
to_pil = transforms.ToPILImage()
images = [to_pil(torch.rand(3, 256, 384)),to_pil(torch.rand(3, 256, 256))]

In [6]:
images_processed = img_processor(images, return_tensors="pt")

In [7]:
images_processed['pixel_values'].size()

torch.Size([640, 3, 16, 16])

In [8]:
images_processed.image_grid_hws

tensor([[16, 24],
        [16, 16]])

In [9]:
import math
from copy import deepcopy
from typing import Union, Tuple, Sequence, Optional, List

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.activations import PytorchGELUTanh
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import is_flash_attn_2_available



# if is_flash_attn_2_available():
#     from flash_attn import flash_attn_varlen_func
# else:
#     flash_attn_varlen_func = None


def multihead_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    q_cu_seqlens: Optional[torch.Tensor] = None,
    k_cu_seqlens: Optional[torch.Tensor] = None,
):
    """Multi-head attention using flash attention 2.
    Args:
        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
            or (tot_seqlens, num_heads, head_dim) if packing.
        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
            The first element should be 0 and the last element should be q.shape[0].
        k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k.
            The first element should be 0 and the last element should be k.shape[0].
    Returns:
        output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing,
            where dim = num_heads * head_dim
    """
    # Unified format legal check
    assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims"
    assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]"
    assert (
        k_cu_seqlens[-1] == k.shape[0] == v.shape[0]
    ), "k_cu_seqlens must sum to k.shape[0]"
    assert q.dtype in [
        torch.bfloat16,
        torch.float16,
    ], f"unsupported dtype {q.dtype} for multihead attn"

    max_seqlen_q = (q_cu_seqlens[1:] - q_cu_seqlens[:-1]).max().item()
    max_seqlen_k = (k_cu_seqlens[1:] - k_cu_seqlens[:-1]).max().item()
    attn_out = flash_attn_varlen_func(
        q,
        k,
        v,
        q_cu_seqlens,
        k_cu_seqlens,
        max_seqlen_q,
        max_seqlen_k,
        causal=False,
    )
    attn_out = attn_out.flatten(start_dim=-2)

    return attn_out


def sdpa_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    q_cu_seqlens: Optional[torch.Tensor] = None,
    k_cu_seqlens: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """SDPA attention.
    Args:
        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
            or (tot_seqlens, num_heads, head_dim) if packing.
    """
    seq_length = q.shape[0]
    attention_mask = torch.zeros(
        [1, seq_length, seq_length], device=q.device, dtype=torch.bool
    )
    for i in range(1, len(q_cu_seqlens)):
        attention_mask[
            ...,
            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
        ] = True
    q = q.transpose(0, 1)
    k = k.transpose(0, 1)
    v = v.transpose(0, 1)
    attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
    attn_output = attn_output.transpose(0, 1)
    attn_output = attn_output.reshape(seq_length, -1)
    return attn_output


def eager_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    q_cu_seqlens: Optional[torch.Tensor] = None,
    k_cu_seqlens: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    seq_length = q.shape[0]
    attention_mask = torch.zeros(
        [1, seq_length, seq_length], device=q.device, dtype=torch.bool
    )
    for i in range(1, len(q_cu_seqlens)):
        attention_mask[
            ...,
            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
        ] = True
    q = q.transpose(0, 1)
    k = k.transpose(0, 1)
    v = v.transpose(0, 1)

    attn_weight = q @ k.transpose(-2, -1) / math.sqrt(q.shape[-1])
    attn_weight += attention_mask
    attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32).to(q.dtype)

    attn_output = attn_weight @ v
    attn_output = attn_output.transpose(0, 1)
    attn_output = attn_output.reshape(seq_length, -1)
    return attn_output


VL_VISION_ATTENTION_FUNCTIONS = {
    # "flash_attention_2": multihead_attention,
    "sdpa": sdpa_attention,
    "eager": eager_attention,
}


def _apply_rope_input_validation(x, freqs_cis):
    assert x.ndim == freqs_cis.ndim + 1, (x.shape, freqs_cis.shape)
    assert x.shape[:-2] == freqs_cis.shape[:-1], (x.shape, freqs_cis.shape)
    assert x.shape[-1] == 2 * freqs_cis.shape[-1], (x.shape, freqs_cis.shape)
    assert freqs_cis.dtype == torch.complex64, freqs_cis.dtype

def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    orig_dtype = tensor.dtype
    tensor = tensor.float()
    cos = freqs.cos()
    sin = freqs.sin()
    cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
    sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
    output = (tensor * cos) + (rotate_half(tensor) * sin)
    output = output.to(orig_dtype)
    return output
    
def apply_rope(
    xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Args: (The leading dimensions of all inputs should be the same)
        xq: query, tensor of shape (..., num_heads, head_dim)
        xk: key, tensor of shape (..., num_heads, head_dim)
        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
    Returns:
        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
    """
    
    _apply_rope_input_validation(xq, freqs_cis)
    _apply_rope_input_validation(xk, freqs_cis)

    freqs_cis = freqs_cis.unsqueeze(-2)  # ..., 1, head_dim/2
    # ..., num_heads, head_dim/2
    xq_ = torch.view_as_complex(xq.float().view(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().view(*xq.shape[:-1], -1, 2))
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
    return xq_out.type_as(xq), xk_out.type_as(xk)

def apply_rope_real(
    xq: torch.Tensor, xk: torch.Tensor, freqs: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Args:
        xq, xk: (..., num_heads, head_dim)
        freqs: (..., head_dim//2) in radians
    Returns:
        xq_out, xk_out: (..., num_heads, head_dim)
    """
    def rotate_half(x):
        x1 = x[..., : x.shape[-1] // 2]
        x2 = x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)
    
    orig_dtype = xq.dtype
    xq = xq.float()
    xk = xk.float()

    # Convert freqs (radians) into cos and sin
    cos = freqs.cos().unsqueeze(-2)  # match head dimension
    sin = freqs.sin().unsqueeze(-2)

    # Expand to match xq/xk shape
    cos = cos.repeat_interleave(2, dim=-1)  # from head_dim/2 to head_dim
    sin = sin.repeat_interleave(2, dim=-1)

    xq_out = (xq * cos) + (rotate_half(xq) * sin)
    xk_out = (xk * cos) + (rotate_half(xk) * sin)

    return xq_out.to(orig_dtype), xk_out.to(orig_dtype)

class Learnable2DInterpPosEmb(nn.Module):
    def __init__(
        self, height: int, width: int, dim: int, interpolation_mode: str = "bicubic"
    ) -> None:
        super().__init__()
        self.height = height
        self.width = width
        self.interpolation_mode = interpolation_mode
        self.weight = nn.Parameter(torch.empty(height, width, dim))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.normal_(self.weight)

    def forward(self, x: torch.Tensor, grid_hws: torch.Tensor) -> torch.Tensor:
        pos_embs = []
        for shape in grid_hws.tolist():
            if shape == self.weight.shape[:-1]:
                pos_embs.append(self.weight.flatten(end_dim=1))
            else:
                pos_embs.append(
                    F.interpolate(
                        self.weight.permute((2, 0, 1)).unsqueeze(0),
                        size=shape,
                        mode=self.interpolation_mode,
                    )
                    .squeeze(0)
                    .permute((1, 2, 0))
                    .flatten(end_dim=1)
                )
        out = x + torch.cat(pos_embs)
        return out


class MoonVisionPatchEmbed(nn.Module):

    def __init__(
        self,
        out_dim: int,
        in_dim: int = 3,
        patch_size: Union[int, Tuple[int, int]] = (16, 16),
        pos_emb_height: int = 16,
        pos_emb_width: int = 16,
    ):
        super().__init__()
        assert isinstance(
            patch_size, (int, Sequence)
        ), f"Invalid patch_size type: {type(patch_size)}"
        if isinstance(patch_size, int):
            patch_size = (patch_size, patch_size)
        assert (
            len(patch_size) == 2
        ), f"Expected patch_size to be a tuple of 2, got {patch_size}"
        self.patch_size = patch_size

        self.proj = nn.Conv2d(
            in_dim, out_dim, kernel_size=patch_size, stride=patch_size
        )

        self.pos_emb = Learnable2DInterpPosEmb(
            height=pos_emb_height, width=pos_emb_width, dim=out_dim
        )

    def forward(self, x: torch.Tensor, grid_hws: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (L, Channels): input tensor
            grid_hws (N, 2): grid height and width
        Returns:
            (L, Cout) tensor
        """
        x = self.proj(x).view(x.size(0), -1)
        # apply positional embedding
        x = self.pos_emb(x, grid_hws)
        return x


class Rope2DPosEmb(nn.Module):
    """2D rotary position embedding with multi-resolution support.
    This class is intended to be used in the following way:
    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
        The rope is shared across all attention layers and all heads.
    Refs:
    - RoFormer: https://arxiv.org/abs/2104.09864
    - VisionLLaMA: https://arxiv.org/abs/2403.00522
    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py
    Args:
        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
        max_height (int): the maximum height of the 2D grid
        max_width (int): the maximum width of the 2D grid
        theta_base (float): the base of the theta
        device (str): the device to store the precomputed cis
    """

    def __init__(self, dim: int, max_height: int, max_width: int, theta_base=10000):
        super().__init__()
        self.dim = dim
        assert self.dim % 4 == 0, "dim must be divisible by 4"
        self.max_height = max_height
        self.max_width = max_width
        self.theta_base = theta_base

        self.freqs_cis = None

    def extra_repr(self):
        return f"dim={self.dim}, max_height={self.max_height}, max_width={self.max_width}, theta_base={self.theta_base}"

    def _precompute_freqs_cis(self, device: torch.device) -> torch.Tensor:
        """Calculate the cis(freqs) for each position in the 2D grid.
        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
        """
        N = self.max_height * self.max_width
        flat_pos = torch.arange(0, N).float().to(device)
        x_pos = flat_pos % self.max_width
        y_pos = flat_pos // self.max_width
        dim_range = (
            torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(device)
        )  # C/4
        freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
        x_freqs = torch.outer(x_pos, freqs).float()  # N, C/4
        y_freqs = torch.outer(y_pos, freqs).float()  # N, C/4
        x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)  # N, C/4
        y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)  # N, C/4
        # N, C/4, 2
        freqs_cis = torch.cat(
            [x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1
        )
        # max_height, max_width, C/2
        freqs_cis = freqs_cis.reshape(self.max_height, self.max_width, -1)
        return freqs_cis

    def get_freqs_cis(self, grid_hws: torch.Tensor) -> torch.Tensor:
        """
        Args:
            grid_hws (torch.Tensor): grid height and width
        Returns:
            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
        """
        if self.freqs_cis is None:
            self.freqs_cis = self._precompute_freqs_cis(grid_hws.device)

        shapes = grid_hws.tolist()
        assert all(
            1 <= h <= self.max_height and 1 <= w <= self.max_width for h, w in shapes
        ), (
            shapes,
            self.max_height,
            self.max_width,
        )
        freqs_cis = torch.cat(
            [self.freqs_cis[:h, :w].reshape(-1, self.dim // 2) for h, w in shapes],
            dim=0,
        )
        return freqs_cis


class MLP2(nn.Module):
    """
    Args:
        dims: [in_dim, hidden_dim, out_dim]
        bias: whether to use bias in linear layer.
    """

    def __init__(self, dims: list[int], activation, bias=True):
        super().__init__()
        assert len(dims) == 3
        self.fc0 = nn.Linear(dims[0], dims[1], bias=bias)
        self.fc1 = nn.Linear(dims[1], dims[2], bias=bias)
        self.activation = activation
        for m in [self.fc0, self.fc1]:
            nn.init.trunc_normal_(m.weight, std=math.sqrt(2 / m.in_features))
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc0(x)
        x = self.activation(x)
        return self.fc1(x)


class MoonVitEncoderLayer(nn.Module):

    def __init__(
        self,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        *,
        attn_implementation: str = "eager",
        activation=F.gelu,
        attn_bias: bool = False,
    ):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads
        self.attn_implementation = attn_implementation

        self.norm0 = nn.LayerNorm(hidden_dim)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim], activation)
        self.wqkv = nn.Linear(hidden_dim, hidden_dim * 3, bias=attn_bias)
        self.wo = nn.Linear(hidden_dim, hidden_dim, bias=attn_bias)

    def attention_qkvpacked(
        self,
        x: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rope_freqs_cis: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            x (torch.Tensor): (batch_size, seqlen, hidden_dim)
            cu_seqlens (torch.Tensor):
        """
        xqkv = self.wqkv(x)

        qkv_shape = xqkv.size()[:-1] + (
            3,
            self.num_heads,
            self.hidden_size_per_attention_head,
        )
        # xqkv: (batch_size, seqlen, 3, nheads, headdim)
        xqkv = xqkv.view(*qkv_shape)
        xq, xk, xv = torch.unbind(xqkv, dim=-3)
       
        xq, xk = apply_rope_real(xq, xk, rope_freqs_cis)

        attn_func = VL_VISION_ATTENTION_FUNCTIONS[self.attn_implementation]
        attn_out = attn_func(
            xq, xk, xv, q_cu_seqlens=cu_seqlens, k_cu_seqlens=cu_seqlens
        )

        attn_out = self.wo(attn_out)
        return attn_out

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rope_freqs_cis: Union[torch.Tensor, None] = None,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states: non-packed (B, N, D) or packed (L, D). if non-packed, seqlens should be None, if packed, seqlens should be set
        Returns:
            output: same shape of input, non-packed (B, N, D) for non-packed input, (L, D) for packed input
        """
        residual = hidden_states
        hidden_states = self.norm0(hidden_states)
        attn_out = self.attention_qkvpacked(
            hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
        )
        hidden_states = residual + attn_out

        residual = hidden_states
        hidden_states = self.mlp(self.norm1(hidden_states))
        hidden_states = residual + hidden_states
        return hidden_states


class MoonVitEncoder(nn.Module):

    def __init__(
        self,
        hidden_dim: int,
        num_layers: int,
        block_cfg: dict,
    ) -> None:
        super().__init__()

        self.rope_2d = Rope2DPosEmb(
            block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512
        )
        self.blocks = nn.ModuleList(
            [MoonVitEncoderLayer(**block_cfg) for _ in range(num_layers)]
        )
        self.final_layernorm = nn.LayerNorm(hidden_dim)

    def forward(
        self, hidden_states: torch.Tensor, grid_hws: torch.Tensor
    ) -> torch.Tensor:
        rope_freqs_cis = self.rope_2d.get_freqs_cis(grid_hws=grid_hws)

        lengths = torch.cat(
            (
                torch.zeros(1, device=hidden_states.device, dtype=grid_hws.dtype),
                grid_hws[:, 0] * grid_hws[:, 1],
            )
        )
        cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32)

        for _, block in enumerate(self.blocks):
            hidden_states = block(
                hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
            )

        hidden_states = self.final_layernorm(hidden_states)

        return hidden_states


def patch_merger(
    x: torch.Tensor,
    grid_hws: torch.Tensor,
    merge_kernel_size: list[int, int] = (2, 2),
) -> List[torch.Tensor]:
    d_model = x.size(-1)

    outputs = []
    pre_sum = 0
    for x_shape in grid_hws.tolist():
        height, width = x_shape[0], x_shape[1]
        # Get the current sequence
        seq = x[pre_sum : pre_sum + height * width]
        # Reshape along self.merge_kernel_size and concat to the last dimension
        kernel_height, kernel_width = merge_kernel_size
        new_height, new_width = height // kernel_height, width // kernel_width
        reshaped_seq = seq.view(
            new_height, kernel_height, new_width, kernel_width, d_model
        )
        reshaped_seq = reshaped_seq.permute(0, 2, 1, 3, 4).contiguous()
        padded_seq = reshaped_seq.view(
            new_height * new_width, kernel_height * kernel_width, -1
        )
        outputs.append(padded_seq)
        pre_sum += height * width

    return outputs


class MoonVitPretrainedModel(PreTrainedModel):
    config_class = MoonViTConfig
    model_type = "moonvit"
    _no_split_modules = ["PackingTransformer"]
    _supports_flash_attn_2 = False
    _supports_sdpa = True

    def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        config = deepcopy(config)
        self.merge_kernel_size = config.merge_kernel_size
        self.patch_size = config.patch_size
        self.patch_embed = MoonVisionPatchEmbed(
            out_dim=config.hidden_size,
            patch_size=config.patch_size,
            pos_emb_height=config.init_pos_emb_height,
            pos_emb_width=config.init_pos_emb_width,
        )

        self.encoder = MoonVitEncoder(
            hidden_dim=config.hidden_size,
            num_layers=config.num_hidden_layers,
            block_cfg={
                "num_heads": config.num_attention_heads,
                "hidden_dim": config.hidden_size,
                "mlp_dim": config.intermediate_size,
                "activation": PytorchGELUTanh(),
                "attn_bias": True,
                "attn_implementation": config._attn_implementation,
            },
        )

    def forward(
        self, pixel_values: torch.Tensor, grid_hws: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            pixel_values (torch.Tensor): The input pixel values.
            grid_hws (torch.Tensor): The grid height and width.
        Returns:
            torch.Tensor: The output tokens.
        """
        hidden_states = self.patch_embed(pixel_values, grid_hws)
        hidden_states = self.encoder(hidden_states, grid_hws)
        hidden_states = patch_merger(
            hidden_states, grid_hws, merge_kernel_size=self.merge_kernel_size
        )
        return hidden_states

In [10]:
model = MoonVitPretrainedModel(MoonViTConfig())

In [11]:
images = [to_pil(torch.rand(3, 256, 384)),to_pil(torch.rand(3, 256, 256))]

In [12]:
pix, hw = img_processor._preprocess(images[0])
print(pix.shape, hw)

torch.Size([384, 3, 16, 16]) (16, 24)


In [13]:
pix, hw = img_processor._preprocess(images[1])
print(pix.shape, hw)

torch.Size([256, 3, 16, 16]) (16, 16)


In [14]:
hw

(16, 16)

In [15]:
images_processed = img_processor(images, return_tensors="pt")
image_features: list = model(images_processed.pixel_values, images_processed.image_grid_hws)

  return xq_out.to(orig_dtype), xk_out.to(orig_dtype)


In [16]:
image_features[0].size()
#(16, 24)# 384//4 = 96

torch.Size([96, 4, 768])

In [17]:
image_features[1].size()
#16x16 = 256//4 = 64

torch.Size([64, 4, 768])

In [18]:
from transformers.tokenization_utils_base import AddedToken
from transformers import AutoTokenizer
model_ckpt = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt, padding_side="left", truncation_side="right"
)
IMAGE_TOKEN = "<image>"


image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True)


tokens_to_add = {"additional_special_tokens": [image_token]}


tokenizer.add_special_tokens(tokens_to_add)


image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [19]:
!ls ../input

flickr30k


In [20]:
import pandas as pd

df = pd.read_csv("../input/flickr30k/captions.txt")
df = df.dropna()
df["image"] = "../input/flickr30k/Images/" + df["image"]
df.head()
df.shape

(158914, 2)

In [21]:
import random
import math
from torch.utils.data import Dataset, random_split, DataLoader
from PIL import Image
import torchvision.transforms as T

def build_string_from_input(
    prompt,
    image_seq_len,
    image_token=IMAGE_TOKEN
):


    return f"{image_token * image_seq_len}{prompt}"



def get_model_inputs(input_string, suffix,max_length, tokenizer=tokenizer,default=False):


    return_token_type_ids = False


    if suffix:
        suffix = suffix


        return_token_type_ids = True
    
    if default==False:
    
        text = tokenizer.encode(suffix,add_special_tokens=False)
        prompt = tokenizer.encode(input_string,add_special_tokens=False)
        
        full_text = prompt+text
        
        mask = torch.ones(max_length,dtype=torch.int64)
        
        token_type_ids = torch.zeros(max_length,dtype=torch.int64)
        token_type_ids[-len(text):] = 1
        
        if len(full_text)>max_length:
            full_text = full_text[:max_length]
            token_type_ids = token_type_ids[:max_length]
            mask = mask[:max_length]
            
           
        else:      #left padding
            left_pad = max_length-len(full_text)
            full_text = [0]*left_pad+full_text
            mask[:left_pad] = 0
            # token_type_ids
    
        inputs = {'input_ids':torch.tensor(full_text),'attention_mask':mask,'token_type_ids':token_type_ids}
    else:


        return_token_type_ids = True if suffix is not None else False
    
    
        inputs = tokenizer(
            input_string,
            text_pair=suffix,
            return_token_type_ids=return_token_type_ids,
            padding="max_length",
            add_special_tokens=False,
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
    return inputs
dtype = torch.bfloat16

class ImgDataset(Dataset):
    def __init__(self, df,  tokenizer,img_ops,default= True, transform=None):
        self.df = df
        self.transform = transform
        self.img_ops = img_ops
       
        self.tokenizer = tokenizer
       
        self.type = default

    def __len__(
        self,
    ):
        return len(self.df)

    def __getitem__(self, idx):
        caption = self.df.caption.iloc[idx] + tokenizer.eos_token
        img_path = self.df.image.iloc[idx]
        img = Image.open(img_path).convert("RGB")

        input_data, grid_shape = self.img_ops._preprocess(img)
        l = math.prod(grid_shape)
        prompt = tokenizer.bos_token +" "+ "Explain this image."
        input_string = build_string_from_input(prompt,image_seq_len=l)
        
        return input_data, grid_shape, l, input_string, caption

In [22]:
myd = ImgDataset(df,tokenizer,img_processor)

In [23]:
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
def custom_collate_fn(batch):
    input_data, grid_shape, img_prompt_size, img_prompt, caption = zip(*batch) # Unpack the tuples
    max_img_prompt_size = max(img_prompt_size)
    text_out =  [] 
    ids, mask,tids = [], [], []
    for prom, cap in zip(img_prompt,caption):
        out = get_model_inputs(prom,cap,max_length=max_img_prompt_size+64)
        ids.append(out['input_ids'].unsqueeze(0))
        mask.append(out['attention_mask'].unsqueeze(0))
        tids.append(out['token_type_ids'].unsqueeze(0))

    text_out = {'input_ids':torch.cat(ids),'attention_mask':torch.cat(mask),'token_type_ids':torch.cat(tids)}
    pixel_values = torch.concat(input_data, dim=0)
    image_grid_hws = np.array(grid_shape)
    data = {"pixel_values": pixel_values, "image_grid_hws": image_grid_hws}
    return {'img_info':BatchFeature(data=data,tensor_type='pt'), "text_info":text_out}

In [24]:
batch_size = 4
dataloader = DataLoader(ImgDataset(df,tokenizer,img_processor), batch_size=batch_size, collate_fn=custom_collate_fn)

In [25]:
for d in dataloader:
    break

In [26]:
d['img_info']['pixel_values'].size()

torch.Size([2816, 3, 16, 16])

In [27]:
device = torch.device("cuda")

In [28]:
model = MoonVitPretrainedModel(MoonViTConfig())


In [29]:
model.to(torch.bfloat16).to(device)

MoonVitPretrainedModel(
  (patch_embed): MoonVisionPatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (pos_emb): Learnable2DInterpPosEmb()
  )
  (encoder): MoonVitEncoder(
    (rope_2d): Rope2DPosEmb(dim=64, max_height=512, max_width=512, theta_base=10000)
    (blocks): ModuleList(
      (0-7): 8 x MoonVitEncoderLayer(
        (norm0): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP2(
          (fc0): Linear(in_features=768, out_features=3072, bias=True)
          (fc1): Linear(in_features=3072, out_features=768, bias=True)
          (activation): PytorchGELUTanh()
        )
        (wqkv): Linear(in_features=768, out_features=2304, bias=True)
        (wo): Linear(in_features=768, out_features=768, bias=True)
      )
    )
    (final_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
)

In [30]:
from tqdm.notebook import tqdm
for i, d in tqdm(enumerate(dataloader)):
    out = model(d['img_info']['pixel_values'].to(torch.bfloat16).to(device),d['img_info']['image_grid_hws'].to(device))

0it [00:00, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 193.12 MiB is free. Process 4361 has 15.70 GiB memory in use. Of the allocated memory 9.02 GiB is allocated by PyTorch, and 6.39 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
alsdjnfjgnsaldmsgmdsh

In [None]:
# print(d['img_info'].size(),d['grid_shapes'])

In [None]:
# out = model(d['img_info']['pixel_values'],d['img_info']['image_grid_hws'])

In [None]:
# torch.cat(out, dim=0).size()

In [None]:
for k in d['text_info'].keys():
 print(k, d['text_info'][k].size())

In [None]:
data_embeds_.shape

In [None]:
word_embeddings = nn.Embedding(
            128002,
            768,
        )
for i, d in enumerate(dataloader):
    # data_inputs = torch.concatenate(d['pixel_values'], dim=0)
    input_ids = d['text_info']['input_ids']
    txt_emb = word_embeddings(input_ids)
    batch_size, sequence_length, input_embed_dim = txt_emb.shape
    data_embeds_ = model(d['img_info']['pixel_values'],d['img_info']['image_grid_hws'])
    data_embeds_ = torch.cat(data_embeds_, dim=0).view(-1, input_embed_dim)
    
    
    image_feature_nums, image_feature_dim = data_embeds_.shape
    
    assert image_feature_dim == input_embed_dim
    
    image_token_nums = (input_ids == 128001).sum().item()
    assert image_feature_nums == image_token_nums
    
    # (batch_size, sequence_length, input_embed_dim) -> (batch_size * sequence_length, input_embed_dim)
    inputs_embeds = txt_emb.reshape(-1, input_embed_dim)
    
    # (batch_size, sequence_length) -> (batch_size * sequence_length)
    input_ids = input_ids.flatten()
    
    inputs_embeds[input_ids == 128001] = data_embeds_
    
    inputs_embeds = inputs_embeds.reshape(
        (batch_size, sequence_length, input_embed_dim)
    )
    print(inputs_embeds.size())
    if i>5:
        break