In [1]:
"initialize model and preprocessor. attn_implementation==eager"
import os
os.environ['CUDA_VISIBLE_DEVICES']="0"
from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

# Inference-only notebook: autograd is switched off globally.
# Author's measurement recorded next to this switch, for running the visual
# tower and scattering the image features into input_embeds:
# 13401MiB before the setting, 5183MiB after (presumably GPU memory -- TODO confirm).
torch.set_grad_enabled(False)

model_dir = "/models/Qwen/Qwen2-VL-2B-Instruct"

# bfloat16 weights, automatic device placement, eager attention implementation.
load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "attn_implementation": "eager",
}
model:Qwen2VLForConditionalGeneration = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir, **load_options
)
model.eval()  # evaluation mode; eval() returns the module itself, so `model` is unchanged
processor = Qwen2VLProcessor.from_pretrained(model_dir)

  from .autonotebook import tqdm as notebook_tqdm
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s]


In [2]:
"tokenizer, preprocessor and inputs"
# Image
# url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
image = Image.open("historia.jpg")
print(f"[Image Width] {image.width} [Image Height] {image.height}")
conversation = [
    {"role":"system","content":"你是一名高级人工智能助手。"},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in 10 words."},
            {"type": "image","image":image},
        ],
    }
]

# Preprocess the inputs
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# print(text_prompt)
# Expected output:
# <|im_start|>system
# 你是一名高级人工智能助手。<|im_end|>
# <|im_start|>user
# Describe this image in 10 words.<|vision_start|><|image_pad|><|vision_end|><|im_end|>
# <|im_start|>assistant (if add_generation_prompt)

inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
# What the processor did to this image, in short:
# original (width, height): (939, 969)
# each side is resized to a multiple of 28 (28 = merge_size*patch_size: merge_size=2, patch_size=14)
# after resize: (952, 980)  width: round(939/28)*28, height: round(969/28)*28
inputs = inputs.to("cuda")
# print(inputs.keys())

# image_grid_thw: number of 14-pixel patches along (time, height, width)
# [[1,70,68]]: 70=980//14; 68=952//14
# image_grid_thw only counts patches along height and width; the merge is not included.
# merge_size is what combines features: Qwen2-VL pools the features of every
# 4 neighbouring patches into one feature (PatchMerger).
print("[image_grid_thw] ",inputs.image_grid_thw)

# pixel_values.shape=[4760, 1176] 4760=1*70*68=grid_t*grid_h*grid_w
# 1176=3*2*14*14=channel*temporal_patch_size*patch_size*patch_size
print("[pixel_values shape] ",inputs.pixel_values.shape)
# Last 15 token ids of the prompt. The label now matches the slice actually
# taken: it used to read "[0,:-15]" while the code slices [0,-15:].
print("[input_ids[0,-15:]] ",inputs.input_ids[0,-15:])
print("[input_ids shape] ",inputs.input_ids.shape) #[1,1220]

# Short aliases used by the analysis cells below.
input_ids=inputs.input_ids
pixel_values=inputs.pixel_values
image_grid_thw=inputs.image_grid_thw

[Image Width] 939 [Image Height] 969
[image_grid_thw]  tensor([[ 1, 70, 68]], device='cuda:0')
[pixel_values shape]  torch.Size([4760, 1176])
[input_ids[0,:-15] ]  tensor([151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151653, 151645,    198, 151644,  77091,    198], device='cuda:0')
[input_ids shape]  torch.Size([1, 1220])


In [1]:
"analyze Qwen2VLImageProcessor._preprocessor flattens image features"
"""
pixel_values在输入到模型之前已经做好了merge相邻4个patches的预处理。
patch: 将[patch_size,patch_size]大小的相邻的像素视作一个patch，
"""
import numpy as np
import torch
from easydict import EasyDict
# Stand-in for the image-processor instance: only the three attributes read below are set.
self = EasyDict()
self.temporal_patch_size = 2
self.merge_size = 2
self.patch_size = 2

# Toy 8x8, 3-channel "image" so the printed numbers stay readable.
resized_height=8
resized_width=8
channel=3

grid_h = resized_height // self.patch_size
grid_w = resized_width // self.patch_size

# Every element holds its own flat index, which makes the reordering below visible.
num_values = channel*resized_height*resized_width
patches:np.ndarray = np.arange(num_values).reshape(
    1, channel, resized_height, resized_width
)
# A single frame is tiled temporal_patch_size times along axis 0.
if patches.shape[0]==1:
    patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))

print("[patches shape] ",patches.shape)
print("[patches]\n",patches)

channel = patches.shape[1] # 3
grid_t = patches.shape[0] // self.temporal_patch_size # 1

# Split H into (merged row, row inside merge group, pixel row inside patch)
# and W likewise.
split_shape = (
    grid_t,
    self.temporal_patch_size,
    channel,
    grid_h // self.merge_size,
    self.merge_size,
    self.patch_size,
    grid_w // self.merge_size,
    self.merge_size,
    self.patch_size,
)
patches = patches.reshape(split_shape)
# New axis order: (grid_t, merged row, merged col, row in group, col in group,
#                  channel, temporal, pixel row, pixel col)
patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)

num_patches = grid_t * grid_h * grid_w
values_per_patch = channel * self.temporal_patch_size * self.patch_size * self.patch_size
flatten_patches = patches.reshape(num_patches, values_per_patch)
print("[flatten_patches.T]\n",flatten_patches.T)
print("[flatten_patches.T shape] ",flatten_patches.T.shape)
# Reading flatten_patches.T:
# each row holds the element at the same relative position of every patch;
# down a column the in-patch pixel position varies fastest, then the
# temporal_patch_size index, and the channel varies slowest.

[patches shape]  (2, 3, 8, 8)
[patches]
 [[[[  0   1   2   3   4   5   6   7]
   [  8   9  10  11  12  13  14  15]
   [ 16  17  18  19  20  21  22  23]
   [ 24  25  26  27  28  29  30  31]
   [ 32  33  34  35  36  37  38  39]
   [ 40  41  42  43  44  45  46  47]
   [ 48  49  50  51  52  53  54  55]
   [ 56  57  58  59  60  61  62  63]]

  [[ 64  65  66  67  68  69  70  71]
   [ 72  73  74  75  76  77  78  79]
   [ 80  81  82  83  84  85  86  87]
   [ 88  89  90  91  92  93  94  95]
   [ 96  97  98  99 100 101 102 103]
   [104 105 106 107 108 109 110 111]
   [112 113 114 115 116 117 118 119]
   [120 121 122 123 124 125 126 127]]

  [[128 129 130 131 132 133 134 135]
   [136 137 138 139 140 141 142 143]
   [144 145 146 147 148 149 150 151]
   [152 153 154 155 156 157 158 159]
   [160 161 162 163 164 165 166 167]
   [168 169 170 171 172 173 174 175]
   [176 177 178 179 180 181 182 183]
   [184 185 186 187 188 189 190 191]]]


 [[[  0   1   2   3   4   5   6   7]
   [  8   9  10  11  12  1

In [None]:
"image smart_resize&rescale&normalize analysis"
import numpy as np
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize, Qwen2VLImageProcessor
from transformers.image_transforms import resize

img_processor:Qwen2VLImageProcessor = processor.image_processor

def temp_save(image:np.ndarray, filename):
    """Debug helper: interpret `image` as RGB and write it to `filename` as a JPEG."""
    rgb_image = Image.fromarray(image,mode='RGB')
    rgb_image.save(filename,format='jpeg')

# From here on `image` is an ndarray; the indexing below treats it as channels-last.
image = np.array(image)
height = image.shape[-3]
width = image.shape[-2]
print(f"[Image Height] {height} [Image Width] {width}")

# Both sides must become multiples of patch_size * merge_size.
resize_factor = img_processor.patch_size * img_processor.merge_size
resized_height, resized_width = smart_resize(
    height,
    width,
    factor=resize_factor,
    min_pixels=img_processor.min_pixels,
    max_pixels=img_processor.max_pixels,
)
print(f"[Image Resized Height] {resized_height} [Image Resized Width] {resized_width}")

target_size = (resized_height, resized_width)
resized_image = resize(
    image,
    size=target_size,
    resample=Image.Resampling.BICUBIC,
    input_data_format="channels_last",
)
print(resized_image.shape)

# A factor of 1 leaves the pixel values unscaled in this demo.
rescale_factor=1
rescaled_image=img_processor.rescale(resized_image,rescale_factor,input_data_format="channels_last")
print(rescaled_image.shape)
# temp_save(rescaled_image,"test.jpeg")

# omit normalizing

[Image Height] 969 [Image Width] 939
[Image Resized Height] 980 [Image Resized Width] 952
(980, 952, 3)
(980, 952, 3)


$$
\text{RoPE formula} \\
\text{RoPE在计算}QK^T\text{时起作用。}\\
\text{对位置m的Q和位置n的K分别施加下面的旋转，两者的内积就只依赖相对位置}m-n\text{。} \\
\begin{pmatrix}
x_{m}^{i} \cos m\theta_{i} - x_{m}^{i+\frac{d}{2}} \sin m\theta_{i} \\
x_{m}^{i+\frac{d}{2}} \cos m\theta_{i} + x_{m}^{i} \sin m\theta_{i}
\end{pmatrix} \\
d=\text{hidden dim} \\
i \in \{1,2,3,\dots,\frac{d}{2}\} \\
\text{这里的i是一个token特征(hidden dim)里元素的下标} \\
\text{代码中常设的freqs即}m\theta_{i}
$$
> 详见[RoPE优化与实现](https://nn.labml.ai/transformers/rope/index.html)

In [None]:
"""analyze model.visual.rot_pos_emb. This function generates `m\theta` in RoPE."""
"""
注意！因为在`preprocessor._preprocess`中输出的图像`pixel_values`已经被处理成了相邻4块一列的形式，
因此在`visual`中需要二维width与height的位置信息，而且是非像素而是`patches`为单位
"""
# Step-by-step rebuild of the (height, width) position ids of every patch and
# of the rotary angles (m*theta) looked up from them.
pos_ids=[]
# Single image: t, h, w are the patch-grid sizes (time, height, width).
t,h,w=image_grid_thw[0]

# Row index of every patch on the h x w grid.
hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
print("[hpos_ids shape] ",hpos_ids.shape) # [70,68]
print("[hpos_ids] ",hpos_ids)
# Split both grid axes into (merge group, position inside the group).
hpos_ids = hpos_ids.reshape(
        h // model.visual.spatial_merge_size,
        model.visual.spatial_merge_size,
        w // model.visual.spatial_merge_size,
        model.visual.spatial_merge_size,
    )
print("[hpos_ids shape] ",hpos_ids.shape) # [35,2,34,2]
print("[hpos_ids]\n",hpos_ids)
# Bring the two in-group axes together so the patches of one merge group
# become consecutive after flattening.
hpos_ids=hpos_ids.permute(0,2,1,3)
print("[after permute(0,2,1,3) hpos_ids]\n",hpos_ids)
hpos_ids=hpos_ids.flatten() # [4760]

# wpos_ids (column index of every patch) goes through the same reshape/permute/flatten.
wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
print("[wpos_ids]\n",wpos_ids)
wpos_ids = wpos_ids.reshape(
    h // model.visual.spatial_merge_size,
    model.visual.spatial_merge_size,
    w // model.visual.spatial_merge_size,
    model.visual.spatial_merge_size,
)
wpos_ids = wpos_ids.permute(0, 2, 1, 3)
wpos_ids = wpos_ids.flatten()
print("[wpos_ids shape] ",wpos_ids.shape) # [4760]

# One (h, w) pair per patch, repeated for each of the t temporal slots.
pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
pos_ids = torch.cat(pos_ids, dim=0)
print("[pos_ids shape] ",pos_ids.shape)
print("[pos_ids.T[:150,:150]]\n",pos_ids.T[:150,:150]) # look at these values; the transposed view `pos_ids.T` is easier to read
# print("[pos_ids.T[-30:,-30:]]\n",pos_ids.T[-30:,-30:])
# English summary of the note below: pos_ids has shape [4760,2]; column 0 is
# hpos_ids and column 1 is wpos_ids, both counted in patches. Consecutive
# groups of 4 entries belong together, e.g. pos_ids.T starting with
# [[0,0,1,1],[0,1,0,1]] is one group of 4 neighbouring patches.
"""
注释：
注意上面`pos_ids`：shape=[4760,2],
其中`pos_ids[:,0]`为`hpos_ids`，即`height`上的`position_ids`；`pos_ids[:,1]`同理。
再注意，这里的位置编码都是以`patch`为单位，
【而且在`preprocessor._preprocess`中已经`flatten`为相邻4个`patches`一行的`pixel_values`】
因此这的`hpos_ids`与`wpos_ids`都应该4个`patches`地看。
如`pos_ids.T`为[[0,0,1,1],[0,1,0,1]]即4个相邻的`patches`。
"""


# The larger of the grid height and width decides how many positions the angle table needs.
max_grid_size = image_grid_thw[:, 1:].max()
print("[max_grid_size] ",max_grid_size)
rotary_pos_emb_full = model.visual.rotary_pos_emb(max_grid_size) # builds the m*theta table
print("[rotary_pos_emb_full shape] ",rotary_pos_emb_full.shape) # [70,20]: 20 angle values per position


# pos_ids has shape [4760, 2]: two integer ids (h and w) per patch.
# rotary_pos_emb_full has shape [70, 20]: one row of 20 values per position.
# rotary_pos_emb_full[pos_ids] uses every id in pos_ids as a row index into rotary_pos_emb_full,
# so each id is replaced by a length-20 row and the result has shape [4760,2,20].
print("[rotary_pos_emb_full[pos_ids] shape] ",rotary_pos_emb_full[pos_ids].shape)
# Every patch has two position ids, so after flattening it owns a 40-dimensional vector.
rotary_pos_emb=rotary_pos_emb_full[pos_ids].flatten(1)
print("[rotary_pos_emb shape] ",rotary_pos_emb.shape) #[4760,40]
print(rotary_pos_emb[:10,:])
# English summary of the note below: the table is built for the longest grid
# side, pos_ids indexes it for the height and width id of each patch, and the
# flatten yields [4760,40] with the first 20 values from hpos_ids and the
# last 20 from wpos_ids.
_="""
注释：
先根据最长的`grid`生成RoPE中的$m\theta$的编码`rotary_pos_emb_full`，
然后用`pos_ids`作为索引从里面抽取对应`height`与`width`的`pos_id`的$m\theta$编码，
再`flatten`，即原本`pos_ids`的[4760,2]`flatten`为[4760,40]，前20个特征是`hpos_ids`的，后20个是`wpos_ids`的。
"""


[hpos_ids shape]  torch.Size([70, 68])
[hpos_ids]  tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 1,  1,  1,  ...,  1,  1,  1],
        [ 2,  2,  2,  ...,  2,  2,  2],
        ...,
        [67, 67, 67,  ..., 67, 67, 67],
        [68, 68, 68,  ..., 68, 68, 68],
        [69, 69, 69,  ..., 69, 69, 69]])
[hpos_ids shape]  torch.Size([35, 2, 34, 2])
[hpos_ids]
 tensor([[[[ 0,  0],
          [ 0,  0],
          [ 0,  0],
          ...,
          [ 0,  0],
          [ 0,  0],
          [ 0,  0]],

         [[ 1,  1],
          [ 1,  1],
          [ 1,  1],
          ...,
          [ 1,  1],
          [ 1,  1],
          [ 1,  1]]],


        [[[ 2,  2],
          [ 2,  2],
          [ 2,  2],
          ...,
          [ 2,  2],
          [ 2,  2],
          [ 2,  2]],

         [[ 3,  3],
          [ 3,  3],
          [ 3,  3],
          ...,
          [ 3,  3],
          [ 3,  3],
          [ 3,  3]]],


        [[[ 4,  4],
          [ 4,  4],
          [ 4,  4],
          ...,
          [

'\n注释：\n先根据最长的`grid`生成RoPE中的$m\theta$的编码`rotary_pos_emb_full`，\n然后用`pos_ids`作为索引从里面抽取对应`height`与`width`的`pos_id`的$m\theta$编码，\n再\n'

In [11]:
"""analyze apply_rotary_pos_emb_vision in vision attn"""
import torch
# Random stand-in for the query tensor: [4760 patches, num_heads=16, 80] with 80 = 1280 // num_heads.
q=torch.randn(4760,16,80)
input_q=q[None]
freqs:torch.Tensor=rotary_pos_emb
print("[input_q shape] ",input_q.shape)
print("[freqs shape] ",freqs.shape)
# freqs.cos().unsqueeze(1) has shape [4760,1,40];
# repeat(1,1,2) tiles the last axis, so the same 40 values fill the first
# and the last 40 positions.
after_repeat=freqs.cos().unsqueeze(1).repeat(1,1,2) #[4760,1,80]
sin_repeat=freqs.sin().unsqueeze(1).repeat(1,1,2)
# print("[after_repeat shape] ",after_repeat.shape)

# Leading batch axis, float32, and on the CPU next to the random q.
cos=after_repeat.unsqueeze(0).float().cpu()
sin=sin_repeat.unsqueeze(0).float().cpu()
# English summary of the note below: freqs has 40 features but the per-head
# states have 80, so cos/sin are repeated once along the last axis; the
# result can be read as [h angles, w angles, h angles, w angles]. In Q*K^T the
# h parts meet h parts and the w parts meet w parts, so each axis gets its own
# relative position.
"""
注意：上面的`freqs`的`cos`与`sin`的`repeat(1,1,2)`即因为`freqs`特征维度40对不上`hidden_states`的80，
这里就自行在最后一个特征上`repeat`一倍，`repeat`后的`freqs`可以简化理解为:
[`hpos_ids`的$m\theta$编码,`wpos_ids`的$m\theta$编码,
 `hpos_ids`的$m\theta$编码,`wpos_ids`的$m\theta$编码]

`Q*K^T`的时候：
`Q`的m位置与`K`的n位置相乘，对应特征维度的`hpos`与`wpos`会各自点乘，
根据RoPE原理，`hpos`会get到`hpos`的相对位置，`wpos`会get到`wpos`的相对位置
"""
print("[cos shape] ",cos.shape)
print("[sin shape]==[cos shape]")

### rotate_half ###
half_dim=input_q.shape[-1] // 2
x1=input_q[..., :half_dim]
x2=input_q[..., half_dim:]
# The first d/2 values move to the back unchanged; the last d/2 move to the front negated.
after_rotate_half=torch.cat([x2.neg(), x1], dim=-1)
print("[after_rotate_half shape] ", after_rotate_half.shape)
### rotate_half ###

# See the RoPE markdown cell above for the formula.
# Author's note on the design: all that is needed is that the product of the
# q at position m and the k at position n carries relative-position
# information. Rotating only the first x hidden dims would already put that
# information into the Q*K^T score matrix, so RoPE may be applied to a chosen
# subset of dims. Here every one of the 80 per-head dims is rotated.

# Note: `*` is element-wise multiplication here.
output=input_q * cos + after_rotate_half * sin
print("[output shape] ", output.shape)

[input_q shape]  torch.Size([1, 4760, 16, 80])
[freqs shape]  torch.Size([4760, 40])
[cos shape]  torch.Size([1, 4760, 1, 80])
[sin shape]==[cos shape]
[after_rotate_half shape]  torch.Size([1, 4760, 16, 80])
[output shape]  torch.Size([1, 4760, 16, 80])


In [3]:
"""analyze vision model
image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw)
image_embeds.shape=[1190, 1536]
"""
# omit RMSNorm
import math
import torch.nn.functional as F
from transformers.models.qwen2_vl.modeling_qwen2_vl import apply_rotary_pos_emb_vision
# Manual walkthrough of the vision tower: PatchEmbed -> attention of block 0
# -> MLP of block 0 -> PatchMerger. Normalisation layers and the other blocks
# are left out on purpose; the goal is to show tensor shapes.
print("[pixel_values shape] ",pixel_values.shape)
hidden_states = model.visual.patch_embed(pixel_values)
print("[after PatchEmbed] ", hidden_states.shape)
grid_thw=image_grid_thw
print("[grid_thw] ",grid_thw)

# Rotary angles per patch; the `model.visual.rot_pos_emb` analysis cell rebuilds this call step by step.
rotary_pos_emb = model.visual.rot_pos_emb(grid_thw)
print("[rotary_pos_emb shape] ",rotary_pos_emb.shape)
print("[grid_thw[:,1]] ",grid_thw[:,1])
print("[grid_thw[:,2]] ",grid_thw[:,2])
print("[grid_thw[:,0]] ",grid_thw[:,0])

# Patches per frame (h*w), listed once per temporal slot (t times). The
# running sum with a leading 0 gives the [start, end) token range of each frame.
patches_per_frame = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0])
cu_seqlens = patches_per_frame.cumsum(dim=0, dtype=torch.int32)
print("[after repeat_interleave] ",patches_per_frame)
cu_seqlens = F.pad(cu_seqlens, (1,0), value=0)
print("[cu_seqlens] ",cu_seqlens)

# VisionBlock: residual with attn and mlp.
# normalisation layers omitted
## VisionAttention
print("## VisionAttention ##")
seq_length = hidden_states.shape[0]
print("[hidden_states] ",hidden_states.shape) #[4760,1280]
after_qkv=model.visual.blocks[0].attn.qkv(hidden_states) #[4760,3840]
num_heads=model.config.vision_config.num_heads #16
qkv_reshaped=after_qkv.reshape(seq_length,3,num_heads,-1) #[4760,3,16,80]
# A real name instead of `_`, so IPython's last-output variable is not clobbered.
qkv_permuted=qkv_reshaped.permute(1,0,2,3) #[3,4760,16,80]
# unbind splits the given dim into a tuple of tensors
q,k,v=qkv_permuted.unbind(0) #q.shape==k.shape==v.shape==[4760,16,80]
q=apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
k=apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

# Block-diagonal mask: start fully masked (dtype minimum), then open the
# square of every frame so patches only attend within their own frame.
attention_mask = torch.full(
    [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
)
for i in range(1, len(cu_seqlens)):
    attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
print("[attention_mask]\n",attention_mask)

attn_head_dim=model.visual.blocks[0].attn.head_dim
attn_proj=model.visual.blocks[0].attn.proj

# [seq, heads, head_dim] -> [heads, seq, head_dim] so matmul runs per head.
q = q.transpose(0, 1)
k = k.transpose(0, 1)
v = v.transpose(0, 1)

attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(attn_head_dim)
attn_weights = attn_weights + attention_mask
# softmax in float32, then back to the dtype of q
attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
attn_output = torch.matmul(attn_weights, v)
attn_output = attn_output.transpose(0, 1)
attn_output = attn_output.reshape(seq_length, -1)
attn_output = attn_proj(attn_output)

print("[attn_output shape] ",attn_output.shape)

### Vision MLP ###
fc1 = model.visual.blocks[0].mlp.fc1 # in=1280,out=5120==1280*4
vision_act_fn = model.visual.blocks[0].mlp.act #QuickGELUActivation
fc2 = model.visual.blocks[0].mlp.fc2 # in=5120,out=1280
vision_mlp_output=fc2(vision_act_fn(fc1(attn_output)))
### Vision MLP ###
print("[vision_mlp_output shape] ",vision_mlp_output.shape)

### Merger ###
# function: merge 4 patches and project them so they can be embedded into the text input embeds
merger=model.visual.merger
after_merger=merger(vision_mlp_output)
print("[after_merger shape] ",after_merger.shape)
# [4760,1280] -> grouped as [1190,5120] (5120 == 1280*2**2 == merger.hidden_size, spatial_merge_size==2) -> [1190,1536]
# 4760 = 70*68 (grid height * grid width), merge_size==2
# Qwen2VL merges 4 spatially adjacent patches, which were already put next to each other by the preprocessor
### Merger ###


[pixel_values shape]  torch.Size([4760, 1176])
[after PatchEmbed]  torch.Size([4760, 1280])
[grid_thw]  tensor([[ 1, 70, 68]], device='cuda:0')
[rotary_pos_emb shape]  torch.Size([4760, 40])
[grid_thw[:,1]]  tensor([70], device='cuda:0')
[grid_thw[:,2]]  tensor([68], device='cuda:0')
[grid_thw[:,0]]  tensor([1], device='cuda:0')
[after repeat_interleave]  tensor([4760], device='cuda:0')
[cu_seqlens]  tensor([   0, 4760], device='cuda:0', dtype=torch.int32)
## VisionAttention ##
[hidden_states]  torch.Size([4760, 1280])
[attention_mask]
 tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0',
       dtype=torch.bfloat16)
[attn_output shape]  torch.Size([4760, 1280])
[vision_mlp_output shape]  torch.Size([4760, 1280])
[after_merger shape]  torch.Size([1190, 1536])


In [4]:
"""analyze image mask and generate inputs_embeds"""
"""
Simply speaking. image_embeds is directly embeded into inputs_embeds.
Just inplace all token_ids==151655. The tokens length has to be equal to image_embeds
or it loses features by function `masked_scatter`.
"""
# Placeholder id of the image tokens (151655 for this checkpoint). Read from
# the config so the id shown here is the one the mask is built from.
image_token_id=model.config.image_token_id
inputs_embeds = model.model.embed_tokens(input_ids) #[1,1220,1536]
image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw) #[1190, 1536]
print('[inputs_embeds shape]', inputs_embeds.shape)
print('[image_embeds shape]',image_embeds.shape)

# Bool tensor with the shape of input_ids: True where the token is an image placeholder.
temp1=(input_ids == image_token_id)
n_image_tokens = temp1.sum().item() # number of image tokens: 1190
n_image_features = image_embeds.shape[0] # number of image features: 1190
print('[n_image_tokens] ',n_image_tokens,' [n_image_features] ',n_image_features)
# Each placeholder must receive exactly one feature row. Without this check a
# mismatch would either fail later or silently drop features in masked_scatter.
if n_image_tokens != n_image_features:
    raise ValueError(
        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
    )

# [1,1220] -> unsqueeze(-1) -> [1,1220,1] -> expand_as -> [1,1220,1536]:
# the flag of each token is broadcast over its whole embedding vector.
temp2=temp1.unsqueeze(-1).expand_as(inputs_embeds)
image_mask = temp2.to(inputs_embeds.device)
print("[temp1 shape] ", temp1.shape, "\n[temp1[0,-15:]] ",temp1[0,-15:])
print("[temp2 shape] ",temp2.shape, "\n[temp2[0,-15:,0]] ",temp2[0,-15:,0])
print('[image_mask shape] ',image_mask.shape)

print(inputs_embeds[0,-15:,0])
# inputs_embeds.shape == image_mask.shape; image_embeds supplies one value per True in image_mask.
# Replace the values of inputs_embeds where image_mask is True, in order.
image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
print(inputs_embeds[0,-15:,0])

[inputs_embeds shape] torch.Size([1, 1220, 1536])
[image_embeds shape] torch.Size([1190, 1536])
[n_image_tokens]  1190  [n_image_features]  1190
[temp1 shape]  torch.Size([1, 1220]) 
[temp1[0,-15:]]  tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True, False,
        False, False, False, False, False], device='cuda:0')
[temp2 shape]  torch.Size([1, 1220, 1536]) 
[temp2[0,-15:,0]]  tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True, False,
        False, False, False, False, False], device='cuda:0')
[image_mask shape]  torch.Size([1, 1220, 1536])
tensor([-0.0101, -0.0101, -0.0101, -0.0101, -0.0101, -0.0101, -0.0101, -0.0101,
        -0.0101, -0.0101, -0.0076,  0.0330, -0.0074,  0.0114,  0.0330],
       device='cuda:0', dtype=torch.bfloat16)
tensor([-0.7930, -0.0942, -0.4668, -0.4375, -1.5859, -0.2158, -0.6523, -1.4922,
        -4.3750, -0.0101, -0.0076,  0.0330, -0.0074,  0.0114,  0.0330],
       device='cuda:0', dtype=torch.bfloat16)


### Below is the language model, Qwen2VLModel

In [6]:
"""analyze Qwen2VL.Qwen2VLModel.forward"""

# Rebuilds by hand what the language model needs before its decoder layers:
# position ids, the causal mask and the rotary cos/sin tables.
print("[inputs_embeds shape] ",inputs_embeds.shape)
# No cached tokens are assumed, so positions start at 0.
past_seen_tokens=0
cache_position = torch.arange(
    past_seen_tokens,
    past_seen_tokens + inputs_embeds.shape[1],
    device=inputs_embeds.device
) # tensor([0,1,2,...,1219]) shape==[1220]
# The same 0..seq_len-1 ids are expanded to 3 rows (one per rotary axis) and to the batch size.
position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
print("[position_ids shape] ",position_ids.shape)
print("[position_ids]\n",position_ids)

# Triangular mask: 0 on and below the diagonal, the bfloat16 minimum above it.
# NOTE(review): `_update_causal_mask` is a private method called positionally;
# the meaning of the three `None` arguments is not visible here -- confirm
# against the installed transformers version.
causal_mask=model.model._update_causal_mask(
    None,inputs_embeds,cache_position,None,None)
print("[causal_mask shape] ",causal_mask.shape)
print("[causal_mask]\n",causal_mask)
attention_mask=causal_mask

# Author's note: compared with the vision model's rotary embedding, the
# tables here carry an extra leading dim of size 3 (position_ids has 3 rows).
position_embeddings=model.model.rotary_emb(inputs_embeds,position_ids)
cos,sin=position_embeddings
print("[position_embeddings cos shape] ",cos.shape)
print("[position_embeddings sin shape==cos shape]")
# First 50 positions of each of the 3 rows, printed for comparison.
print("[cos[0,0,:50,:]]\n",cos[0,0,:50,:])
print("[cos[1,0,:50,:]]\n",cos[1,0,:50,:])
print("[cos[2,0,:50,:]]\n",cos[2,0,:50,:])


[inputs_embeds shape]  torch.Size([1, 1220, 1536])
[position_ids shape]  torch.Size([3, 1, 1220])
[position_ids]
 tensor([[[   0,    1,    2,  ..., 1217, 1218, 1219]],

        [[   0,    1,    2,  ..., 1217, 1218, 1219]],

        [[   0,    1,    2,  ..., 1217, 1218, 1219]]], device='cuda:0')
[causal_mask shape]  torch.Size([1, 1, 1220, 1221])
[causal_mask]
 tensor([[[[-0.0000e+00, -3.3895e+38, -3.3895e+38,  ..., -3.3895e+38,
           -3.3895e+38, -3.3895e+38],
          [-0.0000e+00, -0.0000e+00, -3.3895e+38,  ..., -3.3895e+38,
           -3.3895e+38, -3.3895e+38],
          [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., -3.3895e+38,
           -3.3895e+38, -3.3895e+38],
          ...,
          [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., -3.3895e+38,
           -3.3895e+38, -3.3895e+38],
          [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., -0.0000e+00,
           -3.3895e+38, -3.3895e+38],
          [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., -0.0000e+00,
           -0.0000

In [7]:
"""analyze Qwen2VLAttention"""
# Inputs taken from earlier cells: `inputs_embeds` (used right here) and
# `position_embeddings` (used by the rotary step further down). The former
# `x = x` self-assignments were no-ops and have been removed.
llm_hidden_states=inputs_embeds
llm_self_attn=model.model.layers[0].self_attn

num_heads=llm_self_attn.num_heads
num_key_value_heads=llm_self_attn.num_key_value_heads
head_dim=llm_self_attn.head_dim

q_proj=llm_self_attn.q_proj
k_proj=llm_self_attn.k_proj
v_proj=llm_self_attn.v_proj

# Only batch size and sequence length are needed; slicing the shape avoids
# rebinding `_`, which IPython uses for the last cell output.
bsz, q_len = llm_hidden_states.shape[:2]

query_states = q_proj(llm_hidden_states)
key_states = k_proj(llm_hidden_states)
value_states = v_proj(llm_hidden_states)

# Split the projected features into heads, then move the head axis in front
# of the sequence axis: [bsz, q_len, heads, head_dim] -> [bsz, heads, q_len, head_dim].
# Keys and values use num_key_value_heads, queries use num_heads.
query_states = query_states.view(bsz, q_len, num_heads, head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, num_key_value_heads, head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, num_key_value_heads, head_dim).transpose(1, 2)
print("[query_states shape] ",query_states.shape)
print("[key_states shape] ",key_states.shape)
print("[value_states.shape==key_states.shape]")

### apply_multimodal_rotary_pos_emb ###
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Swap the two halves of the last dim, negating the half that moves to the front.

    For a last dim ``[a, b]`` (two equal halves) the result is ``[-b, a]``.
    """
    split_at = x.shape[-1] // 2
    front = x[..., :split_at]
    back = x[..., split_at:]
    return torch.cat([back.neg(), front], dim=-1)
unsqueeze_dim=1
cos,sin=position_embeddings # cos.shape=[3, 1, 1220, 128]==sin.shape
# Section widths along head_dim, listed twice: [16,24,24,16,24,24]
mrope_section=[16,24,24] * 2

def _pick_mrope_sections(table):
    """Cut `table` along its last dim into mrope_section chunks, take row (index % 3) of
    each chunk, join them again and add the head axis."""
    chunks = table.split(mrope_section, dim=-1)
    picked = [chunk[index % 3] for index, chunk in enumerate(chunks)]
    return torch.cat(picked, dim=-1).unsqueeze(unsqueeze_dim)

cos = _pick_mrope_sections(cos)
sin = _pick_mrope_sections(sin)
# The next notebook cell walks through this split/cat in detail.
print("[cos_after_cat shape] ",cos.shape)
print("[sin_after_cat.shape==cos_after_cat.shape]")

# Standard rotary update, element-wise: x*cos + rotate_half(x)*sin
rotated_query = rotate_half(query_states)
rotated_key = rotate_half(key_states)
q_embed = query_states * cos + rotated_query * sin
k_embed = key_states * cos + rotated_key * sin
### apply_multimodal_rotary_pos_emb ###

[query_states shape]  torch.Size([1, 12, 1220, 128])
[key_states shape]  torch.Size([1, 2, 1220, 128])
[value_states.shape==key_states.shape]
[cos_after_cat shape]  torch.Size([1, 1, 1220, 128])
[sin_after_cat.shape==cos_after_cat.shape]


In [12]:
"""temp analyze cat&split in apply_multimodal_rotary_pos_emb"""
import torch
bsz=1
seq_len=10
head_dim=128
mrope_section=[16,24,24,16,24,24]
unsqueeze_dim=1
# One table of distinct integers, viewed three times along a new leading axis
# (stand-in for the cos table of the three position axes).
base_table=torch.arange(bsz*seq_len*head_dim).reshape(1,bsz,seq_len,head_dim)
temp_cos=base_table.expand(3,bsz,seq_len,head_dim)
print("[temp_cos.shape] ",temp_cos.shape)
temp_sections=[]
# temp_cos.split(mrope_section,dim=-1) yields 6 tensors:
# 0=[3, 1, seq_len, 16], 1=[3, 1, seq_len, 24], 2=[3, 1, seq_len, 24],
# 3=[3, 1, seq_len, 16], 4=[3, 1, seq_len, 24], 5=[3, 1, seq_len, 24]

# m[i%3] picks one of the three leading rows per chunk:
# chunk 0 ([3, 1, seq_len, 16]) -> row 0 (time)
# chunk 1 ([3, 1, seq_len, 24]) -> row 1 (height)
# chunk 2 ([3, 1, seq_len, 24]) -> row 2 (width)
# chunk 3 ([3, 1, seq_len, 16]) -> row 0 (time)
# chunk 4 ([3, 1, seq_len, 24]) -> row 1 (height)
# chunk 5 ([3, 1, seq_len, 24]) -> row 2 (width)
split_chunks=temp_cos.split(mrope_section,dim=-1)
for i,m in enumerate(split_chunks):
    picked=m[i%3]
    print(picked.shape)
    temp_sections.append(picked)
temp_cos=torch.cat(temp_sections,dim=-1).unsqueeze(unsqueeze_dim)
print("[after concat, temp_cos.shape] ",temp_cos.shape)
print("""
结论：mrope就是先expand出3个维度：time,height,width的position_ids的cos和sin
（注：这三个维度的cos，sin完全相同，postion_ids都是从0开始到seq_len），
然后在每个head_dim位置上嵌入不同维度的位置信息：
即某位置特征的head_dim[16:time, 24:height, 24:width, 16:time, 24:height, 24:width]。
在`Q*K^T`时，Q的m位置与K的n位置相乘时，不同维度的信息会与对应维度相乘。
""")

[temp_cos.shape]  torch.Size([3, 1, 10, 128])
torch.Size([1, 10, 16])
torch.Size([1, 10, 24])
torch.Size([1, 10, 24])
torch.Size([1, 10, 16])
torch.Size([1, 10, 24])
torch.Size([1, 10, 24])
[after concat, temp_cos.shape]  torch.Size([1, 1, 10, 128])

结论：mrope就是先expand出3个维度：time,height,width的position_ids的cos和sin
（注：这三个维度的cos，sin完全相同，postion_ids都是从0开始到seq_len），
然后在每个head_dim位置上嵌入不同维度的位置信息：
即某位置特征的head_dim[16:time, 24:height, 24:width, 16:time, 24:height, 24:width]。
在`Q*K^T`时，Q的m位置与K的n位置相乘时，不同维度的信息会与对应维度相乘。



In [9]:
"""Qwen2VL.language model"""
llm_model=model.model

# Every optional input is left at None so the model falls back to its own
# defaults; only the already-merged embeddings are supplied.
position_ids=None
attention_mask=None
past_key_values=None
output_attentions=None
output_hidden_states=None
return_dict=None
use_cache=False

forward_kwargs = {
    "input_ids": None,
    "position_ids": position_ids,
    "attention_mask": attention_mask,
    "past_key_values": past_key_values,
    "inputs_embeds": inputs_embeds,
    "use_cache": use_cache,
    "output_attentions": output_attentions,
    "output_hidden_states": output_hidden_states,
    "return_dict": return_dict,
}
outputs = llm_model(**forward_kwargs)
# First element of the output: hidden states of the last layer.
final_hidden_states=outputs[0]
print("[final_hidden_states.shape] ",final_hidden_states.shape)

### lm_head ###
# Projects every hidden state to vocabulary logits.
lm_head = model.lm_head
logits=lm_head(final_hidden_states)
print("[logits.shape] ",logits.shape)
### lm_head ###

[final_hidden_states.shape]  torch.Size([1, 1220, 1536])
[logits.shape]  torch.Size([1, 1220, 151936])


In [None]:
# Inference: generate up to 128 new tokens for the prepared inputs.
output_ids = model.generate(**inputs, max_new_tokens=128)

# generate() returns prompt + continuation; cut each prompt off by its length.
generated_ids = []
for prompt_ids, full_ids in zip(inputs.input_ids, output_ids):
    generated_ids.append(full_ids[len(prompt_ids) :])

output_text = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(output_text)