# 一 模型权重分析

In [6]:
import torch

In [7]:
# Path to the pytorch_model.bin checkpoint of openai/clip-vit-large-patch14 on local disk.
checkpoint_clip_bin = '/mnt/workspace/models/openai/clip-vit-large-patch14/pytorch_model.bin'

In [8]:
# Load the full checkpoint, mapping every tensor to the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed torch supports it.
state_dict_clip_bin = torch.load(checkpoint_clip_bin, map_location=torch.device('cpu'))

In [10]:
# Print every parameter name (in sorted order) together with its tensor shape.
for name in sorted(state_dict_clip_bin):
    shape = state_dict_clip_bin[name].shape
    print(f"{name}, {shape}")

logit_scale, torch.Size([])
text_model.embeddings.position_embedding.weight, torch.Size([77, 768])
text_model.embeddings.position_ids, torch.Size([1, 77])
text_model.embeddings.token_embedding.weight, torch.Size([49408, 768])
text_model.encoder.layers.0.layer_norm1.bias, torch.Size([768])
text_model.encoder.layers.0.layer_norm1.weight, torch.Size([768])
text_model.encoder.layers.0.layer_norm2.bias, torch.Size([768])
text_model.encoder.layers.0.layer_norm2.weight, torch.Size([768])
text_model.encoder.layers.0.mlp.fc1.bias, torch.Size([3072])
text_model.encoder.layers.0.mlp.fc1.weight, torch.Size([3072, 768])
text_model.encoder.layers.0.mlp.fc2.bias, torch.Size([768])
text_model.encoder.layers.0.mlp.fc2.weight, torch.Size([768, 3072])
text_model.encoder.layers.0.self_attn.k_proj.bias, torch.Size([768])
text_model.encoder.layers.0.self_attn.k_proj.weight, torch.Size([768, 768])
text_model.encoder.layers.0.self_attn.out_proj.bias, torch.Size([768])
text_model.encoder.layers.0.self_attn.out

# 二 模型权重转换

In [24]:
# Build a vision-only state dict from the full CLIP checkpoint:
#   * keep tensors under 'vision_model.' (except 'vision_model.post_layernorm*'),
#   * strip the 'vision_model.' prefix and rename a leading 'encoder.' to
#     'transformer.',
#   * fuse each layer's q/k/v projections into a single 'qkv_proj' tensor,
#   * save the result in the same directory as the original checkpoint.
state_dict_clip_bin_processed = {}

prefix_len = len('vision_model.')

for k, v in state_dict_clip_bin.items():
    if k.startswith('vision_model.') and not k.startswith('vision_model.post_layernorm'):
        new_key = k[prefix_len:]
        if new_key.startswith('encoder.'):
            # count=1: only the leading path component is renamed.
            new_key = new_key.replace('encoder.', 'transformer.', 1)
        state_dict_clip_bin_processed[new_key] = v

# Derive the layer indices from the keys instead of hard-coding the layer
# count, so the cell also works for checkpoints with a different depth.
layer_prefix = 'transformer.layers.'
layer_ids = sorted({
    int(key[len(layer_prefix):].split('.', 1)[0])
    for key in state_dict_clip_bin_processed
    if key.startswith(layer_prefix)
})
if not layer_ids:
    # Fail loudly rather than saving an unfused (and probably empty) state dict.
    raise KeyError(f"no '{layer_prefix}<id>.' keys found in the vision state dict")

for layer_id in layer_ids:
    attn_prefix = f"{layer_prefix}{layer_id}.self_attn"

    # Compute both fused tensors before mutating the dict, so a missing key
    # raises KeyError without leaving this layer half-converted.
    fused = {}
    part_keys = []
    for param in ('bias', 'weight'):
        keys = [f"{attn_prefix}.{proj}_proj.{param}" for proj in ('q', 'k', 'v')]
        # Concatenate in q, k, v order along dim 0 (the output-feature axis).
        fused[f"{attn_prefix}.qkv_proj.{param}"] = torch.cat(
            [state_dict_clip_bin_processed[key] for key in keys], dim=0
        )
        part_keys.extend(keys)

    state_dict_clip_bin_processed.update(fused)
    # The separate q/k/v tensors are replaced by the fused ones.
    for key in part_keys:
        del state_dict_clip_bin_processed[key]


checkpoint_clip_pth = '/mnt/workspace/models/openai/clip-vit-large-patch14/pytorch_model_processed.pth'
torch.save(state_dict_clip_bin_processed, checkpoint_clip_pth)

In [25]:
# List the converted parameter names (sorted) with their tensor shapes.
for name in sorted(state_dict_clip_bin_processed):
    shape = state_dict_clip_bin_processed[name].shape
    print(f"{name}, {shape}")

embeddings.class_embedding, torch.Size([1024])
embeddings.patch_embedding.weight, torch.Size([1024, 3, 14, 14])
embeddings.position_embedding.weight, torch.Size([257, 1024])
embeddings.position_ids, torch.Size([1, 257])
pre_layrnorm.bias, torch.Size([1024])
pre_layrnorm.weight, torch.Size([1024])
transformer.layers.0.layer_norm1.bias, torch.Size([1024])
transformer.layers.0.layer_norm1.weight, torch.Size([1024])
transformer.layers.0.layer_norm2.bias, torch.Size([1024])
transformer.layers.0.layer_norm2.weight, torch.Size([1024])
transformer.layers.0.mlp.fc1.bias, torch.Size([4096])
transformer.layers.0.mlp.fc1.weight, torch.Size([4096, 1024])
transformer.layers.0.mlp.fc2.bias, torch.Size([1024])
transformer.layers.0.mlp.fc2.weight, torch.Size([1024, 4096])
transformer.layers.0.self_attn.out_proj.bias, torch.Size([1024])
transformer.layers.0.self_attn.out_proj.weight, torch.Size([1024, 1024])
transformer.layers.0.self_attn.qkv_proj.bias, torch.Size([3072])
transformer.layers.0.self_attn.

# 三 转换结果验证

## 3.1 out_proj.bias

In [27]:
# Original out_proj bias of vision encoder layer 0 (reference value for the renamed key).
state_dict_clip_bin['vision_model.encoder.layers.0.self_attn.out_proj.bias']

tensor([-0.0262, -0.0654,  0.0032,  ...,  0.1761, -0.0446,  0.0023])

In [28]:
# Same tensor under its converted key; it should match the original out_proj bias.
state_dict_clip_bin_processed['transformer.layers.0.self_attn.out_proj.bias']

tensor([-0.0262, -0.0654,  0.0032,  ...,  0.1761, -0.0446,  0.0023])

## 3.2 out_proj.weight

In [29]:
# Original out_proj weight of vision encoder layer 0 (reference value for the renamed key).
state_dict_clip_bin['vision_model.encoder.layers.0.self_attn.out_proj.weight']

tensor([[-6.7596e-03,  8.8043e-03, -7.9422e-03,  ..., -8.6441e-03,
         -8.7433e-03,  3.5553e-03],
        [ 1.2077e-02,  5.8784e-03,  1.1253e-02,  ..., -3.7060e-03,
          2.0008e-03,  3.8319e-03],
        [-5.2032e-03,  2.6913e-03,  1.2894e-02,  ...,  6.4812e-03,
         -3.0398e-05, -4.2796e-04],
        ...,
        [-4.5037e-04, -2.5063e-03, -3.2768e-03,  ..., -3.2768e-03,
         -1.9409e-02,  9.2545e-03],
        [-7.3624e-03,  2.8419e-03, -7.9193e-03,  ...,  4.0627e-04,
         -1.3866e-03, -6.7186e-04],
        [ 9.0408e-03,  1.5287e-03,  1.6737e-03,  ...,  2.4242e-03,
         -3.7575e-03,  4.9667e-03]])

In [30]:
# Same tensor under its converted key; it should match the original out_proj weight.
state_dict_clip_bin_processed['transformer.layers.0.self_attn.out_proj.weight']

tensor([[-6.7596e-03,  8.8043e-03, -7.9422e-03,  ..., -8.6441e-03,
         -8.7433e-03,  3.5553e-03],
        [ 1.2077e-02,  5.8784e-03,  1.1253e-02,  ..., -3.7060e-03,
          2.0008e-03,  3.8319e-03],
        [-5.2032e-03,  2.6913e-03,  1.2894e-02,  ...,  6.4812e-03,
         -3.0398e-05, -4.2796e-04],
        ...,
        [-4.5037e-04, -2.5063e-03, -3.2768e-03,  ..., -3.2768e-03,
         -1.9409e-02,  9.2545e-03],
        [-7.3624e-03,  2.8419e-03, -7.9193e-03,  ...,  4.0627e-04,
         -1.3866e-03, -6.7186e-04],
        [ 9.0408e-03,  1.5287e-03,  1.6737e-03,  ...,  2.4242e-03,
         -3.7575e-03,  4.9667e-03]])

## 3.3 qkv_proj.bias

In [31]:
# Original q_proj bias of layer 0; it is the first segment of the fused qkv_proj bias.
state_dict_clip_bin['vision_model.encoder.layers.0.self_attn.q_proj.bias']

tensor([ 1.5674, -1.6143, -0.8208,  ..., -1.2832, -0.0975,  0.7827])

In [32]:
# Original v_proj bias of layer 0; it is the last segment of the fused qkv_proj bias.
state_dict_clip_bin['vision_model.encoder.layers.0.self_attn.v_proj.bias']

tensor([-0.0277,  0.0222, -0.0353,  ...,  0.0115,  0.0107, -0.0043])

In [33]:
# Fused bias for layer 0: q, k and v biases concatenated along dim 0, in that order.
state_dict_clip_bin_processed['transformer.layers.0.self_attn.qkv_proj.bias']

tensor([ 1.5674, -1.6143, -0.8208,  ...,  0.0115,  0.0107, -0.0043])

## 3.4 qkv_proj.weight

In [34]:
# Original q_proj weight of layer 0; its rows are the first rows of the fused qkv_proj weight.
state_dict_clip_bin['vision_model.encoder.layers.0.self_attn.q_proj.weight']

tensor([[-7.0632e-05, -1.6510e-04, -7.0930e-05,  ...,  4.5090e-03,
         -2.9160e-02, -7.8201e-05],
        [-1.3733e-04,  1.2165e-04,  4.2319e-05,  ..., -1.6594e-03,
          3.1433e-02,  7.4446e-05],
        [ 4.8018e-04,  7.7963e-04, -1.0991e-04,  ..., -1.6846e-02,
          4.2999e-02,  1.5199e-04],
        ...,
        [ 6.8367e-05,  8.2791e-05,  8.7738e-05,  ..., -3.9940e-03,
          1.2596e-02, -3.9220e-05],
        [-1.5414e-04,  3.5167e-06, -2.7108e-04,  ..., -1.5259e-04,
         -3.3212e-04,  1.6868e-05],
        [-1.3053e-04,  8.8096e-05,  5.4955e-05,  ..., -1.4862e-02,
         -1.4143e-03,  4.3333e-05]])

In [35]:
# Original v_proj weight of layer 0; its rows are the last rows of the fused qkv_proj weight.
state_dict_clip_bin['vision_model.encoder.layers.0.self_attn.v_proj.weight']

tensor([[ 9.6023e-05, -1.6069e-04, -5.7364e-04,  ...,  3.5210e-03,
         -1.0292e-02,  9.0539e-05],
        [-3.4380e-04, -1.9860e-04,  5.0724e-05,  ...,  9.1410e-04,
          9.4299e-03, -8.7619e-05],
        [ 5.0259e-04, -8.7440e-05,  2.2519e-04,  ..., -1.0691e-03,
         -1.9806e-02, -1.0806e-04],
        ...,
        [ 2.1267e-04,  4.1032e-04, -7.2420e-05,  ...,  4.8027e-03,
         -1.7338e-03, -6.6102e-05],
        [ 3.0518e-04, -4.4405e-05, -2.2709e-04,  ...,  1.1551e-02,
          3.3436e-03,  7.4685e-05],
        [-2.8849e-05,  4.5919e-04,  9.3341e-05,  ..., -1.1314e-02,
          3.7670e-03, -7.7844e-05]])

In [36]:
# Fused weight for layer 0: q, k and v weights concatenated along dim 0, in that order.
state_dict_clip_bin_processed['transformer.layers.0.self_attn.qkv_proj.weight']

tensor([[-7.0632e-05, -1.6510e-04, -7.0930e-05,  ...,  4.5090e-03,
         -2.9160e-02, -7.8201e-05],
        [-1.3733e-04,  1.2165e-04,  4.2319e-05,  ..., -1.6594e-03,
          3.1433e-02,  7.4446e-05],
        [ 4.8018e-04,  7.7963e-04, -1.0991e-04,  ..., -1.6846e-02,
          4.2999e-02,  1.5199e-04],
        ...,
        [ 2.1267e-04,  4.1032e-04, -7.2420e-05,  ...,  4.8027e-03,
         -1.7338e-03, -6.6102e-05],
        [ 3.0518e-04, -4.4405e-05, -2.2709e-04,  ...,  1.1551e-02,
          3.3436e-03,  7.4685e-05],
        [-2.8849e-05,  4.5919e-04,  9.3341e-05,  ..., -1.1314e-02,
          3.7670e-03, -7.7844e-05]])