## Check env

In [1]:
import platform
machine = platform.machine()
print(f"This is a {machine} machine")

backend = 'fbgemm'
if machine == 'AMD64':
    backend = 'x86'
elif machine == 'arm64':
    backend = 'qnnpack'
print(f"Backend is {backend}")

This is a AMD64 machine
Backend is x86


## Libraries & Config

In [2]:
from dataclasses import dataclass
import torch as T
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Subset
import torch.ao.quantization as Q
from torchvision import datasets, transforms
import math
import os
import numpy as np


T.backends.quantized.engine = backend

In [3]:
@dataclass
class VisionConfig:
    num_hidden_layers: int = 12 # number of hidden layers in the encoder as in the paper
    num_channels: int = 3
    embed_dim: int = 512  # patch_size * patch_size * num_channels
    image_size: int = 32
    patch_size: int = 4
    num_attention_heads: int = 3  # embed_dim // 64
    hidden_size: int = 48
    intermediate_size: int = 144  # 4 * hidden_size
    layer_norm_eps: float = 1e-6
    attention_dropout: float = 0.0
    
model_config = VisionConfig()

@dataclass
class DatasetConfig:
    batch_size: int = 1
    subset_size: int = 100
    num_workers: int = 4
    
dataset_config = DatasetConfig()

## Load Dataset

In [27]:
# Load the dataset with float32 and int8
transform_fp32 = transforms.Compose([
    transforms.ToTensor(),
    # transforms.Normalize(
    #         mean=[0.485, 0.456, 0.406],
    #         std=[0.229, 0.224, 0.225]
    # )
])
def pil_to_tensor(img):
    return T.from_numpy(np.array(img)).permute(2, 0, 1)
transform_int8 = transforms.Compose([
    transforms.Lambda(pil_to_tensor),    
])

# Shape: (B, C, H, W)
full_dataset_fp32 = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_fp32,
)
full_dataset_int8 = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_int8,
)

# Subset for calibration
indices = np.random.choice(
    a=len(full_dataset_fp32),
    size=dataset_config.subset_size,
    replace=False
)
calibration_dataset_fp32 = Subset(full_dataset_fp32, indices)
calibration_loader_fp32 = T.utils.data.DataLoader(
    dataset=calibration_dataset_fp32,
    batch_size=dataset_config.batch_size,
    shuffle=False,
    num_workers=dataset_config.num_workers,
)

calibration_dataset_int8 = Subset(full_dataset_int8, indices)
calibration_loader_int8 = T.utils.data.DataLoader(
    dataset=calibration_dataset_int8,
    batch_size=dataset_config.batch_size,
    shuffle=False,
)

Files already downloaded and verified
Files already downloaded and verified


In [28]:
print(f"Number of data points: {len(calibration_dataset_fp32)}\nNumber of calibration batches: {len(calibration_loader_fp32)}")

Number of data points: 100
Number of calibration batches: 100


In [29]:
# float32 input for check the model
input_tensor_fp32 = next(iter(calibration_loader_fp32))[0]
print(f"Float32 input shape: {input_tensor_fp32.shape}\n{input_tensor_fp32[0, 0, 0]}")

# int8 input for quantization
input_tensor_int8 = next(iter(calibration_loader_int8))[0]
print(f"Int8 input shape: {input_tensor_int8.shape}\n{input_tensor_int8[0, 0, 0]}")

Float32 input shape: torch.Size([1, 3, 32, 32])
tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0039, 0.0039, 0.0039,
        0.0039, 0.0039, 0.0039, 0.0039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0039,
        0.0039, 0.0039, 0.0039, 0.0078, 0.0196, 0.0275, 0.0235, 0.0196, 0.0157,
        0.0157, 0.0157, 0.0157, 0.0157, 0.0196])
Int8 input shape: torch.Size([1, 3, 32, 32])
tensor([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 2, 5, 7,
        6, 5, 4, 4, 4, 4, 4, 5], dtype=torch.uint8)


## Test

In [5]:
# define a floating point model where some layers could be statically quantized
class M(T.nn.Module):
    def __init__(self):
        super().__init__()
        # QuantStub converts tensors from floating point to quantized
        self.quant = Q.QuantStub()
        self.conv = T.nn.Conv2d(1, 1, 1)
        self.relu = T.nn.ReLU()
        # DeQuantStub converts tensors from quantized to floating point
        self.dequant = Q.DeQuantStub()

    def forward(self, x):
        # manually specify where tensors will be converted from floating
        # point to quantized in the quantized model
        x = self.quant(x)
        x = self.conv(x)
        x = self.relu(x)
        # manually specify where tensors will be converted from quantized
        # to floating point in the quantized model
        x = self.dequant(x)
        return x

# create a model instance
model_fp32 = M()

# model must be set to eval mode for static quantization logic to work
model_fp32.eval()


M(
  (quant): QuantStub()
  (conv): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [None]:
# attach a global qconfig, which contains information about what kind
# of observers to attach. Use 'x86' for server inference and 'qnnpack'
# for mobile inference. Other quantization configurations such as selecting
# symmetric or asymmetric quantization and MinMax or L2Norm calibration techniques
# can be specified here.
# Note: the old 'fbgemm' is still available but 'x86' is the recommended default
# for server inference.
# model_fp32.qconfig = Q.get_default_qconfig('fbgemm')
model_fp32.qconfig = Q.get_default_qconfig(backend)

## 運算融合
# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
model_fp32_fused = Q.fuse_modules(model_fp32, [['conv', 'relu']])

# Prepare the model for static quantization. This inserts observers in
# the model that will observe activation tensors during calibration.
model_fp32_prepared = Q.prepare(model_fp32_fused)

# calibrate the prepared model to determine quantization parameters for activations
# in a real world setting, the calibration would be done with a representative dataset
input_fp32 = T.randn(4, 1, 4, 4)
model_fp32_prepared(input_fp32)

# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and bias value to be
# used with each activation tensor, and replaces key operators with quantized
# implementations.
model_int8 = Q.convert(model_fp32_prepared)

# run the model, relevant calculations will happen in int8
res = model_int8(input_fp32)

In [12]:
model_int8

M(
  (quant): Quantize(scale=tensor([0.0233]), zero_point=tensor([107]), dtype=torch.quint8)
  (conv): QuantizedConvReLU2d(1, 1, kernel_size=(1, 1), stride=(1, 1), scale=0.007877849042415619, zero_point=0)
  (relu): Identity()
  (dequant): DeQuantize()
)

## Embeddings

In [None]:
class VisionEmbeddings(nn.Module):
  def __init__(self, config: VisionConfig):
    super().__init__()
    self.config = config

    self.num_channels = config.num_channels  # 3 for RGB
    self.embed_dim = config.embed_dim  # 512
    self.image_size = config.image_size  # 32
    self.patch_size = config.patch_size  # 4

    self.patch_embedding = nn.Conv2d(
      in_channels=self.num_channels,
      out_channels=self.embed_dim,
      kernel_size=self.patch_size,
      stride=self.patch_size,
      padding=0,
    )

    self.num_patches = (self.image_size // self.patch_size) ** 2  # （32/4）^2 = 64
    self.num_positions = self.num_patches
    self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
    self.register_buffer(
      "position_ids",
      T.arange(self.num_positions).expand((1, -1)),
      persistent=False,
    )
    
    self.quant = Q.QuantStub()
    self.dequant = Q.DeQuantStub()

  def forward(self, pixel_values: T.FloatTensor) -> T.Tensor:
    # B, C, H, W = pixel_values.shape
    pixel_values = self.quant(pixel_values)
    patch_embeds = self.patch_embedding(pixel_values)
    patch_embeds = self.dequant(patch_embeds)
    
    embeddings = patch_embeds.flatten(start_dim=2, end_dim=-1)
    embeddings = embeddings.transpose(1, 2)
    embeddings = embeddings + self.position_embedding(self.position_ids)
    return embeddings

In [None]:
embd_fp32 = VisionEmbeddings(model_config).eval()
print(f"Shape: {embd_fp32(input_tensor_fp32).shape}")

Shape: torch.Size([1, 64, 512])


### Quantize the VisionEmbeddings module

In [14]:
def embd_calibrate(model, data_loader):
    model.eval()
    with T.no_grad():
        for img, _ in data_loader:
            model(img)

In [None]:
def quantize_model(model: nn.Module, model_name: str, calibrate_fn: callable) -> nn.Module:
    model.qconfig = Q.get_default_qconfig(backend)
    if model_name == 'embd':
        model.position_embedding.qconfig = Q.float_qparams_weight_only_qconfig

    # Prepare the model for static quantization. This inserts observers in
    # the model that will observe activation tensors during calibration.
    model = Q.prepare(model)
    calibrate_fn(model, calibration_loader_fp32)
    model = Q.convert(model)
    return model

embd_int8 = quantize_model(
    model=embd_fp32,
    model_name='embd',
    calibrate_fn=embd_calibrate
)

In [16]:
embd_int8

VisionEmbeddings(
  (patch_embedding): QuantizedConv2d(3, 512, kernel_size=(4, 4), stride=(4, 4), scale=0.02934972196817398, zero_point=69)
  (position_embedding): QuantizedEmbedding(num_embeddings=64, embedding_dim=512, dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams)
  (quant): Quantize(scale=tensor([0.0079]), zero_point=tensor([0]), dtype=torch.quint8)
  (dequant): DeQuantize()
)

### Save Input Data

In [None]:
def save_data(
    data_loader: T.utils.data.DataLoader,
    model: nn.Module,
    dir_path: str = "result/input",
) -> None:
    scale = model.quant.scale
    zero_point = model.quant.zero_point
    if scale is not None:
        scale = scale.detach().numpy()
        index =  math.ceil(math.log2(0.5/scale.item()))
        scale_file_path = os.path.join(dir_path, "scale.npy")
        index_file_path = os.path.join(dir_path, "index.npy")
        np.save(scale_file_path, scale)
        np.save(index_file_path, index)
    if zero_point is not None:
        zero_point = zero_point.detach().numpy()
        zero_point_file_path = os.path.join(dir_path, "zero_point.npy")
        np.save(zero_point_file_path, zero_point)
    datas_int8 = []
    for img_fp32, idx in data_loader:
        data_fp32 = embd_int8(img_fp32)
        data_int8 = (T.round(data_fp32 / scale) + zero_point).to(T.int8)
        datas_int8.append(data_int8.numpy())
    datas_int8_np = np.stack(datas_int8, axis=0)
    print(f"Shape: {datas_int8_np.shape}")
    data_file_path = os.path.join(dir_path, "input.npy")
    np.save(data_file_path, datas_int8_np)
    
    print(f"Saved input data to {dir_path}/")

save_data(
    data_loader=calibration_loader_fp32,
    model=embd_int8,
    dir_path="result/input",
)

Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Type: torch.int8
Shape: (100, 1, 64, 512)
Saved input data to result/input/


In [39]:
# Check input type and shape
input_data = np.load("result/input/input.npy")
print(f"Type: {type(input_data[0, 0, 0, 0])}, Shpae: {input_data.shape}\n{input_data[:, 0, 0, 0]}")

Type: <class 'numpy.int8'>, Shpae: (100, 1, 64, 512)
[-120  -67  -98  -84  -95  -88  -67 -105  -77 -127  -81 -116 -120  -67
 -127  -74  -67  -67  -88  -67  -67 -123  -81 -109  -95  -81 -123 -102
  -81 -102  -95 -105  -98  -88  -67  -88  -70  -95 -123 -123  -74  -95
  -91 -105  -67 -105 -123  119  -84  -70 -127  -77 -109  -74  -81  -74
 -102 -116 -120 -127 -113  -98  -77  -81 -120  -67  -95  -95  -98 -113
 -116  -74 -105  -95 -116  -77 -123 -123  -95  -88  -81 -102  -98  -91
 -116  -77  -74  -67 -113  -70 -105  -98 -127  -88 -113 -109  -88  -70
  -95  -95]


## MSA/Attention

In [8]:
class Attention(nn.Module):
    def __init__(self, config: VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
        
        self.quant = Q.QuantStub()
        self.dequant = Q.DeQuantStub()

    def forward(self, hidden_states):
        # the hidden states are the embeddings of the patches, so (batch_size, num_patches, embed_dim)
        B, T, E = hidden_states.shape
        hidden_states = self.quant(hidden_states)  # int8
        q_states = self.q_proj(hidden_states)
        k_states = self.k_proj(hidden_states)
        v_states = self.v_proj(hidden_states)

        q_states = q_states.view(B, T, self.num_heads, E // self.num_heads).transpose(1, 2)
        k_states = k_states.view(B, T, self.num_heads, E // self.num_heads).transpose(1, 2)
        v_states = v_states.view(B, T, self.num_heads, E // self.num_heads).transpose(1, 2)
        
        # int8 quantization
        q_states = self.quant(q_states)
        k_states = self.quant(k_states)
        v_states = self.quant(v_states)

        attn_weights = (q_states @ k_states.transpose(-2, -1)) * (1.0 / math.sqrt(k_states.size(-1)))
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dequant(attn_weights)  # float32
        attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)
        
        attn_outs = attn_weights @ v_states
        
        attn_outs = self.quant(attn_outs)  # int8
        attn_outs = attn_outs.transpose(1, 2)
        attn_outs = attn_outs.reshape(B, T, E).contiguous()
        attn_outs = self.out_proj(attn_outs)
        attn_outs = self.dequant(attn_outs)
        return attn_outs

In [None]:
attn_32 = Attention(model_config).eval()
attn_32

Attention(
  (k_proj): Linear(in_features=48, out_features=48, bias=True)
  (v_proj): Linear(in_features=48, out_features=48, bias=True)
  (q_proj): Linear(in_features=48, out_features=48, bias=True)
  (out_proj): Linear(in_features=48, out_features=48, bias=True)
  (quant): QuantStub()
  (dequant): DeQuantStub()
)

### Quantize the attention module

In [None]:
def attn_calibrate(model, data_loader):
    model.eval()
    with T.no_grad():
        for img, _ in data_loader:
            model(embd_int8(img))

In [18]:
attn_int8 = quantize_model(attn_32, 'attn', attn_calibrate)
attn_int8

torch.float32




Attention(
  (k_proj): QuantizedLinear(in_features=48, out_features=48, scale=1.0, zero_point=0, qscheme=torch.per_tensor_affine)
  (v_proj): QuantizedLinear(in_features=48, out_features=48, scale=1.0, zero_point=0, qscheme=torch.per_tensor_affine)
  (q_proj): QuantizedLinear(in_features=48, out_features=48, scale=1.0, zero_point=0, qscheme=torch.per_tensor_affine)
  (out_proj): QuantizedLinear(in_features=48, out_features=48, scale=1.0, zero_point=0, qscheme=torch.per_tensor_affine)
  (quant): Quantize(scale=tensor([1.]), zero_point=tensor([0]), dtype=torch.quint8)
  (dequant): DeQuantize()
)

In [None]:
QK_output = np.load('QK_output.npy')
QK_output[0]

array([[[[-0.5777374 , -0.20340034, -0.26059115, ..., -0.1801702 ,
           0.05257939, -0.37605557],
         [ 0.07773671, -0.4927413 , -0.5331277 , ..., -0.17008369,
          -0.1610846 ,  0.2325479 ],
         [-1.2353468 , -0.5868912 , -0.8142619 , ...,  0.5414332 ,
          -0.3233808 , -0.12766021],
         ...,
         [-0.4429888 , -0.03145947, -0.5798223 , ..., -0.11446146,
          -0.10229073,  0.15403038],
         [-0.5578185 ,  0.10655621,  0.31388047, ...,  0.40862387,
          -0.10129814,  0.2565011 ],
         [ 0.08744194, -0.04625409, -0.15002586, ..., -0.10700875,
          -0.2133356 ,  0.03119151]],

        [[-0.6749521 ,  0.07763428, -0.20227295, ...,  0.13347055,
           0.37727988,  0.09413266],
         [ 0.4384782 ,  0.18808739,  0.41153464, ...,  0.00200756,
          -0.05175502,  0.18237492],
         [ 0.03340231, -0.31025946,  0.48925775, ..., -0.10289848,
          -0.19903986, -0.2742836 ],
         ...,
         [-0.46215126, -0.1157022 

### Old Quantized MSA Layer

In [None]:
attn_int8

Attention(
  (k_proj): QuantizedLinear(in_features=768, out_features=768, scale=0.017669403925538063, zero_point=130, qscheme=torch.per_tensor_affine)
  (v_proj): QuantizedLinear(in_features=768, out_features=768, scale=0.01814001053571701, zero_point=134, qscheme=torch.per_tensor_affine)
  (q_proj): QuantizedLinear(in_features=768, out_features=768, scale=0.018048712983727455, zero_point=131, qscheme=torch.per_tensor_affine)
  (out_proj): QuantizedLinear(in_features=768, out_features=768, scale=0.0008625031914561987, zero_point=131, qscheme=torch.per_tensor_affine)
  (quant): Quantize(scale=tensor([0.0274]), zero_point=tensor([127]), dtype=torch.quint8)
  (dequant): DeQuantize()
)

## MLP

In [None]:
class MLP(nn.Module):
    def __init__(self, config: VisionConfig):
        super().__init__()
        self.config = config
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.quant = Q.QuantStub()
        self.dequant = Q.DeQuantStub()

    def forward(self, hidden_states: T.Tensor) -> T.Tensor:
        hidden_states = self.quant(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = nn.functional.gelu(hidden_states, approximate="tanh")
        hidden_states = self.fc2(hidden_states)
        hidden_states = self.dequant(hidden_states)
        return hidden_states

mlp = MLP(model_config)
print(f"MLP: {mlp}\nShape: {mlp(embd_fp32(input_tensor_fp32[:1])).shape}")

MLP: MLP(
  (fc1): Linear(in_features=48, out_features=144, bias=True)
  (fc2): Linear(in_features=144, out_features=48, bias=True)
  (quant): QuantStub()
  (dequant): DeQuantStub()
)
Shape: torch.Size([1, 64, 48])


## Encoder

In [19]:
class EncoderLayer(nn.Module):
    def __init__(self, config: VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Attention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = MLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        
        # quantization
        self.quant = T.ao.quantization.QuantStub()
        self.dequant = T.ao.quantization.DeQuantStub()


    def forward(self, hidden_states):
        hidden_states = self.quant(hidden_states)
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.self_attn(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.dequant(hidden_states)

        hidden_states = self.quant(hidden_states)
        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.dequant(hidden_states)
        return hidden_states

encoder_layer_32 = EncoderLayer(VisionConfig(hidden_size=768, intermediate_size=3072))
encoder_layer_32(T.randn(1, 196, 768)).shape

torch.Size([1, 196, 768])

In [24]:
encoder_layer_32.qconfig = T.ao.quantization.get_default_qconfig(backend)

# Prepare the model for static quantization. This inserts observers in
# the model that will observe activation tensors during calibration.
encoder_layer_32_prepared = T.ao.quantization.prepare(encoder_layer_32)

# calibrate the prepared model to determine quantization parameters for activations
# in a real world setting, the calibration would be done with a representative dataset
input_fp32 = T.randn(1, 196, 768)
encoder_layer_32_prepared(input_fp32)

# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and bias value to be
# used with each activation tensor, and replaces key operators with quantized
# implementations.
encoder_layer_int8 = T.ao.quantization.convert(encoder_layer_32_prepared)
encoder_layer_int8

EncoderLayer(
  (self_attn): Attention(
    (k_proj): QuantizedLinear(in_features=768, out_features=768, scale=0.017260996624827385, zero_point=130, qscheme=torch.per_tensor_affine)
    (v_proj): QuantizedLinear(in_features=768, out_features=768, scale=0.01868443191051483, zero_point=128, qscheme=torch.per_tensor_affine)
    (q_proj): QuantizedLinear(in_features=768, out_features=768, scale=0.017791934311389923, zero_point=125, qscheme=torch.per_tensor_affine)
    (out_proj): QuantizedLinear(in_features=768, out_features=768, scale=0.0009516198770143092, zero_point=124, qscheme=torch.per_tensor_affine)
    (quant): Quantize(scale=tensor([0.0284]), zero_point=tensor([127]), dtype=torch.quint8)
    (dequant): DeQuantize()
  )
  (layer_norm1): QuantizedLayerNorm((768,), eps=1e-06, elementwise_affine=True)
  (mlp): MLP(
    (fc1): QuantizedLinear(in_features=768, out_features=3072, scale=0.017856968566775322, zero_point=128, qscheme=torch.per_tensor_affine)
    (fc2): QuantizedLinear(in_fe

## Save Weights

In [74]:
def save_model(model: nn.Module, model_name: str):
    save_dir = f"{model_name}_quant_weights"
    os.makedirs(save_dir, exist_ok=True)
    
    def extract_packed_params(qlinear):
        """Extract packed weight, bias, scale, zero_point from quantized Linear layer"""
        packed_weight, bias = qlinear.weight(), qlinear.bias()
        scale = qlinear.scale
        zero_point = qlinear.zero_point
        return packed_weight, bias, scale, zero_point

    for name, param in model.state_dict().items():
        if isinstance(param, T.Tensor):  # 只存 Tensor，避免 dtype 錯誤
            print(f"{name}: {param.shape}")
            layer_name = name.split(".")[0]
            
            # 處理量化 Linear 層，提取 `_packed_params`
            layer = getattr(model, layer_name)
            if isinstance(layer, T.nn.quantized.Linear):
                if hasattr(model, layer_name):
                    packed_weight, bias, scale, zero_point = extract_packed_params(layer)
                    # print(f"{packed_weight}, {bias}, {scale}, {zero_point}")

                    # 存儲 `int8` 權重
                    int8_weight = packed_weight.int_repr().cpu().numpy()
                    np.save(os.path.join(save_dir, f"{name}.npy"), int8_weight)
                    print(f"✅ Saved {layer_name}.weight as int8 (scale={scale}, zero_point={zero_point})")

                    # 存儲 `float32` bias
                    np.save(os.path.join(save_dir, f"{name}_bias.npy"), bias.detach().numpy())
                    print(f"✅ Saved {layer_name}.bias as float32")
            # 如果是量化張量，先轉為 int8
            elif param.dtype in [T.qint8, T.quint8]:
                # print(param)
                zero_point = 0
                if param.qscheme == T.per_tensor_affine:
                    zero_point = param.q_zero_point()
                    int8_tensor = (param.int_repr() - zero_point).cpu().numpy()  # 轉成 int8
                else:
                    int8_tensor = param.int_repr().cpu().numpy()
                np.save(os.path.join(save_dir, f"{name}.npy"), int8_tensor)  # 存成 .npy
                print(f"✅ Saved {name} as int8 (zero_point={zero_point})")
            else:
                weight_arr = param.detach().numpy()
                np.save(os.path.join(save_dir, f"{name}.npy"), weight_arr)
                print(f"✅ Saved {name} as float32")
        else:
            print(f"⚠️ Skip {name} (Not Tensor, type: {type(param)})")

    print(f" {model_name} weights have been saved!")

In [73]:
save_model(embd_int8, 'Embeddings')

patch_embedding.weight: torch.Size([48, 3, 4, 4])
✅ Saved patch_embedding.weight as int8 (zero_point=0)
patch_embedding.bias: torch.Size([48])
✅ Saved patch_embedding.bias as float32
patch_embedding.scale: torch.Size([])
✅ Saved patch_embedding.scale as float32
patch_embedding.zero_point: torch.Size([])
✅ Saved patch_embedding.zero_point as float32
⚠️ Skip position_embedding._packed_params.dtype (Not Tensor, type: <class 'torch.dtype'>)
position_embedding._packed_params._packed_weight: torch.Size([64, 48])
✅ Saved position_embedding._packed_params._packed_weight as int8 (zero_point=0)
quant.scale: torch.Size([1])
✅ Saved quant.scale as float32
quant.zero_point: torch.Size([1])
✅ Saved quant.zero_point as float32
 Embeddings weights have been saved!


In [72]:
save_model(attn_int8, 'Attention')

k_proj.scale: torch.Size([])
✅ Saved k_proj.weight as int8 (scale=0.02344927377998829, zero_point=124)
✅ Saved k_proj.bias as float32
k_proj.zero_point: torch.Size([])
✅ Saved k_proj.weight as int8 (scale=0.02344927377998829, zero_point=124)
✅ Saved k_proj.bias as float32
⚠️ Skip k_proj._packed_params.dtype (Not Tensor, type: <class 'torch.dtype'>)
⚠️ Skip k_proj._packed_params._packed_params (Not Tensor, type: <class 'tuple'>)
v_proj.scale: torch.Size([])
✅ Saved v_proj.weight as int8 (scale=0.022955600172281265, zero_point=131)
✅ Saved v_proj.bias as float32
v_proj.zero_point: torch.Size([])
✅ Saved v_proj.weight as int8 (scale=0.022955600172281265, zero_point=131)
✅ Saved v_proj.bias as float32
⚠️ Skip v_proj._packed_params.dtype (Not Tensor, type: <class 'torch.dtype'>)
⚠️ Skip v_proj._packed_params._packed_params (Not Tensor, type: <class 'tuple'>)
q_proj.scale: torch.Size([])
✅ Saved q_proj.weight as int8 (scale=0.020705705508589745, zero_point=123)
✅ Saved q_proj.bias as float32