建立一個torch model給hls4ml轉換

一開始不會有model4hls.pth，這個是從baseline model的transformer encoder取出來的weight

In [1]:
import os
import random
import argparse
import json
from warnings import warn
from typing import List, Dict
from pathlib import Path
from functools import partial
from textwrap import wrap
from contextlib import suppress
from statistics import mean, stdev

import numpy as np
from tqdm import tqdm
import wandb
import matplotlib.pyplot as plt
from mpl_toolkits import axes_grid1
from einops import rearrange, reduce
from timm.models import create_model
from timm.utils import accuracy
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import re

import models
from datasets import build_dataset
from train import get_args_parser, adjust_config, set_seed, set_run_name, count_params

  def vit_tiny_patch16_224(pretrained: bool = False, **kwargs):
  def vit_small_patch16_224(pretrained: bool = False, **kwargs):
  def vit_small_patch8_224(pretrained: bool = False, **kwargs):
  def vit_base_patch16_224(pretrained: bool = False, **kwargs):
  def vit_base_patch8_224(pretrained: bool = False, **kwargs):
  def vit_large_patch16_224(pretrained: bool = False, **kwargs):
  def vit_large_patch14_224(pretrained: bool = False, **kwargs):
  def vit_huge_patch14_224(pretrained: bool = False, **kwargs):
  def vit_base_patch16_224_miil(pretrained: bool = False, **kwargs):
  def vit_medium_patch16_gap_240(pretrained: bool = False, **kwargs):
  def vit_medium_patch16_gap_256(pretrained: bool = False, **kwargs):
  def vit_base_patch16_gap_224(pretrained: bool = False, **kwargs):
  def vit_huge_patch14_gap_224(pretrained: bool = False, **kwargs):
  def vit_giant_patch16_gap_224(pretrained: bool = False, **kwargs):
  def vit_base_patch16_clip_224(pretrained: bool = False, **kwargs):
  d

In [2]:
parser = argparse.ArgumentParser('DeiT training and evaluation script', parents=[get_args_parser()])
parser.add_argument('--compute_attention_average', action='store_true')
parser.add_argument('--compute_attention_cka', action='store_true')
parser.set_defaults(output_dir='results_inference')
args = parser.parse_args(args=[])

# args.model = 'topk_deit_tiny_patch16_224.fb_in1k'
args.model = 'evit_deit_tiny_patch16_224.fb_in1k'
# args.model = 'tome_deit_tiny_patch16_224.fb_in1k'
args.cfg = 'configs/cub_test3.4p_ft_weakaugs.yaml'
# args.cfg = 'configs/cotton_ft_weakaugs.yaml'
args.device = 'cpu'
args.keep_rate = [0.7]
args.reduction_loc = [3, 6, 9]
args.train_trainval = True
args.input_size = 224
args.model_depth = 12
# clca
# args.ifa_head = True
# args.clc = True
# args.num_clr = 1
adjust_config(args)
# args.finetune = './results_tiny/{}_topk_deit_tiny_patch16_224.fb_in1k_61.pth'.format(args.dataset_name)
# args.finetune = './results_tiny/{}_topk_deit_tiny_patch16_224.fb_in1k_{}_61.pth'.format(args.dataset_name, args.keep_rate[0])
# args.finetune = './results_tiny/{}_topk_deit_tiny_patch16_224.fb_in1k_{}_cla_clc_1_61.pth'.format(args.dataset_name, args.keep_rate[0])

model_name = args.model  # 例如 'evit_deit_tiny_patch16_224.fb_in1k'
enable_evit = re.search(r'evit', model_name) or re.search(r'tome', model_name)
enable_tome = re.search(r'tome', model_name)
# if enable_evit:
#     print('model 包含 evit')


{'dataset_name': 'cub', 'dataset_root_path': '../../data/cub/CUB_200_2011', 'df_train': 'train.csv', 'df_trainval': 'train_val.csv', 'df_val': 'val.csv', 'df_test': 'test_100.csv', 'folder_train': 'images', 'folder_val': 'images', 'folder_test': 'images', 'df_classid_classname': 'classid_classname.csv'}
{'pretrained': True}
{'horizontal_flip': True}


In [3]:
set_seed(args.seed)

dataset_train, args.num_classes = build_dataset(is_train=True, args=args)
dataset_val, _ = build_dataset(is_train=False, args=args)

sampler_train = torch.utils.data.RandomSampler(dataset_train)
sampler_val = torch.utils.data.SequentialSampler(dataset_val)

train_loader = torch.utils.data.DataLoader(
    dataset_train, sampler=sampler_train,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    pin_memory=args.pin_mem,
    drop_last=False,
)
test_loader = torch.utils.data.DataLoader(
    dataset_val, sampler=sampler_val,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    pin_memory=args.pin_mem,
    drop_last=False
)

if args.finetune and args.ifa_head and args.clc:
    args.setting = 'ft_clca'
elif args.finetune and args.ifa_head:
    args.setting = 'ft_cla'
elif args.finetune:
    args.setting = 'ft_bl'
else:
    args.setting = 'fz_bl'

print(f"Creating model: {args.model}")
model = create_model(
    args.model,
    pretrained=True,
    pretrained_cfg=None,
    pretrained_cfg_overlay=None,
    num_classes=1000,
    drop_rate=args.drop,
    drop_path_rate=args.drop_path,
    drop_block_rate=None,
    img_size=args.input_size,
    args = args
)
if args.dataset_name.lower() != "imagenet":
    model.reset_classifier(args.num_classes)
if args.num_clr:
    model.add_clr(args.num_clr)

if args.finetune:
    checkpoint = torch.load(args.finetune, map_location='cpu')
    # model.load_state_dict(checkpoint['model'], strict=True)

    checkpoint_model = checkpoint['model']
    state_dict = model.state_dict()
    for k in ['head.weight', 'head.bias', 'head_dist.weight', 'head_dist.bias']:
        if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape:
            print(f"Removing key {k} from pretrained checkpoint")
            del checkpoint_model[k]

    # interpolate position embedding
    pos_embed_checkpoint = checkpoint_model['pos_embed']
    embedding_size = pos_embed_checkpoint.shape[-1]
    num_patches = model.patch_embed.num_patches
    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
    # height (== width) for the checkpoint position embedding
    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
    # height (== width) for the new position embedding
    new_size = int(num_patches ** 0.5)
    # class_token and dist_token are kept unchanged
    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
    # only the position tokens are interpolated
    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
    pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
    pos_tokens = torch.nn.functional.interpolate(
        pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
    new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
    checkpoint_model['pos_embed'] = new_pos_embed

    model.load_state_dict(checkpoint_model, strict=False)

model.to(args.device)
model.eval()

Compose(
    Resize(size=(256, 256), interpolation=bicubic, max_size=None, antialias=True)
    RandomCrop(size=(224, 224), padding=None)
    RandomHorizontalFlip(p=0.5)
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)
Compose(
    Resize(size=(256, 256), interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)
Creating model: evit_deit_tiny_patch16_224.fb_in1k
[0.7, 0.7, 0.7] [3, 6, 9]


EViT(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (pre_softmax): Identity()
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
      

### 建立`model4hls`使得HLS4ML可以識別，並轉移DeiT-T `model`權重至`model4hls`

In [4]:
class Transformer4HLS(torch.nn.Module):
    def __init__(self, d_model, nhead, num_encoder_layers, dim_feedforward, dropout, activation, norm_first, device, ifa_head, num_clr=0):
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.num_encoder_layers = num_encoder_layers
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        self.activation = activation
        self.norm_first = norm_first
        self.device = device
        self.ifa_head = ifa_head
        self.num_clr = num_clr

        self._init_transformer()
        # if num_clr > 0:
        #     self._init_clr()

    def _init_transformer(self):
        # norm = nn.LayerNorm(self.d_model)
        if self.ifa_head:
            norm = None
        else:
            norm = nn.LayerNorm(self.d_model)
        self.transformer_encoder = nn.TransformerEncoder(
                                    nn.TransformerEncoderLayer(d_model=self.d_model, 
                                                                nhead=self.nhead,
                                                                dim_feedforward=self.dim_feedforward,
                                                                dropout=self.dropout,
                                                                activation=self.activation,
                                                                norm_first=self.norm_first,
                                                                device=self.device),
                                    self.num_encoder_layers,
                                    norm=norm
                                    )
    # def _init_clr(self):
    #     self.clr = nn.Parameter(torch.zeros(1, self.num_clr, self.d_model))

    def forward(self, src):  
        output = self.transformer_encoder(src)
        return output

torch.manual_seed(0)
model4hls = Transformer4HLS(d_model=192, 
                          nhead=3, 
                          num_encoder_layers=args.model_depth, 
                          dim_feedforward=768, 
                          dropout=0, 
                          activation='gelu', 
                          norm_first=True, 
                          device='cpu',
                          ifa_head=args.ifa_head,
                          num_clr=args.num_clr if args.num_clr else 0)

model4hls.eval()

for i in range(args.model_depth):
    model4hls.transformer_encoder.layers[i].self_attn.in_proj_weight    = model.blocks[i].attn.qkv.weight
    model4hls.transformer_encoder.layers[i].self_attn.in_proj_bias      = model.blocks[i].attn.qkv.bias
    model4hls.transformer_encoder.layers[i].self_attn.out_proj.weight   = model.blocks[i].attn.proj.weight
    model4hls.transformer_encoder.layers[i].self_attn.out_proj.bias     = model.blocks[i].attn.proj.bias
    model4hls.transformer_encoder.layers[i].linear1.weight              = model.blocks[i].mlp.fc1.weight
    model4hls.transformer_encoder.layers[i].linear1.bias                = model.blocks[i].mlp.fc1.bias
    model4hls.transformer_encoder.layers[i].linear2.weight              = model.blocks[i].mlp.fc2.weight
    model4hls.transformer_encoder.layers[i].linear2.bias                = model.blocks[i].mlp.fc2.bias
    model4hls.transformer_encoder.layers[i].norm1.weight                = model.blocks[i].norm1.weight
    model4hls.transformer_encoder.layers[i].norm1.bias                  = model.blocks[i].norm1.bias
    model4hls.transformer_encoder.layers[i].norm2.weight                = model.blocks[i].norm2.weight
    model4hls.transformer_encoder.layers[i].norm2.bias                  = model.blocks[i].norm2.bias

if not args.ifa_head:
    model4hls.transformer_encoder.norm.weight   = model.norm.weight
    model4hls.transformer_encoder.norm.bias     = model.norm.bias

# if args.num_clr and hasattr(model, 'clr'):
#     print(model4hls.clr.data)
#     model4hls.clr.data = model.clr.data.clone()
#     print(f"Transferred CLR tokens: {model.clr.shape} -> {model4hls.clr.shape}")
    
    # print(model4hls.clr.data)

# torch.save(model4hls, './model4hls_{}.pth'.format(args.input_size))



### 比較model和model4hls的輸出

In [5]:
# for idx in range(1):
#     random_tensor = torch.randn(1, 3, args.input_size, args.input_size)
#     # 將 random_tensor 移動到與模型相同的設備
#     random_tensor = random_tensor.to(args.device)
#     model4hls.to(args.device)
#     # print(random_tensor)
#     with torch.no_grad():
#         x = model.patch_embed(random_tensor)
#         x = model._pos_embed(x)
#         x = model.patch_drop(x)
#         x = model.norm_pre(x)
#         print('Input shape of encoders = {}'.format(x.shape))
#         out = x
#         out2 = x
#         # out, left_token, sample_idx, compl = model.blocks[0](x)
#         # out2 = model4hls.transformer_encoder.layers[0](x.permute(1, 0, 2))
#         for i, blk in enumerate(model.blocks):
#             # print('Processing block {}'.format(i))
#             if enable_evit:
#                 out, left_token, sample_idx, compl = blk(out) # for evit
#             else:
#                 out, left_token, sample_idx = blk(out) # for topk  
#         out2 = model4hls(out2.permute(1, 0, 2))
#         out2 = out2.permute(1, 0, 2)
#         if not args.ifa_head:
#             out = model.norm(out)
#         print(out.shape)
#         print(out2.shape)
#         print(out[0][0][0:3])
#         print(out2[0][0][0:3])
        # difference = (out - out2).abs().max()
        
        # print('Difference between pytorch model and model4hls = {}'.format(difference))

### 生成`transformer_quant_config`量化transformer encoder，並calibrate quantized model觀察quantizer的數值範圍並重新生成calibated `transformer_quant_config`
#### Tips : 由於calibration可能會很久(取決於使用多大的calibation dataset)，建議將calibrated `transformer_quant_config`存檔

In [6]:
from quantizers import *
from synchronizer import *
from quantizers_old import calibrate_transformer as old_calibrate_transformer
import hls4ml
import json
import copy
from pprint import pprint

def add_topk_cfg_if_needed(qcfg_layer: dict):
    if 'topk' not in qcfg_layer:
        qcfg_layer['topk'] = {
            'input':  {'bitwidth': 18, 'int_bitwidth': 8},
        }

def add_clc_push_cfg_if_needed(qcfg_layer: dict):
    if 'clc_push' not in qcfg_layer:
        qcfg_layer['clc_push'] = {
            'input':  {'bitwidth': 18, 'int_bitwidth': 8},
        }

def add_clc_recover_cfg_if_needed(qcfg_layer: dict):
    if 'clc_recover' not in qcfg_layer:
        qcfg_layer['clc_recover'] = {
            'input':  {'bitwidth': 18, 'int_bitwidth': 8},
        }

def inject_reduction_and_clc_to_quant_cfg(transformer_quant_config: dict,
                                          num_layers: int,
                                          reduction_loc: list[int],
                                          keep_rates: list[float],
                                          use_clc: bool,
                                          clc_recover_at_last: bool):
    if keep_rates:
            if len(keep_rates) == 1 and len(reduction_loc) > 1:
                keep_rates = keep_rates * len(reduction_loc)
            elif len(keep_rates) == len(reduction_loc):
                keep_rates = keep_rates
            else:
                keep_rates = (keep_rates * ((len(reduction_loc) // len(keep_rates)) + 1))[:len(reduction_loc)]
    print(f"Final keep rates for reduction layers: {keep_rates}")

    for loc in reduction_loc:
        if loc < num_layers:
            print(f"Injecting TopK quant config at layer {loc} with keep_rate {keep_rates[reduction_loc.index(loc)] if keep_rates else 'N/A'}")
            # if keep_rates[reduction_loc.index(loc)] < 1.0:
            add_topk_cfg_if_needed(transformer_quant_config[loc])

    # 2) CLC：在「push 發生到 recover 之前的層」加 clc_push，
    #         並在 recover 的層加 clc_recover
    if use_clc:
        if clc_recover_at_last:
            recovery_layers = reduction_loc + [num_layers - 2] if num_layers >= 2 else reduction_loc
        else:
            recovery_layers = reduction_loc
            
        last_recover = max(recovery_layers)
        for i in range(last_recover):  # < last_recover 的層都可能 push
            if i < num_layers:
                add_clc_push_cfg_if_needed(transformer_quant_config[i])
        for i in recovery_layers:  # recover 的層
            if i < num_layers:
                add_clc_recover_cfg_if_needed(transformer_quant_config[i])

def load_transformer_quant_config(quant_config_path: str = "./quant_config.json",
                                  norm_quant_config_path: str = "./norm_quant_config.json",
                                  num_layers: int = 1) -> dict:
    with open(quant_config_path, 'r') as f:
        quant_config = json.load(f)
    with open(norm_quant_config_path, 'r') as f:
        norm_quant_config = json.load(f)
    transformer_quant_config = {}
    for i in range(num_layers):
        transformer_quant_config[i] = copy.deepcopy(quant_config)
    transformer_quant_config['norm'] = copy.deepcopy(norm_quant_config)
    return transformer_quant_config

2025-10-31 04:29:35.771627: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-31 04:29:35.802946: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [7]:
from tqdm import tqdm

transformer_quant_config = load_transformer_quant_config(num_layers=args.model_depth)

inject_reduction_and_clc_to_quant_cfg(
    transformer_quant_config,
    num_layers=args.model_depth,
    reduction_loc=args.reduction_loc,
    keep_rates=(args.keep_rate if isinstance(args.keep_rate, list) else [args.keep_rate]),
    use_clc=args.clc,
    clc_recover_at_last=args.clc_recover_at_last
)
print('Before calibration:')
pprint(transformer_quant_config)

qmodel = QTransformerEncoder([QTransformerEncoderLayer(192, 
                                                       3, 
                                                       768, 
                                                       activation='gelu', 
                                                       quant_config=transformer_quant_config[i], 
                                                       calibration=True, 
                                                       device=args.device,
                                                       enable_topk=i in args.reduction_loc,
                                                       enable_evit=enable_evit,
                                                       enable_clc=args.clc,
                                                       num_clr=args.num_clr) for i in range(args.model_depth)], 
                             args.model_depth, 
                             QLayerNorm(192, quant_config=transformer_quant_config['norm'], calibration=True, device=args.device),
                             TorchQuantizer(bitwidth=18, int_bitwidth=5, signed=True, calibration=True),
                             args,
                             dtype=torch.float64)
qmodel.transfer_weights(model4hls)
qmodel.to(args.device)
qmodel.eval()

# 假設你要用多個 batch 做校準
num_calib_batches = 1  # 例如只取前 1 個 batch 校準
args.batch_size = 512
calib_iter = iter(train_loader)

for i in tqdm(range(num_calib_batches), desc="Calibrating"):
    images, target = next(calib_iter)
    images = images.to(args.device)
    # ...前處理...
    x = model.patch_embed(images)
    x = model._pos_embed(x)
    x = model.patch_drop(x)
    x = model.norm_pre(x)
    print(x.min())
    print(x.max())
    # 執行一次校準
    # 檢查calibration data和test data的統計差異
    print("Calibration data stats:")
    print(f"  mean: {x.mean()}, std: {x.std()}")
    transformer_quant_config = calibrate_transformer(qmodel, transformer_quant_config, x.permute(1, 0, 2).type(torch.float64))

# print('After calibration:')
# pprint(transformer_quant_config)
#save transformer_quant_config
# torch.save(transformer_quant_config, './transformer_quant_config_{}_{}_{}_{}_{}.pth'.format(args.input_size, args.finetune.split('/')[-1].replace('.pth', ''), args.dataset_name, num_calib_batches, args.batch_size))
# torch.save(transformer_quant_config, './transformer_quant_config_{}_test.pth'.format(args.input_size))

Final keep rates for reduction layers: [0.7, 0.7, 0.7]
Injecting TopK quant config at layer 3 with keep_rate 0.7
Injecting TopK quant config at layer 6 with keep_rate 0.7
Injecting TopK quant config at layer 9 with keep_rate 0.7
Before calibration:
{0: {'ffn': {'in_proj': {'bias': {'bitwidth': 18, 'int_bitwidth': 8},
                         'input': {'bitwidth': 18,
                                   'int_bitwidth': 8,
                                   'quantize': False},
                         'output': {'bitwidth': 18,
                                    'int_bitwidth': 8,
                                    'quantize': False},
                         'weight': {'bitwidth': 18, 'int_bitwidth': 8}},
             'out_proj': {'bias': {'bitwidth': 18, 'int_bitwidth': 8},
                          'input': {'bitwidth': 18, 'int_bitwidth': 8},
                          'output': {'bitwidth': 18, 'int_bitwidth': 8},
                          'weight': {'bitwidth': 18, 'int_bitwidth': 

Calibrating:   0%|          | 0/1 [00:00<?, ?it/s]

tensor(-11.060524940490723, grad_fn=<MinBackward1>)
tensor(13.810079574584961, grad_fn=<MaxBackward1>)
Calibration data stats:
  mean: 0.013068894855678082, std: 1.2197645902633667
input:  tensor([ 0.043638907372952, -0.071037769317627, -1.415930986404419],
       dtype=torch.float64, requires_grad=True)
tensor([182,  65, 195,   0, 186, 111,  55,  13, 120, 187, 189, 181,  64,  93,
          1, 123,  79,  92, 119, 108, 130, 112, 105,  29, 194, 135, 136,  27,
         41,   5, 167, 102, 106, 121,  97, 137, 188, 126, 122,   2, 133,  83,
        117,  94, 107,  90,  80, 185,  91,  89, 150, 176,  78, 191,  69,   6,
        159, 125, 144,  14, 129, 168, 177,  77, 172, 139,  82, 140, 134, 101,
         84, 143,  63, 190, 183, 154,  51, 192,  12, 132, 109,  34, 162,  95,
        153,  70,  30,  66, 160, 147, 128, 115,   4, 104,  67,   3, 118, 149,
        171,  60, 184,  87, 173,  35, 103,  54,  76, 110, 158, 114, 175,  56,
        148,  42,  53,  58,  28,  48, 142,  26,  47, 193,  96,  46, 15

Calibrating: 100%|██████████| 1/1 [01:21<00:00, 81.11s/it]


In [8]:
#load transformer_quant_config
# transformer_quant_config = torch.load('./transformer_quant_config_{}_{}_{}.pth'.format(args.input_size, args.finetune.split('/')[-1].replace('.pth', ''), args.dataset_name))
# transformer_quant_config = torch.load('./transformer_quant_config_{}_{}_{}_batch32.pth'.format(args.input_size, args.finetune.split('/')[-1].replace('.pth', ''), args.dataset_name))
# transformer_quant_config = torch.load('./transformer_quant_config_{}_{}_{}_{}_{}.pth'.format(args.input_size, args.finetune.split('/')[-1].replace('.pth', ''), args.dataset_name, num_calib_batches, args.batch_size))
# transformer_quant_config = torch.load('./transformer_quant_config_{}_test.pth'.format(args.input_size))
# transformer_quant_config = torch.load('./transformer_quant_config_224_cub_topk_deit_tiny_patch16_224.fb_in1k_0.25_61_cub_10_256.pth')
pprint(transformer_quant_config)

{0: {'ffn': {'in_proj': {'bias': {'bitwidth': 18, 'int_bitwidth': 3},
                         'input': {'bitwidth': 18,
                                   'int_bitwidth': -inf,
                                   'quantize': False},
                         'output': {'bitwidth': 18,
                                    'int_bitwidth': -inf,
                                    'quantize': False},
                         'weight': {'bitwidth': 18, 'int_bitwidth': 1}},
             'out_proj': {'bias': {'bitwidth': 18, 'int_bitwidth': 3},
                          'input': {'bitwidth': 18, 'int_bitwidth': 5},
                          'output': {'bitwidth': 18, 'int_bitwidth': 4},
                          'weight': {'bitwidth': 18, 'int_bitwidth': 0}}},
     'input': {'bitwidth': 18, 'int_bitwidth': 8},
     'norm1': {'bias': {'bitwidth': 18, 'int_bitwidth': 2},
               'input': {'bitwidth': 18, 'int_bitwidth': 5},
               'mean': {'bitwidth': 18, 'int_bitwidth': -1},
    

In [9]:
# inject_reduction_and_clc_to_quant_cfg(
#     transformer_quant_config,
#     num_layers=args.model_depth,
#     reduction_loc=args.reduction_loc,
#     keep_rates=(args.keep_rate if isinstance(args.keep_rate, list) else [args.keep_rate]),
#     use_clc=args.clc,
#     clc_recover_at_last=args.clc_recover_at_last
# )

### 生成`state` for Simulated Annealing(若沒有要透過Simulated Annealing優化，這邊只是作為同步`quant_config`和`hls_config`的方法)並測試sync_quant_config
- `state`包含影響BRAM數目的變數`BRAMstate`以及不影響BRAM數目的變數`DSPstate`(或者說影響DSP數目的變數，但目前並沒有 ***TODO : 將DSP相關變數加入Design Search Space***)
- `num_layers`為Transformer Block的數量
- `weight_bits`主要包含MHSA的兩個linear的weight(或者是Q、K、V的weight以及O的weight)、FFN的兩個linear layer的weight的bit-wdith
- `table_input_bits`和`table_output_bits`包含，MHSA的exponential、倒數查表、LayerNorm的variance查表、FFN的GeLU(CDF)查表。
  - 2的`table_input_bits`次方即為Look-up table的Entry數量，因此這個數值只會設置約12上下(設成32可能會讓軟體本身overflow)
  - `table_output_bits`即為Look-up table的width。由於BRAM的配置18 bits或9 bits的使用效率最高，因此這邊通常只會是這兩個數值或其倍數
- `intermediate_bits`包含MHSA中的QKV cache，由於對Deit-tiny來說，QKV所需緩存很大，因此使用UltraRAM實現，而UltraRAM使用72 bits = 24 bits* 3 heads最有效率，並不將此列入BRAM計算(***TODO : KV cache存至HBM***)
- `result_bits`包含所有layer的output，使用FIFO實現，由於選取適當的Tile size可減小FIFO深度，所以使用LUTRAM實現並不列入BRAM計算(***TODO : Formulize FIFO深度與Tile size的關係以估計LUTRAM數量***)

In [10]:
import importlib
import synchronizer

importlib.reload(synchronizer)
from synchronizer import *

BRAMstate = gen_init_BRAMaware_state(num_layers=args.model_depth, 
                                   weight_bits=8, 
                                   table_input_bits=12, 
                                   table_output_bits=18, 
                                   intermediate_bits=24,
                                   result_bits=18,
                                   include_norm=not args.ifa_head)
# BRAMstate = gen_init_BRAMaware_state(num_layers=args.model_depth, 
#                                    weight_bits=8, 
#                                    table_input_bits=12, 
#                                    table_output_bits=8, 
#                                    intermediate_bits=24,
#                                    result_bits=8,
#                                    include_norm=not args.ifa_head)
# BRAMstate = gen_init_BRAMaware_state(num_layers=args.model_depth, 
#                                      weight_bits=32, 
#                                      table_input_bits=12, 
#                                      table_output_bits=32, 
#                                      intermediate_bits=32,
#                                      result_bits=32,
#                                      include_norm=not args.ifa_head)
DSPstate = gen_init_nonBRAMaware_state(num_layers=args.model_depth, include_norm=not args.ifa_head)
REDstate  = gen_reduction_state(transformer_quant_config, result_bits=18)
# state = {**BRAMstate, **DSPstate}
state = {**BRAMstate, **DSPstate, **REDstate}

model4hls.to(device='cpu')
if not args.ifa_head:
    input_shapes = [[1, int((args.input_size/16)**2+1), 192]]  # [batch_size, num_tokens, embed_dim]
else:
    input_shapes = [[1, int((args.input_size/16)**2+1+args.num_clr), 192]]  # [batch_size, num_tokens, embed_dim]
config = hls4ml.utils.config_from_pytorch_model(model4hls, 
                                              granularity='name',
                                              backend='Vitis',
                                              input_shapes=input_shapes, 
                                              default_precision='ap_fixed<18,5,AP_RND_CONV,AP_SAT>', 
                                              inputs_channel_last=True, 
                                              transpose_outputs=False)
valid = sync_quant_config(transformer_quant_config, config, state)
print(valid)

True


In [11]:
pprint(transformer_quant_config)

{0: {'ffn': {'in_proj': {'bias': {'bitwidth': 18, 'int_bitwidth': 3},
                         'input': {'bitwidth': 18,
                                   'int_bitwidth': -inf,
                                   'quantize': False},
                         'output': {'bitwidth': 24,
                                    'int_bitwidth': -inf,
                                    'quantize': False},
                         'weight': {'bitwidth': 8, 'int_bitwidth': 1}},
             'out_proj': {'bias': {'bitwidth': 18, 'int_bitwidth': 3},
                          'input': {'bitwidth': 24, 'int_bitwidth': 5},
                          'output': {'bitwidth': 18, 'int_bitwidth': 4},
                          'weight': {'bitwidth': 8, 'int_bitwidth': 0}}},
     'input': {'bitwidth': 18, 'int_bitwidth': 8},
     'norm1': {'bias': {'bitwidth': 18, 'int_bitwidth': 2},
               'input': {'bitwidth': 18, 'int_bitwidth': 5},
               'mean': {'bitwidth': 18, 'int_bitwidth': -1},
      

In [12]:
pprint(config)

{'LayerName': {'layers_0_add1': {'Precision': {'result': 'ap_fixed<18,5,AP_RND_CONV>'},
                                 'Trace': False},
               'layers_0_add2': {'Precision': {'result': 'ap_fixed<18,5,AP_RND_CONV>'},
                                 'Trace': False},
               'layers_0_ffn': {'CdfTableRange': 4,
                                'CdfTableSize': 4096,
                                'Precision': {'accum': 'ap_fixed<18,5,AP_RND_CONV,AP_SAT>',
                                              'cdf_table': 'ufixed<18,0,RND_CONV,SAT,0>',
                                              'hidden': 'ap_fixed<24,5,AP_RND_CONV>',
                                              'in_proj_bias': 'ap_fixed<18,3,AP_RND_CONV>',
                                              'in_proj_weight': 'ap_fixed<8,1,AP_RND_CONV>',
                                              'out_proj_bias': 'ap_fixed<18,3,AP_RND_CONV>',
                                              'out_proj_weight': 'ap_fix

In [13]:
# fdgfdfhfd

### 建立quantize model `qmodel` 並載入calibared和sync up後的 `transformer_quant_config`。配置HLS config中的Tile size以最大化BRAM以及硬體使用效率並產生 `hls_model` 和HLS project

In [14]:
# args.keep_rate = [0.25]
# args.reduction_loc = [3, 6, 9]

qmodel = QTransformerEncoder([QTransformerEncoderLayer(embed_dim=192, 
                                                       num_heads=3, 
                                                       hidden_dim=768, 
                                                       activation='gelu', 
                                                       quant_config=transformer_quant_config[i], 
                                                       calibration=False, 
                                                       device='cpu',
                                                       enable_topk=i in args.reduction_loc,
                                                       enable_evit=enable_evit,
                                                       enable_clc=args.clc,
                                                       num_clr=args.num_clr) for i in range(args.model_depth)], 
                             args.model_depth, 
                             QLayerNorm(normalized_shape=192, quant_config=transformer_quant_config['norm'], calibration=False, device='cpu', ifa_head=args.ifa_head),
                             TorchQuantizer(bitwidth=18, int_bitwidth=5, signed=True, calibration=False),
                            #  TorchQuantizer(bitwidth=32, int_bitwidth=5, signed=True, calibration=False),
                             args,
                             dtype=torch.float64)
qmodel.transfer_weights(model4hls)
qmodel.to(torch.device('cpu'))
qmodel.eval()

for layer_config in config['LayerName'].keys():
    if layer_config.endswith('self_attn'):
        config['LayerName'][layer_config]['TilingFactor'] = [1,1,1]
    elif layer_config.endswith('ffn'):
        config['LayerName'][layer_config]['TilingFactor'] = [1,1,12]

if enable_tome:
    method = 'tome'
elif enable_evit:
    method = 'evit'
else:
    method = 'topk'
config['Model']['Reduction'] = {
    'reduction_loc': args.reduction_loc,    # 0-based
    'keep_rate': args.keep_rate,              # or [0.5,0.5,0.5]
    'use_clc': args.clc,
    'method': method
}
# print('before convert:', config['Model'].get('Reduction'))
if not args.ifa_head:
    input_shapes = [[1, int((args.input_size/16)**2+1), 192]]  # [batch_size, num_tokens, embed_dim]
else:
    input_shapes = [[1, int((args.input_size/16)**2+1+args.num_clr), 192]]  # [batch_size, num_tokens, embed_dim]
# print(input_shapes[0])

hls_model = hls4ml.converters.convert_from_pytorch_model(
                                                            model4hls,
                                                            input_shapes,
                                                            # output_dir='./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_test'.format(args.input_size),
                                                            # output_dir='./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}'.format(args.input_size),
                                                            # output_dir='./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_trained'.format(args.input_size, args.dataset_name),
                                                            # output_dir='./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_trained_topk-{}'.format(args.input_size, args.dataset_name, args.keep_rate[0]),
                                                            output_dir='./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_kr-{}'.format(args.input_size, method, args.keep_rate[0]),
                                                            # output_dir='./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_kr-{}_{}_{}'.format(args.input_size, method, args.keep_rate[0], args.keep_rate[1], args.keep_rate[2]),
                                                            # output_dir='./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_kr-{}_clc'.format(args.input_size, method, args.keep_rate[0]),
                                                            project_name='myproject',
                                                            backend='Vitis',
                                                            part='xcu55c-fsvh2892-2L-e',
                                                            #board='alveo-u55c',
                                                            hls_config=config,
                                                            io_type='io_tile_stream',
                                                        )
hls_model.compile()

Enable CLC: False
Number of CLR tokens: 0
Reduction locations: [3, 6, 9]
Recovery layers: []
Keep rates: [0.7, 0.7, 0.7]
Writing HLS project
Done


In [15]:
print(model4hls)

Transformer4HLS(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-11): 12 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
        )
        (linear1): Linear(in_features=192, out_features=768, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (linear2): Linear(in_features=768, out_features=192, bias=True)
        (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0, inplace=False)
        (dropout2): Dropout(p=0, inplace=False)
      )
    )
    (norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
  )
)


In [16]:
from torch.fx import symbolic_trace

traced_model = symbolic_trace(model4hls)

for node in traced_model.graph.nodes:
    print(node)

src
transformer_encoder
output


In [17]:
print('./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_kr-{}'.format(args.input_size, method, args.keep_rate))
# print('./hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_kr-{}_clc'.format(args.input_size, method, args.keep_rate[0]))

./hls/deit_tiny_w8_Bdk-1_Bffn-12_224_evit_kr-[0.7]


In [18]:
# fgshfhshf

### 比較`qmodel` 、 `hls_model`和`model4hls`的輸出。理論上，`qmodel` 和 `hls_model`的輸入要一致

In [19]:
import importlib
import quantizers

# 重新載入 quantizers 模組
importlib.reload(quantizers)
from quantizers import *
model4hls = model4hls.to(torch.device('cuda'))
qmodel = QTransformerEncoder([QTransformerEncoderLayer(embed_dim=192, 
                                                       num_heads=3, 
                                                       hidden_dim=768, 
                                                       activation='gelu', 
                                                       quant_config=transformer_quant_config[i], 
                                                       calibration=False, 
                                                       device='cuda',
                                                       enable_topk=i in args.reduction_loc,
                                                       enable_evit=enable_evit,
                                                       enable_clc=args.clc,
                                                       num_clr=args.num_clr) for i in range(args.model_depth)], 
                             args.model_depth, 
                             QLayerNorm(normalized_shape=192, 
                                        quant_config=transformer_quant_config['norm'], 
                                        calibration=False, 
                                        device='cuda', ifa_head=args.ifa_head),
                             TorchQuantizer(bitwidth=18, int_bitwidth=5, signed=True, calibration=False),
                             dtype=torch.float64,
                             args=args)
qmodel.transfer_weights(model4hls)
qmodel.to(torch.device('cuda'))
qmodel.eval()

Enable CLC: False
Number of CLR tokens: 0
Reduction locations: [3, 6, 9]
Recovery layers: []
Keep rates: [0.7, 0.7, 0.7]


QTransformerEncoder(
  (layers): ModuleList(
    (0-11): 12 x QTransformerEncoderLayer(
      (self_attn): QFlashMultiheadAttention(
        (out_proj): QLinear(
          in_features=192, out_features=192, bias=True
          (weight_qtzr): TorchQuantizer()
          (bias_qtzr): TorchQuantizer()
          (input_qtzr): TorchQuantizer()
          (output_qtzr): TorchQuantizer()
        )
        (in_proj): QLinear(
          in_features=192, out_features=576, bias=True
          (weight_qtzr): TorchQuantizer()
          (bias_qtzr): TorchQuantizer()
          (input_qtzr): TorchQuantizer()
          (output_qtzr): TorchQuantizer()
        )
        (scale_qtzr): TorchQuantizer()
        (row_sum_qtzr): TorchQuantizer()
        (exp_input_qtzr): TorchQuantizer()
        (exp_output_qtzr): TorchQuantizer()
        (inv_input_qtzr): TorchQuantizer()
        (inv_output_qtzr): TorchQuantizer()
        (attn_out_qtzr): TorchQuantizer()
      )
      (linear1): Linear(in_features=192, out_f

In [20]:
if enable_evit:
    method = 'evit'
else:
    method = 'topk'

images, target = next(iter(test_loader))
args.device = 'cuda'
images = images.to(args.device)
target = target.to(args.device)
model = model.to(args.device)
with torch.no_grad():
    x = model.patch_embed(images[0:1])
    x = model._pos_embed(x)
    x = model.patch_drop(x)
    x = model.norm_pre(x)
    # print("Test data stats:")  
    # print(f"  mean: {x.mean()}, std: {x.std()}")
    print('Double check input shape of encoders = {}'.format(x.shape))
    # print(x.shape) 
    print(x[0][0][0:3])
    out = x

    cls_list = [] if model.ifa_head is not False else None
    cross_layer_cache = [] if model.clc else None
    if hasattr(model, 'clr') and not model.clc_pool_clr:
        curr_num_pool = out.shape[1] - model.num_clr
    else:
        curr_num_pool = out.shape[1]
    for i, blk in enumerate(model.blocks):
        # print('Processing block {}'.format(i))
        if enable_evit:
            out, left_token, sample_idx, compl = blk(out) # for evit
        else:
            out, left_token, sample_idx = blk(out) # for topk
        # if sample_idx is not None:
        #     print('Pytorch sample_idx: ', sample_idx[sample_idx != -1])
        # cross layer post reduction aggregation
        if model.clc and (i in model.recovery_layers):
            curr_num_pool = out.shape[1]
            prev_feats = torch.cat(cross_layer_cache, dim=1)
            out = torch.cat([out, prev_feats], dim=1)
            cross_layer_cache = []

        # add relevant tokens act as carriers to cross layer caches
        if model.clc and (i < model.recovery_layers[-1]):
            cross_layer_carriers = []

            if model.clc_include_gap:
                if model.clc_pool_cls:
                    feats_to_pool = out[:, :curr_num_pool]
                else:
                    feats_to_pool = out[:, 1:curr_num_pool]
                    
                gap = reduce(feats_to_pool, 'b s d -> b 1 d', 'mean')
                cross_layer_carriers.append(gap)

            if hasattr(model, 'clr'):
                cross_layer_carriers.append(out[:, -model.num_clr:])

            cross_layer_carriers = torch.cat(cross_layer_carriers, dim=1)
            cross_layer_cache.append(cross_layer_carriers)

        # add cls token / pooled tokens to the lists
        if cls_list is not None and (i in model.reduction_loc or i == len(model.blocks) - 1):
            cls_list.append(out[:, 0])

        # print('Layer 1 output: ', output[0][0][0:3])
            
    if not args.ifa_head:
        out = model.norm(out)
    pytorch_out = out
    print(pytorch_out[0][0][0:3])
    print(pytorch_out.shape)
    print('-----------------------------------------')
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_test/tb_data/tb_input_features.dat'.format(args.input_size), x.cpu().numpy().flatten().reshape(1, -1), fmt="%.12f", delimiter=" ")
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}/tb_data/tb_input_features.dat'.format(args.input_size), x.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_pruning_kr-{}/tb_data/tb_input_features.dat'.format(args.input_size, args.keep_rate[0]), x.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_kr-{}/tb_data/tb_input_features.dat'.format(args.input_size, method, args.keep_rate[0]), x.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")

    q_output = qmodel(x.permute(1, 0, 2).type(torch.float64))
    q_output = q_output.permute(1, 0, 2)
    print(q_output[0][0][0:3])
    print(q_output.shape)

    # encoder_out2 = model4hls(x.permute(1, 0, 2))
    # encoder_out2 = encoder_out2.permute(1, 0, 2)
    # print(encoder_out2)
    # print(encoder_out2.shape)

    hls_output = hls_model.predict(x.cpu().numpy())
    print(hls_output[0:3])
    print(hls_output.shape)

    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_test/tb_data/tb_output_predictions.dat'.format(args.input_size), out.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}/tb_data/tb_output_predictions.dat'.format(args.input_size), encoder_out2.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_{}_kr-{}/tb_data/tb_output_predictions.dat'.format(args.input_size, method, args.keep_rate[0]), pytorch_out.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    # print(hls_output.shape)
    # test_output1 = hls_output - encoder_out2.cpu().flatten().numpy()
    print('Sum of difference between qmodel output and pytorch output = {}'.format(np.abs(q_output.cpu().flatten().numpy() - pytorch_out.cpu().flatten().numpy()).sum()))
    print('Max difference between qmodel output and pytorch output = {}'.format(np.abs(q_output.cpu().flatten().numpy() - pytorch_out.cpu().flatten().numpy()).max()))
    # print('Sum of difference between pytorch output and hls_output = {}'.format((out.cpu().flatten().numpy() - hls_output).sum()))
    print('Sum of difference between qmodel output and hls_output = {}'.format(np.abs((q_output.cpu().flatten().numpy() - hls_output)).sum()))
    # test_output1_sum = np.sum(test_output1)
    # print('Sum of difference between pytorch output and hls_output = {}'.format(test_output1_sum))

Double check input shape of encoders = torch.Size([1, 197, 192])
tensor([ 0.043638907372952, -0.071037769317627, -1.415930986404419],
       device='cuda:0')
tensor([-0.572836697101593, -0.362583488225937, -0.766521215438843],
       device='cuda:0')
torch.Size([1, 72, 192])
-----------------------------------------
input:  tensor([ 0.043579101562500, -0.071044921875000, -1.415893554687500],
       device='cuda:0', dtype=torch.float64)
tensor([103, 150, 188, 195,  17,  49,  13,  45,  46,  48, 182, 136, 187,  86,
         31,  44,  72,   0, 184, 183,  50, 104, 189, 126,  78,  92, 186, 100,
        118,  75,  19,  18, 122, 131,  91,  35, 105,  61,  77,  58,  33,   5,
         34,  32,  60, 117, 185,  98,  63,  62, 140,  89, 133,  47, 171,   6,
         30,  93,  27, 121, 154, 174, 159, 160,  59, 167,   4, 106, 119, 181,
        128, 190, 172, 135,  70,   7, 168, 158,  76, 120, 173, 107,  83, 146,
        157, 170, 145,  64,  56, 114,  20, 111,  12,  74,  55,   2, 108,  11,
        144, 1

In [21]:
# print(np.sum(np.abs(output.cpu().flatten().numpy() - encoder_out2.cpu().flatten().numpy())))

# 轉換為 numpy
output_np = q_output.cpu().numpy().flatten()
# output_np = pytorch_out.cpu().numpy().flatten()
encoder_out2_np = pytorch_out.cpu().numpy().flatten()
# encoder_out2_np = hls_output

# 計算各種相關性指標
print("\n=== 相關性分析 ===")

# 1. Pearson 相關係數
correlation_matrix = np.corrcoef(output_np, encoder_out2_np)
pearson_corr = correlation_matrix[0, 1]
print(f"Pearson 相關係數: {pearson_corr:.6f}")

# 2. 餘弦相似度
cos_sim = np.dot(output_np, encoder_out2_np) / (np.linalg.norm(output_np) * np.linalg.norm(encoder_out2_np))
print(f"餘弦相似度: {cos_sim:.6f}")

# 3. 均方誤差 (MSE)
mse = np.mean((output_np - encoder_out2_np)**2)
print(f"均方誤差 (MSE): {mse:.6e}")

# 4. 平均絕對誤差 (MAE)
mae = np.mean(np.abs(output_np - encoder_out2_np))
print(f"平均絕對誤差 (MAE): {mae:.6e}")

# 5. 最大絕對誤差
max_error = np.max(np.abs(output_np - encoder_out2_np))
print(f"最大絕對誤差: {max_error:.6e}")

# 6. 相對誤差 (如果沒有零值)
if not np.any(encoder_out2_np == 0):
    relative_error = np.mean(np.abs((output_np - encoder_out2_np) / encoder_out2_np))
    print(f"平均相對誤差: {relative_error:.6f}")


=== 相關性分析 ===
Pearson 相關係數: 0.430170
餘弦相似度: 0.430696
均方誤差 (MSE): 4.276929e+00
平均絕對誤差 (MAE): 1.605554e+00
最大絕對誤差: 1.004463e+01
平均相對誤差: 5.196208


In [22]:
gdsdsfdsf

NameError: name 'gdsdsfdsf' is not defined

In [None]:
images, target = next(iter(test_loader))
images = images.cuda()
with torch.no_grad():
    x = model(images[0:1])
    # x = model.patch_embed(images[0:1])
    # x = model._pos_embed(x)
    # x = model.patch_drop(x)
    # x = model.norm_pre(x)

    # if model.viz_mode:
    #         decisions = {}
    #         features = {}

    # cls_list = [] if model.ifa_head is not False else None
    # cross_layer_cache = [] if model.clc else None
    # if hasattr(model, 'clr') and not model.clc_pool_clr:
    #     curr_num_pool = x.shape[1] - model.num_clr
    # else:
    #     curr_num_pool = x.shape[1]

    # print('Double check input shape of encoders = {}'.format(x.shape))
    # print(x.shape)  
    # print(x)     
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_test/tb_data/tb_input_features.dat'.format(args.input_size), x.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")  
    # # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_topk_kr-{}_clca/tb_data/tb_input_features.dat'.format(args.input_size, args.keep_rate[0]), x.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")  
    # for i, blk in enumerate(model.blocks):
    #     # out, left_token, sample_idx, compl = blk(x) # for evit
    #     x, left_token, sample_idx = blk(x) # for topk 
    #     print(i, x.shape)
    # out = model.norm(x)
    # # out = model(images[0:1])
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_test/tb_data/tb_output_predictions.dat'.format(args.input_size), out.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_topk_kr-{}_clca/tb_data/tb_output_predictions.dat'.format(args.input_size, args.keep_rate[0]), out.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    # out = out[:, 0]  # class token
    # out = model.fc_norm(out)
    # out = model.head_drop(out)
    # out = model.head(out)
    # print(out.shape)
    # print(out)
    # csim_out = np.loadtxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_pruning/tb_data/csim_results.log'.format(args.input_size, args.keep_rate[0]))
    # csim_out = torch.from_numpy(csim_out).float()
    # csim_out = csim_out.view(1, 26, 192) # input size 224, keep rate 0.5, final tokens 26
    # csim_out = csim_out[:, 0]  # class token
    # csim_out = model.fc_norm(csim_out)
    # csim_out = model.head_drop(csim_out)
    # csim_out = model.head(csim_out)
    # print(csim_out.shape)
    # print(csim_out)
    # np.savetxt('pytorch_out.txt', out.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    # np.savetxt('csim_out.txt', csim_out.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")

In [None]:
all_inputs = []
all_outputs = []

# images, target = next(iter(test_loader))
for batch_idx, (images, target) in enumerate(test_loader):
    with torch.no_grad():
        for img_idx in range(images.shape[0]):  # 處理batch中的每個圖片
            print('Processing image index: {}'.format(img_idx))
            # 獲取patch embedding的輸出
            x = model.patch_embed(images[img_idx:img_idx+1])
            x = model._pos_embed(x)
            x = model.patch_drop(x)
            x = model.norm_pre(x)

            # print('Double check input shape of encoders = {}'.format(x.shape))
            # print(x.shape)  
            # print(x)  
            all_inputs.append(x.numpy().flatten())   
            for i, blk in enumerate(model.blocks):
                # out, left_token, sample_idx, compl = blk(x) # for evit
                x, left_token, sample_idx = blk(x) # for topk 
            out = model.norm(x)
            # out = model(images[0:1])
            all_outputs.append(out.numpy().flatten())
            # out = out[:, 0]  # class token
            # out = model.fc_norm(out)
            # out = model.head_drop(out)
            # out = model.head(out)
            # print(out.shape)
            # print(out)
            # csim_out = np.loadtxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_topk_kr-{}/tb_data/csim_results.log'.format(args.input_size, args.keep_rate[0]))
            # csim_out = torch.from_numpy(csim_out).float()
            # csim_out = csim_out.view(1, 26, 192) # input size 224, keep rate 0.5, final tokens 26
            # csim_out = csim_out[:, 0]  # class token
            # csim_out = model.fc_norm(csim_out)
            # csim_out = model.head_drop(csim_out)
            # csim_out = model.head(csim_out)
            # print(csim_out.shape)
            # print(csim_out)
            # np.savetxt('pytorch_out.txt', out.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
            # np.savetxt('csim_out.txt', csim_out.cpu().numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")

all_inputs = np.array(all_inputs)
all_outputs = np.array(all_outputs)

np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_topk_kr-{}/tb_data/tb_input_features_all_{}.dat'.format(args.input_size, args.keep_rate[0], args.dataset_name), 
           all_inputs, fmt="%.6f", delimiter=" ")
np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_topk_kr-{}/tb_data/tb_output_predictions_all_{}.dat'.format(args.input_size, args.keep_rate[0], args.dataset_name), 
           all_outputs, fmt="%.6f", delimiter=" ")

# np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}/tb_data/tb_input_features_all_{}.dat'.format(args.input_size, args.dataset_name), 
#            all_inputs, fmt="%.6f", delimiter=" ")
# np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}/tb_data/tb_output_predictions_all_{}.dat'.format(args.input_size, args.dataset_name), 
#            all_outputs, fmt="%.6f", delimiter=" ")

print(f"成功保存 {len(all_inputs)} 個樣本")

In [None]:
import math

all_inputs = []
all_outputs = []
all_predictions = []  # 儲存最終預測結果
all_targets = []      # 儲存真實標籤

correct = 0
total = 0

# images, target = next(iter(test_loader))
for batch_idx, (images, target) in enumerate(test_loader):
    images = images.to(args.device)
    target = target.to(args.device)
    with torch.no_grad():
        for img_idx in range(images.shape[0]):  # 處理batch中的每個圖片
            # print('Processing image index: {}'.format(img_idx))
            current_target = target[img_idx].item()  # 獲取當前圖片的真實標籤
            
            # 獲取patch embedding的輸出
            x = model.patch_embed(images[img_idx:img_idx+1])
            x = model._pos_embed(x)
            x = model.patch_drop(x)
            x = model.norm_pre(x)
            
            ########################################################
            # using pytorch model
            # 儲存輸入特徵
            # all_inputs.append(x.numpy().flatten())   
            cls_list = [] if model.ifa_head is not False else None
            cross_layer_cache = [] if model.clc else None
            if hasattr(model, 'clr') and not model.clc_pool_clr:
                curr_num_pool = x.shape[1] - model.num_clr
            else:
                curr_num_pool = x.shape[1]
            # 通過所有 transformer blocks
            for i, blk in enumerate(model.blocks):
                x, left_token, sample_idx = blk(x) # for topk 

                # cross layer post reduction aggregation
                if model.clc and (i in model.recovery_layers):
                    curr_num_pool = x.shape[1]
                    prev_feats = torch.cat(cross_layer_cache, dim=1)
                    x = torch.cat([x, prev_feats], dim=1)
                    cross_layer_cache = []

                # add relevant tokens act as carriers to cross layer caches
                if model.clc and (i < model.recovery_layers[-1]):
                    cross_layer_carriers = []

                    if model.clc_include_gap:
                        if model.clc_pool_cls:
                            feats_to_pool = x[:, :curr_num_pool]
                        else:
                            feats_to_pool = x[:, 1:curr_num_pool]
                            
                        gap = reduce(feats_to_pool, 'b s d -> b 1 d', 'mean')
                        cross_layer_carriers.append(gap)

                    if hasattr(model, 'clr'):
                        cross_layer_carriers.append(x[:, -model.num_clr:])

                    cross_layer_carriers = torch.cat(cross_layer_carriers, dim=1)
                    cross_layer_cache.append(cross_layer_carriers)

                # add cls token / pooled tokens to the lists
                if cls_list is not None and (i in model.reduction_loc or i == len(model.blocks) - 1):
                    cls_list.append(x[:, 0])
            
            # 正規化
            out = model.norm(x)
            # all_outputs.append(out.numpy().flatten())
            ########################################################
            # using q model
            # out = model4hls(x.permute(1, 0, 2))
            # out = qmodel(x.permute(1, 0, 2).type(torch.float64))
            # out = out.permute(1, 0, 2)
            # all_outputs.append(out)
            ########################################################
            # using hls model
            # out = hls_model.predict(x.numpy())
            # all_outputs.append(out)

            # # 確定最終的 token 數量
            # if args.keep_rate == []:
            #     expected_final_tokens = 197
            # else:
            #     expected_final_tokens = math.ceil(args.keep_rate[0] * math.ceil(args.keep_rate[0] * math.ceil(args.keep_rate[0] * (197 - 1)))) + 1  # 根據您的設定調整
            # expected_features = 192  # 根據模型的特徵數量

            # # 重新整形 HLS 結果
            # out = out.reshape(-1, expected_final_tokens, expected_features)
            # out = torch.from_numpy(out).float()  # 確保是 torch tensor
            ########################################################

            if model.ifa_head:
                # 如果是 IFA head，需要收集不同層的 cls token
                inter_cls = torch.stack(cls_list, dim=-1)
                logits = model.ifa_head(inter_cls)
            else:
                # 標準分類流程
                if model.attn_pool is not None:
                    cls_token = model.attn_pool(out)
                elif model.global_pool == 'avg':
                    cls_token = out[:, model.num_prefix_tokens:].mean(dim=1)
                elif model.global_pool:
                    cls_token = out[:, 0]  # class token
                else:
                    cls_token = out[:, 0]  # 預設使用 class token
                
                # 確保 cls_token 是 float32 類型，與模型權重匹配
                if cls_token.dtype == torch.float64:
                    cls_token = cls_token.float()  # 轉換為 float32
                
                cls_token = model.fc_norm(cls_token)
                cls_token = model.head_drop(cls_token)
                logits = model.head(cls_token)
            
            # 獲取預測結果
            predicted = torch.argmax(logits, dim=1).item()
            
            # 儲存預測和真實標籤
            all_predictions.append(predicted)
            all_targets.append(current_target)
            
            # 計算準確率
            if predicted == current_target:
                correct += 1
            total += 1
            
            # 每處理10張圖片就印一次準確率
            if total % 10 == 0:
                current_accuracy = 100 * correct / total
                print(f'處理了 {total} 張圖片，目前準確率: {current_accuracy:.2f}%')

# 計算最終準確率
final_accuracy = 100 * correct / total
print(f'\n最終結果:')
print(f'總共處理: {total} 張圖片')
print(f'正確預測: {correct} 張')
print(f'準確率: {final_accuracy:.2f}%')

# 儲存結果
all_inputs = np.array(all_inputs)
all_outputs = np.array(all_outputs)
all_predictions = np.array(all_predictions)
all_targets = np.array(all_targets)

In [None]:
import math
from tqdm import tqdm

all_inputs = []
qmodel_all_outputs = []
qmodel_all_predictions = []  # 儲存最終預測結果
all_targets = []      # 儲存真實標籤

correct = 0
total = 0

args.device = 'cuda'
args.batch_size = 512
test_loader = torch.utils.data.DataLoader(
    dataset_val, sampler=sampler_val,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    pin_memory=args.pin_mem,
    drop_last=False
)

model.to(args.device)

# images, targets = next(iter(test_loader))
for batch_idx, (images, targets) in tqdm(enumerate(test_loader), desc="Batch", total=len(test_loader)):
    images = images.to(args.device)
    targets = targets.to(args.device)
    batch_size = images.shape[0]
    if batch_idx == 0:
        print(f'Batch size: {batch_size}, Image shape: {images.shape}')

    with torch.no_grad():
        print('Processing batch index: {}'.format(batch_idx))

        # 獲取patch embedding的輸出
        x = model.patch_embed(images)
        x = model._pos_embed(x)
        x = model.patch_drop(x)
        x = model.norm_pre(x)
        
        ########################################################
        # using pytorch model
        # 儲存輸入特徵
        # all_inputs.append(x.numpy().flatten())   
        
        # 通過所有 transformer blocks
        # for i, blk in enumerate(model.blocks):
        #     x, left_token, sample_idx = blk(x) # for topk 
        
        # # 正規化
        # out = model.norm(x)
        # all_outputs.append(out.numpy().flatten())
        ########################################################
        # using q model
        # out = model4hls(x.permute(1, 0, 2))
        out = qmodel(x.permute(1, 0, 2).type(torch.float64))
        out = out.permute(1, 0, 2)
        qmodel_all_outputs.append(out)
        ########################################################
        # using hls model
        # out = hls_model.predict(x.numpy())
        # all_outputs.append(out)

        # # 確定最終的 token 數量
        # if args.keep_rate == []:
        #     expected_final_tokens = 197
        # else:
        #     expected_final_tokens = math.ceil(args.keep_rate[0] * math.ceil(args.keep_rate[0] * math.ceil(args.keep_rate[0] * (197 - 1)))) + 1  # 根據您的設定調整
        # expected_features = 192  # 根據模型的特徵數量

        # # 重新整形 HLS 結果
        # out = out.reshape(-1, expected_final_tokens, expected_features)
        # out = torch.from_numpy(out).float()  # 確保是 torch tensor
        ########################################################

        # if model.ifa_head is not False:
        #     # IFA head 流程
        #     inter_cls = torch.stack(cls_list, dim=-1)
        #     logits = model.ifa_head(inter_cls)
        # else:
        # 標準分類流程
        # 如果模型沒有 norm，手動添加 LayerNorm
        # temp_norm = nn.LayerNorm(out.shape[-1]).to(out.device).double()
        # out = temp_norm(out)
        
        if model.attn_pool is not None:
            cls_token = model.attn_pool(out)
        elif model.global_pool == 'avg':
            cls_token = out[:, model.num_prefix_tokens:].mean(dim=1)
        elif model.global_pool:
            cls_token = out[:, 0]  # class token
        else:
            cls_token = out[:, 0]  # 預設使用 class token
        
        # 確保 cls_token 是 float32 類型
        if cls_token.dtype == torch.float64:
            cls_token = cls_token.float()
        
        cls_token = model.fc_norm(cls_token)
        cls_token = model.head_drop(cls_token)
        logits = model.head(cls_token)
        
        # 獲取預測結果
        predictions = torch.argmax(logits, dim=1)
        
        # 儲存預測和真實標籤
        qmodel_all_predictions.extend(predictions.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())
        
        # 計算準確率
        correct += (predictions == targets).sum().item()
        total += batch_size
        
        # 每處理10張圖片就印一次準確率
        if total % 10 == 0:
            current_accuracy = 100 * correct / total
            print(f'處理了 {total} 張圖片，目前準確率: {current_accuracy:.2f}%')

# 計算最終準確率
final_accuracy = 100 * correct / total
print(f'\n最終結果:')
print(f'總共處理: {total} 張圖片')
print(f'正確預測: {correct} 張')
print(f'準確率: {final_accuracy:.2f}%')

# 儲存結果
all_inputs = np.array(all_inputs)
all_outputs = np.array(all_outputs)
qmodel_all_predictions = np.array(qmodel_all_predictions)
all_targets = np.array(all_targets)

In [None]:
min_samples = min(len(qmodel_all_predictions), len(all_targets))
# 預測一致性分析
if len(all_predictions) == len(qmodel_all_predictions):
    prediction_match = np.sum(all_predictions[:min_samples] == qmodel_all_predictions)
    prediction_consistency = 100 * prediction_match / min_samples
    print(f"預測一致性: {prediction_consistency:.2f}%")

In [None]:
hdgfhdh

In [None]:
pprint(qmodel)

In [None]:
# 全面測試所有數據，不只是前10個
import torch
import numpy as np

all_predictions = []
all_targets = []
all_logits = []
all_confidence_scores = []

correct = 0
total = 0

print("=== 全面測試所有數據 ===")

for batch_idx, (images, target) in enumerate(test_loader):
    print(f"批次 {batch_idx}: 標籤 {target.tolist()}")
    images = images.to(args.device)
    
    with torch.no_grad():
        for img_idx in range(images.shape[0]):
            current_target = target[img_idx].item()
            
            # 你的特徵提取程式碼
            x = model.patch_embed(images[img_idx:img_idx+1])
            x = model._pos_embed(x)
            x = model.patch_drop(x)
            x = model.norm_pre(x)
            
            for i, blk in enumerate(model.blocks):
                x, left_token, sample_idx = blk(x)
            
            out = model.norm(x)
            cls_token = out[:, 0]
            
            if hasattr(model, 'fc_norm'):
                cls_token = model.fc_norm(cls_token)
            if hasattr(model, 'head_drop'):
                cls_token = model.head_drop(cls_token)
            if hasattr(model, 'head'):
                logits = model.head(cls_token)
            
            # 獲取預測和信心分數
            probabilities = torch.softmax(logits, dim=1)
            predicted = torch.argmax(logits, dim=1).item()
            confidence = probabilities[0, predicted].item()
            max_logit = logits[0, predicted].item()
            
            # 儲存結果
            all_predictions.append(predicted)
            all_targets.append(current_target)
            all_logits.append(logits.cpu().numpy().flatten())
            all_confidence_scores.append(confidence)
            
            # 檢查是否正確
            is_correct = predicted == current_target
            if is_correct:
                correct += 1
            else:
                print(f"*** 錯誤預測 ***")
                print(f"圖片 {total}: 真實={current_target}, 預測={predicted}")
                print(f"Logits: {logits.flatten()[:10]}...")
                print(f"信心分數: {confidence:.4f}")
            
            total += 1

print(f"\n=== 最終結果 ===")
print(f"總共處理: {total} 張圖片")
print(f"正確預測: {correct} 張")
print(f"準確率: {100 * correct / total:.2f}%")

# 轉換為 numpy 陣列
all_predictions = np.array(all_predictions)
all_targets = np.array(all_targets)
all_confidence_scores = np.array(all_confidence_scores)

print(f"\n=== 詳細分析 ===")
print(f"預測標籤範圍: {np.min(all_predictions)} ~ {np.max(all_predictions)}")
print(f"真實標籤範圍: {np.min(all_targets)} ~ {np.max(all_targets)}")
print(f"信心分數範圍: {np.min(all_confidence_scores):.4f} ~ {np.max(all_confidence_scores):.4f}")
print(f"平均信心分數: {np.mean(all_confidence_scores):.4f}")

# 檢查每個類別的準確率
print(f"\n=== 每個類別的準確率 ===")
unique_classes = np.unique(all_targets)
for cls in unique_classes:
    cls_mask = all_targets == cls
    cls_correct = np.sum((all_predictions == all_targets) & cls_mask)
    cls_total = np.sum(cls_mask)
    cls_accuracy = 100 * cls_correct / cls_total if cls_total > 0 else 0
    
    # 該類別的信心分數
    cls_confidence = np.mean(all_confidence_scores[cls_mask])
    
    print(f"類別 {cls:2d}: {cls_correct}/{cls_total} = {cls_accuracy:6.2f}%, 平均信心: {cls_confidence:.4f}")

# 檢查混淆情況
print(f"\n=== 混淆分析 ===")
wrong_predictions = all_predictions != all_targets
if np.any(wrong_predictions):
    print("錯誤預測的詳細情況:")
    for i, (pred, true) in enumerate(zip(all_predictions[wrong_predictions], all_targets[wrong_predictions])):
        confidence = all_confidence_scores[wrong_predictions][i]
        print(f"  樣本: 真實={true}, 預測={pred}, 信心={confidence:.4f}")
else:
    print("沒有錯誤預測！")

# 檢查信心分數最低的預測
print(f"\n=== 信心分數最低的10個預測 ===")
low_confidence_indices = np.argsort(all_confidence_scores)[:10]
for i, idx in enumerate(low_confidence_indices):
    pred = all_predictions[idx]
    true = all_targets[idx]
    conf = all_confidence_scores[idx]
    correct_mark = "✓" if pred == true else "✗"
    print(f"{i+1:2d}. 樣本{idx:2d}: 真實={true:2d}, 預測={pred:2d}, 信心={conf:.4f} {correct_mark}")

# 最後確認：使用完整模型測試
print(f"\n=== 完整模型驗證 ===")
model.eval()
correct_full = 0
total_full = 0

with torch.no_grad():
    for images, target in test_loader:
        images = images.to(args.device)
        target = target.to(args.device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total_full += target.size(0)
        correct_full += (predicted == target).sum().item()

print(f"完整模型準確率: {100 * correct_full / total_full:.2f}%")
print(f"手動實現準確率: {100 * correct / total:.2f}%")
print(f"結果一致: {correct_full == correct and total_full == total}")

In [None]:
# 讀取 HLS C simulation 的結果
try:
    # csim_results = np.loadtxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_topk_kr-{}/tb_data/csim_results_all_{}.log'.format(args.input_size, args.keep_rate[0], args.dataset_name))
    csim_results = np.loadtxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}/tb_data/csim_results_all_{}.log'.format(args.input_size, args.dataset_name))
    print(f"\nHLS結果形狀: {csim_results.shape}")
    
    # 確定最終的 token 數量
    expected_final_tokens = 197
    # expected_final_tokens = math.ceil(args.keep_rate[0] * math.ceil(args.keep_rate[0] * math.ceil(args.keep_rate[0] * (197 - 1)))) + 1  # 根據您的設定調整
    expected_features = 192     # DeiT-tiny 的特徵維度
    
    # 重新整形 HLS 結果
    num_samples = csim_results.shape[0]
    csim_results = csim_results.reshape(num_samples, expected_final_tokens, expected_features)
    print(f"重新整形後的 HLS 結果: {csim_results.shape}")
    
    # 轉換為 PyTorch tensor
    csim_tensor = torch.from_numpy(csim_results).float()
    
    # 提取 class token (第一個 token)
    cls_tokens = csim_tensor[:, 0, :]  # shape: (num_samples, 192)
    print(f"Class tokens 形狀: {cls_tokens.shape}")
    
    # 通過分類頭計算 HLS 預測
    hls_predictions = []
    
    with torch.no_grad():
        for i in range(cls_tokens.shape[0]):
            cls_token = cls_tokens[i:i+1]  # shape: (1, 192)
            
            # 根據模型架構選擇正確的分類頭
            if hasattr(model, 'fc_norm'):
                cls_token = model.fc_norm(cls_token)
            
            if hasattr(model, 'head_drop'):
                cls_token = model.head_drop(cls_token)
            
            # 分類頭
            if hasattr(model, 'head'):
                logits = model.head(cls_token)
            elif hasattr(model, 'fc'):
                logits = model.fc(cls_token)
            else:
                logits = cls_token
            
            predicted = torch.argmax(logits, dim=1).item()
            hls_predictions.append(predicted)
    
    hls_predictions = np.array(hls_predictions)
    
    # 確保數量匹配
    min_samples = min(len(hls_predictions), len(all_targets))
    hls_predictions = hls_predictions[:min_samples]
    targets_for_hls = all_targets[:min_samples]
    
    # 計算 HLS 準確率
    hls_correct = np.sum(hls_predictions == targets_for_hls)
    hls_total = len(targets_for_hls)
    hls_accuracy = 100 * hls_correct / hls_total
    
    print(f"\n=== HLS C Simulation 準確率結果 ===")
    print(f"總共處理: {hls_total} 個樣本")
    print(f"正確預測: {hls_correct} 個")
    print(f"HLS準確率: {hls_accuracy:.2f}%")
    
    print(f"\n=== PyTorch vs HLS 比較 ===")
    print(f"PyTorch 準確率: {final_accuracy:.2f}%")
    print(f"HLS 準確率: {hls_accuracy:.2f}%")
    print(f"準確率差異: {abs(final_accuracy - hls_accuracy):.2f}%")
    
    # 預測一致性分析
    if len(all_predictions) == len(hls_predictions):
        prediction_match = np.sum(all_predictions[:min_samples] == hls_predictions)
        prediction_consistency = 100 * prediction_match / min_samples
        print(f"預測一致性: {prediction_consistency:.2f}%")
    
    # 儲存 HLS 預測結果
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_topk_kr-{}/tb_data/hls_predictions.dat'.format(args.input_size, args.keep_rate[0]), 
    #            hls_predictions, fmt="%d", delimiter=" ")
    
    # 儲存準確率報告
    # with open('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_topk_kr-{}/tb_data/hls_accuracy_report.txt'.format(args.input_size, args.keep_rate[0]), 'w') as f:
    #     f.write(f'HLS C Simulation 準確率報告\n')
    #     f.write(f'========================\n')
    #     f.write(f'PyTorch 準確率: {final_accuracy:.4f}%\n')
    #     f.write(f'HLS 準確率: {hls_accuracy:.4f}%\n')
    #     f.write(f'準確率差異: {abs(final_accuracy - hls_accuracy):.4f}%\n')
    #     f.write(f'預測一致性: {prediction_consistency:.4f}%\n')
    
    # print(f"\nHLS 準確率報告已保存")
    
except FileNotFoundError:
    print("找不到 csim_results_all.log 檔案，請先執行 HLS C simulation")

In [None]:
images, target = next(iter(test_loader))
args.device = 'cuda'
images = images.to(args.device)
target = target.to(args.device)
model = model.to(args.device)
with torch.no_grad():
    x = model.patch_embed(images[0:1])
    x = model._pos_embed(x)
    x = model.patch_drop(x)
    x = model.norm_pre(x)
    print('Double check input shape of encoders = {}'.format(x.shape))
    print(x.shape) 
    # print(x)
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}/tb_data/tb_input_features.dat'.format(args.input_size), x.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_pruning_kr-{}/tb_data/tb_input_features.dat'.format(args.input_size, args.keep_rate[0]), x.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    output = qmodel(x.permute(1, 0, 2).type(torch.float64))
    output = output.permute(1, 0, 2)
    encoder_out2 = model4hls(x.permute(1, 0, 2))
    encoder_out2 = encoder_out2.permute(1, 0, 2)
    # hls_output = hls_model.predict(x.numpy())
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}/tb_data/tb_output_predictions.dat'.format(args.input_size), encoder_out2.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    # np.savetxt('hls/deit_tiny_w8_Bdk-1_Bffn-12_{}_pruning_kr-{}/tb_data/tb_output_predictions.dat'.format(args.input_size, args.keep_rate[0]), encoder_out2.numpy().flatten().reshape(1, -1), fmt="%.6f", delimiter=" ")
    print(output)
    print(encoder_out2)
    # print(hls_output)
    # test_output1 = hls_output - encoder_out2.flatten().numpy()
    # print('Sum of difference between qmodel output and hls_output = {}'.format((output.flatten().numpy() - hls_output).sum()))
    # test_output1_sum = np.sum(test_output1)
    # print('Sum of difference between pytorch output and hls_output = {}'.format(test_output1_sum))

In [None]:
output = output.to(torch.device('cpu'))
encoder_out2 = encoder_out2.to(torch.device('cpu'))

In [None]:
print(np.sum(np.abs(output.flatten().numpy() - encoder_out2.flatten().numpy())))

# 轉換為 numpy
output_np = output.numpy().flatten()
encoder_out2_np = encoder_out2.numpy().flatten()

# 計算各種相關性指標
print("\n=== 相關性分析 ===")

# 1. Pearson 相關係數
correlation_matrix = np.corrcoef(output_np, encoder_out2_np)
pearson_corr = correlation_matrix[0, 1]
print(f"Pearson 相關係數: {pearson_corr:.6f}")

# 2. 餘弦相似度
cos_sim = np.dot(output_np, encoder_out2_np) / (np.linalg.norm(output_np) * np.linalg.norm(encoder_out2_np))
print(f"餘弦相似度: {cos_sim:.6f}")

# 3. 均方誤差 (MSE)
mse = np.mean((output_np - encoder_out2_np)**2)
print(f"均方誤差 (MSE): {mse:.6e}")

# 4. 平均絕對誤差 (MAE)
mae = np.mean(np.abs(output_np - encoder_out2_np))
print(f"平均絕對誤差 (MAE): {mae:.6e}")

# 5. 最大絕對誤差
max_error = np.max(np.abs(output_np - encoder_out2_np))
print(f"最大絕對誤差: {max_error:.6e}")

# 6. 相對誤差 (如果沒有零值)
if not np.any(encoder_out2_np == 0):
    relative_error = np.mean(np.abs((output_np - encoder_out2_np) / encoder_out2_np))
    print(f"平均相對誤差: {relative_error:.6f}")

In [None]:
print(encoder_out2.shape)

### 預測`hls_model`以及產生的HLS project的BRAM數目，這會`state`的配置與tile size的大小有關

In [None]:
from estimators import *
def layer_estimater(quant_config):
    bram_dict = {}  
    for layer_name in quant_config.keys():
        bram_dict[layer_name] = {}
        for var_name in quant_config[layer_name].keys():
            #pprint(quant_config[layer_name][var_name])
            bram_dict[layer_name][var_name] = VivadoVariableBRAMEstimator(name=var_name,**quant_config[layer_name][var_name])

    num_ram = 0
    for layer_name in bram_dict.keys():
        for var_name in bram_dict[layer_name].keys():
            ram_est = bram_dict[layer_name][var_name]
            num_ram += ram_est.get_num_ram()
    return num_ram
num_ram = layer_estimater(parse_hls_model(hls_model))
print(num_ram)