# Experiment 1: Per-Layer Sensitivity

Quantize only one block at a time while keeping all other blocks in FP32.

In [1]:
import torch
import yaml
from src.utils import *
from src.data_loader import get_data_loaders
from src.models.cnn_model_LayerWiseQuant import M5Modular, PTQM5Modular, PTQM5_LayerWiseQuant
from src.evaluate import test
# Load every checkpoint directly onto the CPU. All models in this notebook are
# evaluated on the CPU, and map_location keeps the cell runnable on a CPU-only
# machine even if a checkpoint was saved from a GPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
fp_dict = torch.load("../models/cnn_fp32_model.pth", map_location="cpu")
ptq_dict = torch.load("../models/cnn_ptq_model.pth", map_location="cpu")

# State dicts of the layer-wise quantized models, keyed by the integer used in
# the checkpoint file name (q1 ... q4).
LWQ_dict_dicts = {
    block_idx: torch.load(
        f"../models/cnn_ptq_LayerWiseQuant_q{block_idx}_model.pth",
        map_location="cpu",
    )
    for block_idx in (1, 2, 3, 4)
}

# Settings handed to get_data_loaders.
# NOTE(review): "processed_dir" is relative to the current directory ("./") while
# "raw_dir" points one level up ("../") — confirm that this difference is intended.
data_config = {
    "raw_dir": "../data/raw",
    "processed_dir": "./data/processed",
    "sample_rate": 8000,
    "batch_size": 256,
    "version": "v0.1"
}
train_loader, test_loader, _ = get_data_loaders(data_config)

  device=storage.device,


In [2]:
# Build the FP32 baseline model from its YAML config and restore its weights.
config_fp = '../configs/cnn_fp32.yaml'
with open(config_fp, 'r') as f:
    config = yaml.safe_load(f)

params_fp = config["model"]["base_cnn"]
model_fp = M5Modular(
        n_input=params_fp["n_input"],
        n_output=params_fp["n_output"],
        stride=params_fp["stride"],
        n_channel=params_fp["n_channel"],
        conv_kernel_sizes=params_fp["conv_kernel_sizes"]
        )
model_fp.load_state_dict(fp_dict)
model_fp.to('cpu')
# A freshly constructed nn.Module starts in training mode. Switch to inference
# mode explicitly so that any mode-dependent layers (e.g. BatchNorm, Dropout)
# behave deterministically during evaluation, as the quantized cells also do.
model_fp.eval()

# Evaluate the FP32 baseline on the test split.
acc_fp = test(model_fp, test_loader)
print(f"FP32 model accuracy: {acc_fp:.4f}")

FP32 model accuracy: 83.0713


In [3]:
# Fully quantized reference model: read its config, rebuild the converted
# (quantized) module structure, then load the saved checkpoint into it.
config_PTQ = '../configs/cnn_ptq.yaml'
with open(config_PTQ, 'r') as config_file:
    config = yaml.safe_load(config_file)

params_PTQ = config["model"]["base_cnn"]
ptq_arg_names = ("n_input", "n_output", "stride", "n_channel", "conv_kernel_sizes")
model_PTQ = PTQM5Modular(**{arg: params_PTQ[arg] for arg in ptq_arg_names})

# Step 1: fuse modules while in eval mode and attach the quantization config.
# NOTE(review): the model is labelled PTQ but is prepared with the QAT qconfig
# and prepare_qat — confirm this matches how the checkpoint was produced.
model_PTQ.eval()
model_PTQ.fuse_model()
model_PTQ.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')

# Step 2: prepare_qat is applied in place while the model is in train mode.
model_PTQ.train()
torch.ao.quantization.prepare_qat(model_PTQ, inplace=True)

# Step 3: convert in eval mode; with inplace=False the converted model is the return value.
model_PTQ.eval()
model_PTQ = torch.ao.quantization.convert(model_PTQ, inplace=False)

# Step 4: the checkpoint is loaded only after conversion, i.e. into the converted model.
model_PTQ.load_state_dict(ptq_dict)
model_PTQ.to('cpu')

# Step 5: measure accuracy on the test split.
acc_PTQ = test(model_PTQ, test_loader)
print(f"PTQ model accuracy: {acc_PTQ:.4f}")



PTQ model accuracy: 75.8473


In [4]:

# Sensitivity sweep: for every entry listed under model.quantization in the
# config, build a model with that single block selected for quantization and
# evaluate it on the test split.
# NOTE: the recorded run of this cell ends with a NotImplementedError for
# 'quantized::conv1d_relu' on the 'CPU' backend (see the output below).
config_LWQ = '../configs/cnn_ptq_LayerWiseQuant.yaml'
with open(config_LWQ, 'r') as config_file:
    config = yaml.safe_load(config_file)

model_section = config["model"]
lwq_arg_names = ("n_input", "n_output", "stride", "n_channel", "conv_kernel_sizes")

for i in model_section["quantization"]:
    base_cnn_params = model_section["base_cnn"]
    model_LWQ = PTQM5_LayerWiseQuant(
        quantized_block_idx=i,
        **{arg: base_cnn_params[arg] for arg in lwq_arg_names},
    )

    # Fuse in eval mode, then attach the quantization config.
    model_LWQ.eval()
    model_LWQ.fuse_model()
    model_LWQ.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')

    # NOTE(review): this cell calls prepare(), whereas the other quantized
    # cells in this notebook call prepare_qat() — confirm which one is intended.
    model_LWQ.train()
    torch.ao.quantization.prepare(model_LWQ, inplace=True)

    # Convert in eval mode; inplace=False returns the converted model.
    model_LWQ.eval()
    model_LWQ = torch.ao.quantization.convert(model_LWQ, inplace=False)

    # Restore the checkpoint stored under the same key as the current entry.
    model_LWQ.load_state_dict(LWQ_dict_dicts[i])
    model_LWQ.to('cpu')

    # Accuracy of the model with only this block quantized.
    acc_LWQ = test(model_LWQ, test_loader)
    print(f"Layer-Wise Quantized Model (Layer {i} quantized) accuracy: {acc_LWQ:.4f}")


NotImplementedError: Could not run 'quantized::conv1d_relu' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::conv1d_relu' is only available for these backends: [Meta, QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMTIA, AutogradMeta, Tracer, AutocastCPU, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

Meta: registered at /pytorch/aten/src/ATen/core/MetaFallbackKernel.cpp:23 [backend fallback]
QuantizedCPU: registered at /pytorch/aten/src/ATen/native/quantized/cpu/qconv.cpp:2045 [kernel]
QuantizedCUDA: registered at /pytorch/aten/src/ATen/native/quantized/cudnn/Conv.cpp:391 [kernel]
BackendSelect: fallthrough registered at /pytorch/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:194 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:503 [backend fallback]
Functionalize: registered at /pytorch/aten/src/ATen/FunctionalizeFallbackKernel.cpp:349 [backend fallback]
Named: registered at /pytorch/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at /pytorch/aten/src/ATen/ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at /pytorch/aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at /pytorch/aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:100 [backend fallback]
AutogradOther: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:63 [backend fallback]
AutogradCPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:67 [backend fallback]
AutogradCUDA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:75 [backend fallback]
AutogradXLA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:83 [backend fallback]
AutogradMPS: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:91 [backend fallback]
AutogradXPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:71 [backend fallback]
AutogradHPU: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:104 [backend fallback]
AutogradLazy: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:87 [backend fallback]
AutogradMTIA: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:79 [backend fallback]
AutogradMeta: registered at /pytorch/aten/src/ATen/core/VariableFallbackKernel.cpp:95 [backend fallback]
Tracer: registered at /pytorch/torch/csrc/autograd/TraceTypeManual.cpp:294 [backend fallback]
AutocastCPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:322 [backend fallback]
AutocastXPU: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:465 [backend fallback]
AutocastMPS: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at /pytorch/aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at /pytorch/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at /pytorch/aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at /pytorch/aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at /pytorch/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at /pytorch/aten/src/ATen/functorch/TensorWrapper.cpp:207 [backend fallback]
PythonTLSSnapshot: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:202 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at /pytorch/aten/src/ATen/functorch/DynamicLayer.cpp:499 [backend fallback]
PreDispatch: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:206 [backend fallback]
PythonDispatcher: registered at /pytorch/aten/src/ATen/core/PythonFallbackKernel.cpp:198 [backend fallback]


In [None]:
# Single-block variant of the layer-wise experiment: quantize exactly one block
# and evaluate it, without relying on any variable left over from another cell.
config_LWQ = '../configs/cnn_ptq_LayerWiseQuant.yaml'
with open(config_LWQ, 'r') as f:
    config = yaml.safe_load(f)

# Index of the block to quantize. The same value selects the checkpoint and
# labels the printed result, so the three can never disagree (previously the
# checkpoint and label used a loop variable leaked from an earlier cell).
block_idx = 1

model_LWQ = PTQM5_LayerWiseQuant(
    quantized_block_idx = block_idx,
    n_input=config["model"]["base_cnn"]["n_input"],
    n_output=config["model"]["base_cnn"]["n_output"],
    stride=config["model"]["base_cnn"]["stride"],
    n_channel=config["model"]["base_cnn"]["n_channel"],
    conv_kernel_sizes=config["model"]["base_cnn"]["conv_kernel_sizes"],
)

# Fuse in eval mode, then attach the quantization config.
model_LWQ.eval()
model_LWQ.fuse_model()
model_LWQ.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')

# prepare_qat is applied in place while the model is in train mode.
model_LWQ.train()
torch.ao.quantization.prepare_qat(model_LWQ, inplace=True)

# Convert to quantized model (inplace=False returns the converted model).
model_LWQ.eval()
model_LWQ = torch.ao.quantization.convert(model_LWQ, inplace=False)

# Load the checkpoint that belongs to the block quantized above.
model_LWQ.load_state_dict(LWQ_dict_dicts[block_idx])
model_LWQ.to('cpu')

# Evaluate the single-block quantized model on the test split.
acc_LWQ = test(model_LWQ, test_loader)
print(f"Layer-Wise Quantized Model (Layer {block_idx} quantized) accuracy: {acc_LWQ:.4f}")


The layer-wise quantization code above fails (see the `NotImplementedError` in the output) for the following reasons:

- Only one layer (block1) is quantized, while the other layers are still in ```nn.Sequential``` with float ops.

- The fusion step (```fuse_model```) fused the layers as Conv + ReLU, so ```convert()``` tries to use ```Conv1dReLU``` as a fused quantized op.

- However, the input to that block is still float, since earlier layers are not quantized — and ```quantized::conv1d_relu``` expects quantized input tensors.

- *PyTorch’s eager mode quantization does not support partial layer quantization well* when fused operators like ```Conv1dReLU``` are involved.