In [None]:
#| default_exp quantize.quantizer

In [None]:
#| export
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.ao.quantization import get_default_qconfig_mapping
import torch.ao.quantization.quantize_fx as quantize_fx
from torch.ao.quantization.quantize_fx import convert_fx, prepare_fx

In [None]:
#| include: false
from nbdev.showdoc import *
from fastai.vision.all import *
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the default qconfig mapping for the "x86" backend.
# NOTE(review): `config` is not referenced again in the visible cells.
config = get_default_qconfig_mapping("x86")

In [None]:
from torch.ao.quantization.observer import *
from torch.ao.quantization.qconfig_mapping import *
from torch.ao.quantization.qconfig import *

In [None]:
# Hand-built QConfig: both the activation and the weight observer are
# MinMaxObserver factories configured with dtype=torch.qint8.
qconfig = torch.ao.quantization.QConfig(
            activation=MinMaxObserver.with_args(dtype=torch.qint8),
            weight=MinMaxObserver.with_args(dtype=torch.qint8))

In [None]:
qconfig

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8){})

In [None]:
# Start from the default "x86" mapping and override its global qconfig with a
# hand-built MinMaxObserver configuration.
qconfig_mapping = get_default_qconfig_mapping("x86")

# NOTE(review): activation=qint8 / weight=quint8 is the reverse of the
# quint8-activation / qint8-weight pairing used by the later cells — confirm
# this combination is intentional.
qconfig = torch.ao.quantization.QConfig(
            activation=MinMaxObserver.with_args(dtype=torch.qint8),
            weight=MinMaxObserver.with_args(dtype=torch.quint8))

qconfig_mapping.set_global(qconfig)

# NOTE(review): `dls` and `model` are only assigned in later cells, so this
# cell relies on out-of-order execution.
x, _ = dls.valid.one_batch()
# prepare_fx expects `example_inputs` as a tuple of positional arguments.
model_prepared = prepare_fx(model.eval(), qconfig_mapping, (x,))

convert_fx(model_prepared)

In [None]:
# Instantiate a pretrained ResNet-34 through timm.
# NOTE(review): the first visible `import timm` is in a later cell; unless one
# of the star imports above provides `timm`, a fresh top-to-bottom run fails here.
model = timm.create_model('resnet34', pretrained=True)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.ao.quantization import get_default_qconfig_mapping
import torch.ao.quantization.quantize_fx as quantize_fx
from torch.ao.quantization.quantize_fx import convert_fx, prepare_fx

# %% ../../nbs/06_quantize.quantizer.ipynb 4
class Quantizer():
    """Post-training static quantization using FX graph mode.

    The model is prepared with the default qconfig mapping of `backend`,
    calibrated on the validation split of the given data loaders and then
    converted.
    """

    def __init__(self, backend="x86"):
        # Despite its name, this attribute holds the qconfig *mapping* returned
        # by `get_default_qconfig_mapping`; the name is kept for compatibility.
        self.qconfig = get_default_qconfig_mapping(backend)

    def quantize(self, model, calibration_dl):
        """Return a quantized version of `model`.

        Args:
            model: module to quantize; it is switched to eval mode.
            calibration_dl: object exposing `.valid`, which must provide
                `one_batch()` and iterate over `(inputs, targets)` pairs.

        Returns:
            The module produced by `convert_fx` after calibration.
        """
        x, _ = calibration_dl.valid.one_batch()
        # `example_inputs` is documented as a tuple of positional arguments.
        model_prepared = prepare_fx(model.eval(), self.qconfig, (x.to('cpu'),))

        # Calibration only needs forward passes: disable autograd and do not
        # keep the outputs, so no graph or activations pile up in memory.
        with torch.no_grad():
            for xb, _ in calibration_dl.valid:
                model_prepared(xb.to('cpu'))

        return convert_fx(model_prepared)

In [None]:
# Fetch the PETS dataset through untar_data and list the image files found
# under its "images" sub-folder.
path = untar_data(URLs.PETS)
files = get_image_files(path/"images")

def label_func(f):
    """Return True when the first character of `f` is an uppercase letter.

    `f` is presumably the image file name handed over by the data loaders —
    confirm against the caller.
    """
    first_char = f[0]
    return first_char.isupper()

# Build the image DataLoaders: labels come from label_func, every item is
# resized with Resize(64).
dls = ImageDataLoaders.from_name_func(path, files, label_func, item_tfms=Resize(64))

In [None]:
import timm
# Load a pretrained ResNet-34 from timm and quantize it, calibrating on `dls`.
pretrained_resnet_34 = timm.create_model('resnet34', pretrained=True)
# NOTE(review): the Quantizer defined before this cell only accepts `backend`;
# the `granularity` keyword matches the Quantizer defined further down, so this
# cell presumably ran after that later definition — confirm execution order.
qt = Quantizer(granularity='tensor')

q_model = qt.quantize(pretrained_resnet_34, dls); q_model

GraphModule(
  (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(7, 7), stride=(2, 2), scale=0.03916042298078537, zero_point=0, padding=(3, 3))
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Module(
    (0): Module(
      (conv1): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.034414783120155334, zero_point=135, padding=(1, 1))
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.10346762835979462, zero_point=99, padding=(1, 1))
    )
    (1): Module(
      (conv1): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.07803963869810104, zero_point=150, padding=(1, 1))
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.1477663815021515, zero_point=154, padding=(1, 1))
    

In [None]:
# Prepare `model` with the `qconfig_mapping` currently in scope, using one
# validation batch as the example input.
# NOTE(review): no calibration pass is run before the convert_fx call in the
# next cell.
x, _ = dls.valid.one_batch()
model_prepared = prepare_fx(model.eval(), qconfig_mapping, x)

In [None]:
# Convert the prepared model and display it.
# NOTE(review): the saved output lists plain Conv2d modules rather than
# Quantized* ones — presumably the qconfig in effect was not applied; confirm.
convert_fx(model_prepared)

GraphModule(
  (conv1): ConvReLU2d(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): ReLU(inplace=True)
  )
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Module(
    (0): Module(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (act2): ReLU(inplace=True)
    )
    (1): Module(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (act2): ReLU(inplace=True)
    )
    (2): Module(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (drop_block): Identity()
      (act1

In [None]:
import torch.quantization as quant

In [None]:
class Quantizer:
    """Post-training static quantization (FX graph mode) with custom observers.

    Args:
        activation_observer: observer class used for activations.
        weight_observer: observer class used for weights.
        activation_qtype: dtype passed to the activation observer.
        weight_qtype: dtype passed to the weight observer.
        granularity: 'tensor' or 'channel'. With 'channel' the weight observer
            is additionally given `qscheme=torch.per_channel_symmetric`.
    """

    def __init__(self, activation_observer=MinMaxObserver, 
                 weight_observer=PerChannelMinMaxObserver,
                 activation_qtype=torch.quint8, weight_qtype=torch.qint8,
                 granularity='channel'):
        self.activation_observer = activation_observer
        self.weight_observer = weight_observer
        self.activation_qtype = activation_qtype
        self.weight_qtype = weight_qtype
        self.granularity = granularity

    def prepare_qconfig(self):
        """Build the QConfig described by the constructor arguments.

        Raises:
            ValueError: if `granularity` is neither 'tensor' nor 'channel'.
        """
        # NOTE(review): with granularity='tensor' the weight observer keeps its
        # own default qscheme; combined with the default
        # PerChannelMinMaxObserver this is presumably still per-channel — confirm.
        weight_kwargs = {'dtype': self.weight_qtype}
        if self.granularity == 'channel':
            weight_kwargs['qscheme'] = torch.per_channel_symmetric
        elif self.granularity != 'tensor':
            raise ValueError("Granularity must be 'tensor' or 'channel'")

        return QConfig(
            activation=self.activation_observer.with_args(dtype=self.activation_qtype),
            weight=self.weight_observer.with_args(**weight_kwargs),
        )

    def quantize(self, model, calibration_dl):
        """Return a quantized version of `model`.

        Args:
            model: module to quantize; it is switched to eval mode.
            calibration_dl: object exposing `.valid`, which must provide
                `one_batch()` and iterate over `(inputs, targets)` pairs.

        Returns:
            The module produced by `convert_fx` after calibration.
        """
        qconfig_mapping = QConfigMapping().set_global(self.prepare_qconfig())

        x, _ = calibration_dl.valid.one_batch()
        # `example_inputs` is documented as a tuple of positional arguments.
        model_prepared = quantize_fx.prepare_fx(model.eval(), qconfig_mapping, (x.to('cpu'),))

        # Calibration only needs forward passes: disable autograd and do not
        # keep the outputs, so no graph or activations pile up in memory.
        with torch.no_grad():
            for xb, _ in calibration_dl.valid:
                model_prepared(xb.to('cpu'))

        return quantize_fx.convert_fx(model_prepared)

In [None]:
def prepare_qconfig():
    """Return a QConfig pairing a histogram activation observer with a per-channel weight observer.

    Activations: HistogramObserver with `reduce_range=True` and dtype quint8.
    Weights: PerChannelMinMaxObserver with dtype qint8 and the
    per_channel_symmetric qscheme.
    """
    activation_factory = HistogramObserver.with_args(reduce_range=True, dtype=torch.quint8)
    weight_factory = PerChannelMinMaxObserver.with_args(
        dtype=torch.qint8, qscheme=torch.per_channel_symmetric
    )
    return quant.QConfig(activation=activation_factory, weight=weight_factory)

In [None]:
# Quantize a fresh ResNet-34 with the hand-built qconfig applied globally.
# NOTE(review): the saved output of this cell is a NameError for
# `prepare_qconfig` — the defining cell must be run first.
qconfig = prepare_qconfig()
qconfig_mapping = QConfigMapping().set_global(qconfig)
print(qconfig_mapping)

model = timm.create_model('resnet34', pretrained=True)
x, _ = dls.valid.one_batch()
# prepare_fx expects `example_inputs` as a tuple of positional arguments.
model_prepared = quantize_fx.prepare_fx(model.eval(), qconfig_mapping, (x.to('cpu'),))
# Calibration: forward passes only, without autograd and without keeping outputs.
with torch.no_grad():
    for xb, _ in dls.valid:
        model_prepared(xb.to('cpu'))
model_quantized = quantize_fx.convert_fx(model_prepared); model_quantized

NameError: name 'prepare_qconfig' is not defined

In [None]:
# Display the default QConfig for the "x86" backend, for comparison with the
# hand-built configuration.
get_default_qconfig("x86")

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})

In [None]:
# Quantize a fresh ResNet-34 with the default "x86" QConfig applied globally
# through an otherwise empty QConfigMapping.
qconfig = get_default_qconfig("x86")
qconfig_mapping = QConfigMapping().set_global(qconfig)
print(qconfig_mapping)

model = timm.create_model('resnet34', pretrained=True)
x, _ = dls.valid.one_batch()
# prepare_fx expects `example_inputs` as a tuple of positional arguments.
model_prepared = quantize_fx.prepare_fx(model.eval(), qconfig_mapping, (x.to('cpu'),))
# Calibration: forward passes only, without autograd and without keeping outputs.
with torch.no_grad():
    for xb, _ in dls.valid:
        model_prepared(xb.to('cpu'))
model_quantized = quantize_fx.convert_fx(model_prepared); model_quantized

QConfigMapping (
 global_qconfig
  QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
 object_type_qconfigs
  OrderedDict()
 module_name_regex_qconfigs
  OrderedDict()
 module_name_qconfigs
  OrderedDict()
 module_name_object_type_order_qconfigs
  OrderedDict()
)


GraphModule(
  (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(7, 7), stride=(2, 2), scale=0.0586698092520237, zero_point=0, padding=(3, 3))
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Module(
    (0): Module(
      (conv1): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.05712748318910599, zero_point=73, padding=(1, 1))
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.18472586572170258, zero_point=40, padding=(1, 1))
    )
    (1): Module(
      (conv1): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.1304682344198227, zero_point=70, padding=(1, 1))
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.22737379372119904, zero_point=86, padding=(1, 1))
    )
   

In [None]:
# dtype of the quantized model's output for the batch `x` moved to CPU.
model_quantized(x.to('cpu')).dtype

torch.float32

In [None]:
class Quantizer():
    """Post-training static quantization using FX graph mode.

    The model is prepared with the default qconfig mapping of `backend`,
    calibrated on the validation split of the given data loaders and then
    converted.
    """

    def __init__(self, backend="x86"):
        # Despite its name, this attribute holds the qconfig *mapping* returned
        # by `get_default_qconfig_mapping`; the name is kept for compatibility.
        self.qconfig = get_default_qconfig_mapping(backend)

    def quantize(self, model, calibration_dl):
        """Return a quantized version of `model`.

        Args:
            model: module to quantize; it is switched to eval mode.
            calibration_dl: object exposing `.valid`, which must provide
                `one_batch()` and iterate over `(inputs, targets)` pairs.

        Returns:
            The module produced by `convert_fx` after calibration.
        """
        x, _ = calibration_dl.valid.one_batch()
        # `example_inputs` is documented as a tuple of positional arguments.
        model_prepared = prepare_fx(model.eval(), self.qconfig, (x.to('cpu'),))

        # Calibration only needs forward passes: disable autograd and do not
        # keep the outputs, so no graph or activations pile up in memory.
        with torch.no_grad():
            for xb, _ in calibration_dl.valid:
                model_prepared(xb.to('cpu'))

        return convert_fx(model_prepared)

In [None]:
# Render the documentation for Quantizer.
# NOTE(review): the saved output shows a signature with observer/qtype/
# granularity parameters, whereas the Quantizer defined in the cell just
# before this one only takes `backend` — the output looks stale; re-run.
show_doc(Quantizer)

---

[source](https://github.com/nathanhubens/fasterai/tree/master/blob/master/fasterai/quantize/quantizer.py#L16){target="_blank" style="float:right; font-size:smaller"}

### Quantizer

>      Quantizer (activation_observer=<class
>                 'torch.ao.quantization.observer.MinMaxObserver'>,
>                 weight_observer=<class
>                 'torch.ao.quantization.observer.MinMaxObserver'>,
>                 activation_qtype=torch.qint8, weight_qtype=torch.quint8,
>                 granularity='tensor')

Initialize self.  See help(type(self)) for accurate signature.

In [None]:
# Fetch the PETS dataset through untar_data and list the image files found
# under its "images" sub-folder.
path = untar_data(URLs.PETS)
files = get_image_files(path/"images")

def label_func(f):
    """Return True when the first character of `f` is an uppercase letter.

    `f` is presumably the image file name handed over by the data loaders —
    confirm against the caller.
    """
    first_char = f[0]
    return first_char.isupper()

# Build the image DataLoaders: labels come from label_func, every item is
# resized with Resize(64).
dls = ImageDataLoaders.from_name_func(path, files, label_func, item_tfms=Resize(64))

In [None]:
import timm
# Pretrained ResNet-34 from timm, used as the float model to quantize.
pretrained_resnet_34 = timm.create_model('resnet34', pretrained=True)

In [None]:
class Quantizer:
    """Post-training static quantization (FX graph mode) with custom observers.

    Args:
        activation_observer: observer class used for activations.
        weight_observer: observer class used for weights.
        activation_qtype: dtype passed to the activation observer.
        weight_qtype: dtype passed to the weight observer.
        granularity: 'tensor' or 'channel'. With 'channel' the weight observer
            is additionally given `qscheme=torch.per_channel_symmetric`.
    """

    def __init__(self, activation_observer=MinMaxObserver, 
                 weight_observer=PerChannelMinMaxObserver,
                 activation_qtype=torch.quint8, weight_qtype=torch.qint8,
                 granularity='channel'):
        self.activation_observer = activation_observer
        self.weight_observer = weight_observer
        self.activation_qtype = activation_qtype
        self.weight_qtype = weight_qtype
        self.granularity = granularity

    def prepare_qconfig(self):
        """Build the QConfig described by the constructor arguments.

        Raises:
            ValueError: if `granularity` is neither 'tensor' nor 'channel'.
        """
        # NOTE(review): with granularity='tensor' the weight observer keeps its
        # own default qscheme; combined with the default
        # PerChannelMinMaxObserver this is presumably still per-channel — confirm.
        weight_kwargs = {'dtype': self.weight_qtype}
        if self.granularity == 'channel':
            weight_kwargs['qscheme'] = torch.per_channel_symmetric
        elif self.granularity != 'tensor':
            raise ValueError("Granularity must be 'tensor' or 'channel'")

        return QConfig(
            activation=self.activation_observer.with_args(dtype=self.activation_qtype),
            weight=self.weight_observer.with_args(**weight_kwargs),
        )

    def quantize(self, model, calibration_dl):
        """Return a quantized version of `model`, printing the QConfig used.

        Args:
            model: module to quantize; it is switched to eval mode.
            calibration_dl: object exposing `.valid`, which must provide
                `one_batch()` and iterate over `(inputs, targets)` pairs.

        Returns:
            The module produced by `convert_fx` after calibration.
        """
        qconfig = self.prepare_qconfig()
        qconfig_mapping = QConfigMapping().set_global(qconfig)

        # Echo the effective configuration so it can be inspected in the notebook.
        print(qconfig)

        x, _ = calibration_dl.valid.one_batch()
        # `example_inputs` is documented as a tuple of positional arguments.
        model_prepared = quantize_fx.prepare_fx(model.eval(), qconfig_mapping, (x.to('cpu'),))

        # Calibration only needs forward passes: disable autograd and do not
        # keep the outputs, so no graph or activations pile up in memory.
        with torch.no_grad():
            for xb, _ in calibration_dl.valid:
                model_prepared(xb.to('cpu'))

        return quantize_fx.convert_fx(model_prepared)

In [None]:
# Quantize with granularity='channel'; the trailing ';' suppresses the cell's
# display, so only the QConfig printed by quantize() shows up.
qt = Quantizer(granularity='channel')
q_model = qt.quantize(pretrained_resnet_34, dls);

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.quint8){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})


In [None]:
# Same as above but requesting torch.quint4x2 as the weight dtype.
# NOTE(review): the next cell reports float32 weights for conv1[0], which
# suggests this configuration did not produce a quantized conv — confirm.
qt = Quantizer(granularity='channel', weight_qtype=torch.quint4x2)
q_model = qt.quantize(pretrained_resnet_34, dls);

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.quint8){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.quint4x2, qscheme=torch.per_channel_symmetric){})


In [None]:
# Weight dtype of the first sub-module inside conv1 of the converted model.
q_model.conv1[0].weight.dtype

torch.float32

In [None]:
from fasterbench.fasterbench.benchmark import evaluate_cpu_speed, get_model_size, get_num_parameters

In [None]:
# Grab one batch from `dls`; `x` is reused by the benchmark and timing cells below.
x, y = dls.one_batch()

In [None]:
# Benchmark q_model on a single sample: x[0] selects the first item and [None]
# re-adds a leading dimension of size 1 (assuming x is a tensor — TODO confirm).
evaluate_cpu_speed(q_model, x[0][None])

(3.935255412943661, 0.5983745225580773, 254.11311212757525)

In [None]:
get_model_size(q_model)

22049152

In [None]:
# Re-quantize the pretrained model with the Quantizer's default settings.
qt = Quantizer()
q_model = qt.quantize(pretrained_resnet_34, dls)

In [None]:
evaluate_cpu_speed(q_model, x[0][None])

(3.9707702537998557, 0.736204133147707, 251.8403070646163)

In [None]:
get_model_size(q_model)

22049152

In [None]:
q_model

GraphModule(
  (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(7, 7), stride=(2, 2), scale=0.06453105062246323, zero_point=0, padding=(3, 3))
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Module(
    (0): Module(
      (conv1): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.05736920237541199, zero_point=75, padding=(1, 1))
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.18261493742465973, zero_point=39, padding=(1, 1))
    )
    (1): Module(
      (conv1): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.12924925982952118, zero_point=70, padding=(1, 1))
      (drop_block): Identity()
      (act1): ReLU(inplace=True)
      (aa): Identity()
      (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.22187039256095886, zero_point=86, padding=(1, 1))
    )
 

In [None]:
%%timeit
# Time a forward pass of the quantized model on the batch moved to CPU.
q_model(x.cpu())

53.3 ms ± 2.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Re-create the float ResNet-34 to serve as the timing baseline.
pretrained_resnet_34 = timm.create_model('resnet34', pretrained=True)

In [None]:
%%timeit
# Time a forward pass of the float baseline on the same CPU batch.
pretrained_resnet_34(x.cpu())

75 ms ± 70.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# dtype of conv1's weight, obtained through the module's weight() method.
q_model.conv1.weight().dtype

torch.qint8

In [None]:
# dtype of the first slice of conv1's weight.
# NOTE(review): the saved output is an AttributeError about 'ConvReLU2d'
# lacking `weight`, although the previous cell's weight() call succeeded —
# presumably q_model was a different object when this ran; re-run to confirm.
q_model.conv1.weight()[0].dtype

AttributeError: 'ConvReLU2d' object has no attribute 'weight'

In [None]:
# Alternative model kept for reference (disabled):
#model = resnet18()
#model.fc = nn.Linear(512, 2)

# Quantize whatever `model` currently refers to with the default Quantizer.
# NOTE(review): the saved output is a KeyboardInterrupt, i.e. this run was
# interrupted before completing.
qt = Quantizer()

q_model = qt.quantize(model, dls); q_model

KeyboardInterrupt: 