In [1]:
from adapt.approx_layers.systolic_utils import compare_exact_vs_approx#, testlinear,check_equal
#check_equal()

# Model evaluation and re-training with AdaPT on Cifar10 dataset

In this notebook you can evaluate different approximate multipliers on various models based on Cifar10 dataset

Steps:
* Select models to load 
* Select number of threads to use
* Choose approximate multiplier 
* Load model for evaluation
* Load dataset
* Run model calibration for quantization
* Run model evaluation
* Run approximate-aware re-training
* Rerun model evaluation

**Note**:
* This notebook should be run on a X86 machine

* Please make sure you have run the installation steps first

In [2]:
import os
import zipfile
import torch

import requests
from torch.utils.data import DataLoader
from torchvision import transforms as T
from torchvision.datasets import CIFAR10
from tqdm import tqdm
import torch.nn as nn

## Select models to load 

The weights must be downloaded in state_dicts folder.


In [3]:
from models.resnet import resnet18, resnet34, resnet50
#from models.resnet_systolic import resnet50_systolic
from models.vgg import vgg11_bn, vgg13_bn, vgg19_bn
from models.densenet import densenet121, densenet161, densenet169
from models.inception import inception_v3 # slow, propably bad cifar10 implementation of inception for PT

## Select number of threads to use

For optimal performance set them as the number of your cpu threads (not cpu cores)

In [4]:
threads = 40
torch.set_num_threads(threads)

#maybe better performance
%env OMP_PLACES=cores
%env OMP_PROC_BIND=close
%env OMP_WAIT_POLICY=active

env: OMP_PLACES=cores
env: OMP_PROC_BIND=close
env: OMP_WAIT_POLICY=active


## Choose approximate multiplier 

Two approximate multipliers are already provided

**mul8s_acc** - (header file: mul8s_acc.h)   <--  default

**mul8s_1L2H** - (header file: mul8s_1L2H.h)



In order to use your custom multiplier you need to use the provided tool (LUT_generator) to easily create the C++ header for your multiplier. Then you just place it inside the adapt/cpu-kernels/axx_mults folder. The name of the axx_mult here must match the name of the header file. The same axx_mult is used in all layers. 

Tip: If you want explicitly to set for each layer a different axx_mult you must do it from the model definition using the respective AdaPT_Conv2d class of each layer.

In [5]:
axx_mult = 'mul8s_acc'

## Load model for evaluation

Jit compilation method loads 'on the fly' the C++ extentions of the approximate multipliers. Then the pytorch model is loaded

In [6]:
model = resnet50(pretrained=True, axx_mult = axx_mult)

model.eval() # for evaluation

Using /root/.cache/torch_extensions as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/PyInit_conv2d_mul8s_acc/build.ninja...
Building extension module PyInit_conv2d_mul8s_acc...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skip

Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...


ResNet(
  (conv1): AdaPT_Conv2d(
    3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
    (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
    (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): AdaPT_Conv2d(
        64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False
        (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
        (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): AdaPT_Conv2d(
      

## Load dataset


In [7]:
def val_dataloader(mean = (0.4914, 0.4822, 0.4465), std = (0.2471, 0.2435, 0.2616)):

    transform = T.Compose(
        [
            T.ToTensor(),
            T.Normalize(mean, std),
        ]
    )
    dataset = CIFAR10(root="datasets/cifar10_data", train=False, download=True, transform=transform)
    dataloader = DataLoader(
        dataset,
        batch_size=128,
        num_workers=0,
        drop_last=True,
        pin_memory=False,
    )
    return dataloader

transform = T.Compose(
        [
            T.RandomCrop(32, padding=4),
            T.RandomHorizontalFlip(),
            T.ToTensor(),
            T.Normalize(mean = (0.4914, 0.4822, 0.4465), std = (0.2471, 0.2435, 0.2616)),
        ]
    )
dataset = CIFAR10(root="datasets/cifar10_data", train=True, download=True, transform=transform)

evens = list(range(0, len(dataset), 10))
trainset_1 = torch.utils.data.Subset(dataset, evens)

data = val_dataloader()

# data_t is used for calibration purposes and is a subset of train-set
data_t = DataLoader(trainset_1, batch_size=128,
                                            shuffle=False, num_workers=0)


Files already downloaded and verified
Files already downloaded and verified


## Run model calibration for quantization

Calibrates the quantization parameters 

Need to re-run it each time the model changes

In [8]:
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import calib

def collect_stats(model, data_loader, num_batches):
     """Feed data to the network and collect statistic"""

     # Enable calibrators
     for name, module in model.named_modules():
         if isinstance(module, quant_nn.TensorQuantizer):
             if module._calibrator is not None:
                 module.disable_quant()
                 module.enable_calib()
             else:
                 module.disable()

     for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches):
         model(image.cpu())
         if i >= num_batches:
             break

     # Disable calibrators
     for name, module in model.named_modules():
         if isinstance(module, quant_nn.TensorQuantizer):
             if module._calibrator is not None:
                 module.enable_quant()
                 module.disable_calib()
             else:
                 module.enable()

def compute_amax(model, **kwargs):
 # Load calib result
 for name, module in model.named_modules():
     if isinstance(module, quant_nn.TensorQuantizer):
         if module._calibrator is not None:
             if isinstance(module._calibrator, calib.MaxCalibrator):
                 module.load_calib_amax()
             else:
                 module.load_calib_amax(**kwargs)
         print(F"{name:40}: {module}")
 model.cpu()

# It is a bit slow since we collect histograms on CPU
#with torch.no_grad():
#    stats = collect_stats(model, data_t, num_batches=2)
#    amax = compute_amax(model, method="percentile", percentile=99.99)
    
    # optional - test different calibration methods
    #amax = compute_amax(model, method="mse")
    #amax = compute_amax(model, method="entropy")
    

## Run model evaluation

Tip: observe how the execution becomes faster and faster with each batch as the CPU achieves better cache re-use on the LUT table

In [9]:
import timeit
#correct = 0
#total = 0

#model.eval()
#start_time = timeit.default_timer()
#with torch.no_grad():
#    for iteraction, (images, labels) in tqdm(enumerate(data), total=len(data)):
#        images, labels = images.to("cpu"), labels.to("cpu")
#        outputs = model(images)
#        _, predicted = torch.max(outputs.data, 1)
#        total += labels.size(0)
#        correct += (predicted == labels).sum().item()
#print(timeit.default_timer() - start_time)
#print('Accuracy of the network on the 10000 test images: %.4f %%' % (
#    100 * correct / total))

## Run approximate-aware re-training


In [10]:
from adapt.references.classification.train import evaluate, train_one_epoch, load_data

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# finetune the model for one epoch based on data_t subset 
#train_one_epoch(model, criterion, optimizer, data_t, "cpu", 0, 1)



In [11]:
import torch

save_path = "aa_finetuned_model.pth"

# Load the state dict
state_dict = torch.load(save_path, map_location="cpu")

# Apply it
model.load_state_dict(state_dict, strict=False)
model.eval()

ResNet(
  (conv1): AdaPT_Conv2d(
    3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
    (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
    (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): AdaPT_Conv2d(
        64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False
        (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
        (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): AdaPT_Conv2d(
      

In [12]:


model_cpu = model.to("cpu").eval()

# Strip quantizer buffers before saving
cpu_sd = {k: v for k, v in model_cpu.state_dict().items()
          if not k.endswith("._amax") and k.split(".")[-1] not in
          {"_amax", "_amax_hist", "_amax_threshold", "_observed_amax", "_calibrator"}}

torch.save(cpu_sd, save_path)


## Rerun model evaluation

In [13]:
correct = 0
total = 0

device = torch.device("cpu")
#model = resnet50(pretrained=False, axx_mult=axx_mult)
#model.load_state_dict(torch.load(save_path))
model.eval()
start_time = timeit.default_timer()
#with torch.no_grad():
#    for iteraction, (images, labels) in tqdm(enumerate(data), total=len(data)):
#        images, labels = images.to("cpu"), labels.to("cpu")
#        outputs = model(images)
#        _, predicted = torch.max(outputs.data, 1)
#        total += labels.size(0)
#        correct += (predicted == labels).sum().item()
#print(timeit.default_timer() - start_time)
#print('Accuracy of the network on the 10000 test images: %.4f %%' % (
#    100 * correct / total))

In [14]:
# force TensorQuantizers to load buffers on CPU instead of .cuda()
def force_quantizer_cpu():
    import torch
    from pytorch_quantization.nn import modules

    orig_fn = modules.tensor_quantizer.TensorQuantizer._load_from_state_dict

    def new_fn(self, state_dict, prefix, *args, **kwargs):
        key = prefix + '_amax'
        if key in state_dict:
            buf = state_dict[key].data.cpu()   # force CPU
            self.register_buffer("_amax", buf)
        else:
            # fallback to original
            orig_fn(self, state_dict, prefix, *args, **kwargs)

    modules.tensor_quantizer.TensorQuantizer._load_from_state_dict = new_fn

force_quantizer_cpu()

In [15]:
#from torchvision.models import resnet50
from models.resnet import resnet18, resnet34, resnet50
from torchvision import transforms
from adapt.approx_layers.systolic_utils import compare_exact_vs_approx
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2470, 0.2435, 0.2616)),   # CIFAR-10 stats (example)
])

test_dataset = CIFAR10(root="datasets/cifar10_data", train=False, download=True, transform=transform)

# IMPORTANT: num_workers=0 to avoid OpenMP + fork segfaults with C++ extensions
from torch.utils.data import DataLoader
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=0, pin_memory=False)

# Your already-initialized/trained model:
# model = resnet50(pretrained=True, axx_mult=axx_mult)  # (or your own wrapper)


Files already downloaded and verified


In [18]:
from adapt.approx_layers.systolic_utils import compare_exact_vs_approx, make_exact_and_approx_models
from models.resnet_systolic import resnet50_systolic
axx_mult="mul8s_acc"
model_exact,model_approx = make_exact_and_approx_models(model,  
                    model_factory=lambda: resnet50(pretrained=True, axx_mult = axx_mult)
                    , axx_mult=axx_mult)

model_exact.eval()
model_approx.eval()


Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded ex

Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded ex

ResNet(
  (conv1): AdaPT_Conv2d_Systolic(
    (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
    (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): AdaPT_Conv2d_Systolic(
        (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
        (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): AdaPT_Conv2d_Systolic(
        (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
        (q

In [19]:

correct = 0
total = 0

device = torch.device("cpu")
#model = resnet50(pretrained=False, axx_mult=axx_mult)
#model.load_state_dict(torch.load(save_path))
model_approx.eval()
start_time = timeit.default_timer()
with torch.no_grad():
    for iteraction, (images, labels) in tqdm(enumerate(data), total=len(data)):
        images, labels = images.to("cpu"), labels.to("cpu")
        outputs = model_approx(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(timeit.default_timer() - start_time)
print('Accuracy of the network on the 10000 test images: %.4f %%' % (
    100 * correct / total))

100%|█

74.76232749699557
Accuracy of the network on the 10000 test images: 93.5096 %





In [20]:

correct = 0
total = 0

device = torch.device("cpu")
#model = resnet50(pretrained=False, axx_mult=axx_mult)
#model.load_state_dict(torch.load(save_path))
model_exact.eval()
start_time = timeit.default_timer()
with torch.no_grad():
    for iteraction, (images, labels) in tqdm(enumerate(data), total=len(data)):
        images, labels = images.to("cpu"), labels.to("cpu")
        outputs = model_exact(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(timeit.default_timer() - start_time)
print('Accuracy of the network on the 10000 test images: %.4f %%' % (
    100 * correct / total))

100%|█

69.71960497600958
Accuracy of the network on the 10000 test images: 93.5096 %





In [21]:
axx_mult="mul8s_1L2H"
model_exact,model_approx = make_exact_and_approx_models(model,  
                    model_factory=lambda: resnet50(pretrained=True, axx_mult = axx_mult)
                    , axx_mult=axx_mult)

model_exact.eval()
model_approx.eval()

Using /root/.cache/torch_extensions as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/PyInit_conv2d_mul8s_1L2H/build.ninja...
Building extension module PyInit_conv2d_mul8s_1L2H...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module PyInit_conv2d_mul8s_1L2H...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_1L2H, skipping build step...
Loading extension module PyInit_conv2d_mul8s_1L2H...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_1L2H, skipping build step...
Loading extension module PyInit_conv2d_mul8s_1L2H...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_1L

Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_1L2H, skipping build step...
Loading extension module PyInit_conv2d_mul8s_1L2H...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_1L2H, skipping build step...
Loading extension module PyInit_conv2d_mul8s_1L2H...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_1L2H, skipping build step...
Loading extension module PyInit_conv2d_mul8s_1L2H...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_1L2H, skipping build step...
Loading extension module PyInit_conv2d_mul8s_1L2H...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-l

ResNet(
  (conv1): AdaPT_Conv2d_Systolic(
    (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
    (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): AdaPT_Conv2d_Systolic(
        (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
        (quantizer_w): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): AdaPT_Conv2d_Systolic(
        (quantizer): TensorQuantizer(8bit per-tensor amax=dynamic calibrator=HistogramCalibrator quant)
        (q

In [22]:

correct = 0
total = 0

device = torch.device("cpu")
#model = resnet50(pretrained=False, axx_mult=axx_mult)
#model.load_state_dict(torch.load(save_path))
model_exact.eval()
start_time = timeit.default_timer()
with torch.no_grad():
    for iteraction, (images, labels) in tqdm(enumerate(data), total=len(data)):
        images, labels = images.to("cpu"), labels.to("cpu")
        outputs = model_exact(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(timeit.default_timer() - start_time)
print('Accuracy of the exact network on the 10000 test images: %.4f %%' % (
    100 * correct / total))


correct = 0
total = 0

device = torch.device("cpu")
#model = resnet50(pretrained=False, axx_mult=axx_mult)
#model.load_state_dict(torch.load(save_path))
model_approx.eval()
start_time = timeit.default_timer()
with torch.no_grad():
    for iteraction, (images, labels) in tqdm(enumerate(data), total=len(data)):
        images, labels = images.to("cpu"), labels.to("cpu")
        outputs = model_approx(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(timeit.default_timer() - start_time)
print('Accuracy of the approx network on the 10000 test images: %.4f %%' % (
    100 * correct / total))

100%|█


93.8867325559986
Accuracy of the network on the 10000 test images: 93.5096 %


100%|█

66.77874595100002
Accuracy of the network on the 10000 test images: 93.5096 %





In [None]:
axx_mult="mul8s_acc"
acc_baseline, acc_exact, acc_approx, delta = compare_exact_vs_approx(
    model,
    test_loader,
    model_factory=lambda: resnet50(pretrained=False, axx_mult = axx_mult),
    axx_mult=axx_mult,
    device="cpu",
)

print(f"Baseline: {acc_baseline:.4f} | Exact systolic: {acc_exact:.4f} | "
      f"Approx systolic: {acc_approx:.4f} | Δ={delta:.4f}")

Evaluating baseline
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detec

Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded ex

Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module PyInit_conv2d_mul8s_acc, skipping build step...
Loading extension module PyInit_conv2d_mul8s_acc...
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded ex

In [None]:
axx_mult = 'mul8s_1L2H'
acc_baseline, acc_exact, acc_approx, delta = compare_exact_vs_approx(
    model,
    test_loader,
    model_factory=lambda: resnet50(pretrained=True, axx_mult = axx_mult),
    axx_mult=axx_mult,
    device="cpu",
)

print(f"Baseline: {acc_baseline:.4f} | Exact systolic: {acc_exact:.4f} | "
      f"Approx systolic: {acc_approx:.4f} | Δ={delta:.4f}")