In [None]:
# Environment bootstrap for a fresh Colab VM. Versions are not pinned, so a
# re-run at a later date may pull different releases.

# System package; presumably needed by the callgrind collection cell -- TODO confirm.
!sudo apt install valgrind

# Model / dataset stack (torch, torchvision, detectors and timm are imported further down).
!pip install torch
!pip install torchvision
!pip install detectors
!pip install timm

# Experiment tracking.
!pip install mlflow

# ONNX export, runtime and model viewer.
# NOTE(review): later cells request CUDAExecutionProvider when CUDA is available;
# confirm that the plain `onnxruntime` wheel provides it in this environment.
!pip install onnx
!pip install onnxscript
!pip install onnxruntime
!pip install netron

# NOTE(review): neither of these two packages is imported in the visible cells -- confirm they are still needed.
!pip install torch_tb_profiler
!pip install pytorch-lightning
# Make nvidia-smi reachable under /usr/bin by symlinking the copy in /opt/bin.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  gdb libbabeltrace1 libc6-dbg libdebuginfod-common libdebuginfod1 libipt2
  libsource-highlight-common libsource-highlight4v5
Suggested packages:
  gdb-doc gdbserver valgrind-dbg valgrind-mpi kcachegrind alleyoop valkyrie
The following NEW packages will be installed:
  gdb libbabeltrace1 libc6-dbg libdebuginfod-common libdebuginfod1 libipt2
  libsource-highlight-common libsource-highlight4v5 valgrind
0 upgraded, 9 newly installed, 0 to remove and 49 not upgraded.
Need to get 32.3 MB of archives.
After this operation, 111 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libdebuginfod-common all 0.186-1build1 [7,878 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libbabeltrace1 amd64 1.5.8-2build1 [160 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libdebuginfod1 amd6

In [None]:
#@title MLFlow Tracking Auth
import mlflow
import os
from getpass import getpass
import configparser
from google.colab import userdata

def load_env_from_file(path:str):
  """Export every key/value pair of an INI file as an environment variable.

  Keys from all named sections are upper-cased and written to ``os.environ``;
  when the same key appears in several sections the last one read wins.

  Args:
    path: Location of the INI file. A missing or unreadable file is skipped
      silently (``ConfigParser.read`` semantics), leaving the environment
      untouched.
  """
  # interpolation=None: credentials routinely contain '%', which the default
  # BasicInterpolation treats as a substitution marker and rejects with
  # InterpolationSyntaxError. Values must be taken verbatim.
  config = configparser.ConfigParser(interpolation=None)

  # Read the INI file
  config.read(path)

  # Iterate over sections and keys to set environment variables
  for section in config.sections():
    for key, value in config.items(section):
      # configparser lower-cases option names; env vars are conventionally upper-case
      os.environ[key.upper()] = value

# MLFlow auth
# Credential lookup order: local INI file -> Colab secrets -> interactive prompt.
mlflow_auth_path = 'mlflow_auth.ini'
if os.path.exists(mlflow_auth_path):
  load_env_from_file(mlflow_auth_path)
else:
  try:
    os.environ['MLFLOW_TRACKING_URI'] = userdata.get('MLFLOW_TRACKING_URI')
    os.environ['MLFLOW_TRACKING_USERNAME'] = userdata.get('MLFLOW_TRACKING_USERNAME')
    os.environ['MLFLOW_TRACKING_PASSWORD'] = userdata.get('MLFLOW_TRACKING_PASSWORD')
  except userdata.SecretNotFoundError:
    # Any missing secret falls back to prompting for all three values.
    os.environ['MLFLOW_TRACKING_URI'] = input('Enter MLflow uri: ')
    os.environ['MLFLOW_TRACKING_USERNAME'] = input('Enter your MLflow username: ')
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('Enter your MLflow password: ')
    # now proxied
    # os.environ['AWS_ACCESS_KEY_ID'] = input('Enter your s3 compatible Identity: ')
    # os.environ['AWS_SECRET_ACCESS_KEY'] = getpass('Enter your s3 compatible Key: ')

mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

experiment = mlflow.set_experiment("onnx_quantization_test5")

# Do not call mlflow.get_artifact_uri() outside a run: it implicitly starts one,
# which left an extra empty run on the tracking server every time this cell ran.

# Close any run left active by a previous execution of the notebook, then do a
# connectivity smoke test inside an explicit, self-closing run.
mlflow.end_run()
with mlflow.start_run():
  print(mlflow.get_artifact_uri())
  print(experiment.artifact_location)

Enter MLflow uri: http://mlflow.cavidano.com
Enter your MLflow username: collin
Enter your MLflow password: ··········


2024/09/20 23:49:01 INFO mlflow.tracking.fluent: Experiment with name 'onnx_quantization_test5' does not exist. Creating a new experiment.


mlflow-artifacts:/6/d4f50522faa44b47a9e727b35e39ae2e/artifacts


2024/09/20 23:49:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run languid-gull-863 at: http://mlflow.cavidano.com/#/experiments/6/runs/d4f50522faa44b47a9e727b35e39ae2e.
2024/09/20 23:49:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow.cavidano.com/#/experiments/6.


mlflow-artifacts:/6/138583dac56e4334879a806fd0ad3e46/artifacts
mlflow-artifacts:/6


2024/09/20 23:49:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run sedate-bass-999 at: http://mlflow.cavidano.com/#/experiments/6/runs/138583dac56e4334879a806fd0ad3e46.
2024/09/20 23:49:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow.cavidano.com/#/experiments/6.


In [None]:
#@title Wrapper for calibration data loader so onnx can use it

# TODO: add a work-dir argument later; for now the user is responsible for cleaning up the generated files
import torch
import onnx
import onnxruntime as ort
from onnxruntime import quantization
import numpy as np

# Wrapper for calibration data loader so onnx can use it
class QuantizationDataReader(quantization.CalibrationDataReader):
  """Feeds batches from a torch data loader to the ONNX Runtime calibrator.

  Each call to ``get_next`` yields one ``{input_name: ndarray}`` feed built
  from element 0 of the next batch, or ``None`` once the loader is exhausted.
  """

  def __init__(self, torch_dl, input_name):
    # torch_dl: iterable with a length whose batches are indexable; element 0 is used as the model input.
    # input_name: name of the ONNX graph input the batches are fed to.
    self.input_name = input_name
    self.torch_dl = torch_dl
    self.datasize = len(torch_dl)
    self.enum_data = iter(torch_dl)

  def to_numpy(self, pt_tensor):
    """Convert a torch tensor to a host NumPy array, detaching it from autograd if needed."""
    if pt_tensor.requires_grad:
      pt_tensor = pt_tensor.detach()
    return pt_tensor.cpu().numpy()

  def get_next(self):
    """Return the feed dict for the next batch, or None when no batches remain."""
    batch = next(self.enum_data, None)
    if batch is None:
      return None
    inputs = batch[0]
    return {self.input_name: self.to_numpy(inputs)}

  def rewind(self):
    """Restart iteration from the first batch of the loader."""
    self.enum_data = iter(self.torch_dl)

def onnx_convert(model_pt, output_path, torch_input:torch.Tensor, training=False)-> None:
  """Export a PyTorch model to an ONNX file with a dynamic batch dimension.

  Args:
    model_pt: The model to export.
    output_path: Destination of the ``.onnx`` file.
    torch_input: Example input used to trace the model; axis 0 is exported as
      the dynamic ``batch_size`` axis for both ``input`` and ``output``.
    training: Export in training mode (and skip constant folding) when True,
      otherwise export in eval mode with constant folding enabled.
  """
  # if it contains batch norms double check they arent causing accuracy issues at eval time.
  #If so run this in training mode

  # The enum members are upper-case (EVAL / PRESERVE / TRAINING); the previous
  # `TrainingMode.Eval` spelling raised AttributeError whenever training=False.
  export_mode = torch.onnx.TrainingMode.TRAINING if training else torch.onnx.TrainingMode.EVAL

  torch.onnx.export(model_pt,
                    torch_input,
                    output_path,
                    export_params=True,
                    opset_version=14,
                    do_constant_folding=(not training),
                    training=export_mode,
                    input_names = ['input'],
                    output_names = ['output'],
                    dynamic_axes={'input' : {0 : 'batch_size'},
                                  'output' : {0 : 'batch_size'}})

def onnx_quantize(init_model_path:str, model_name:str, calibration_data_loader:torch.utils.data.DataLoader)->str:
  """Statically quantize an ONNX model and return the path of the result.

  Writes two files relative to the current working directory:
  ``{model_name}_prep.onnx`` (pre-processed model) and
  ``{model_name}_int8.onnx`` (quantized model).

  Args:
    init_model_path: Path of the ONNX model to quantize.
    model_name: Prefix used for the generated file names.
    calibration_data_loader: Loader whose batches drive the calibration.

  Returns:
    Path of the quantized model file.
  """
  has_cuda = torch.cuda.is_available()
  providers = ['CUDAExecutionProvider'] if has_cuda else ['CPUExecutionProvider']
  # The session is only used to look up the name of the model's first input.
  session = ort.InferenceSession(init_model_path, providers=providers)

  # Validate the model, then run the quantization pre-processing pass.
  onnx.checker.check_model(onnx.load(init_model_path))
  prepared_path = f'{model_name}_prep.onnx'
  quantization.shape_inference.quant_pre_process(init_model_path, prepared_path, skip_symbolic_shape=False)

  first_input_name = session.get_inputs()[0].name
  reader = QuantizationDataReader(calibration_data_loader, input_name=first_input_name)

  # Static quantization; activations are symmetric only when CUDA is available.
  quantized_path = f'{model_name}_int8.onnx'
  quantization.quantize_static(
      model_input=prepared_path,
      model_output=quantized_path,
      calibration_data_reader=reader,
      extra_options={"ActivationSymmetric": has_cuda, "WeightSymmetric": True},
  )
  return quantized_path


In [None]:
#@title Torch setup
import torch
from torch import optim, nn,  utils, Tensor
from itertools import repeat
from torchsummary import summary
from torchvision import models, transforms
import torchvision

batch_size = 128
# Fixed seed so the calibration/test split -- and therefore every accuracy and
# quantization result below -- is identical after Restart & Run All.
SPLIT_SEED = 0

dataset = torchvision.datasets.CIFAR10(root="./data", download=True, transform=transforms.ToTensor())
calib_ds, test_ds = torch.utils.data.random_split(
    dataset, [0.5, 0.5], generator=torch.Generator().manual_seed(SPLIT_SEED))

# shortening test_ds again: only the first 1000 samples of the test half are evaluated
test_ds = torch.utils.data.Subset(test_ds, range(0, 1000))

# shuffle=False keeps the batch order deterministic for both loaders
calibration_data_loader = utils.data.DataLoader(calib_ds, batch_size=batch_size, shuffle=False)
test_data_loader = utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:10<00:00, 16278068.72it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data


In [None]:
#@title Model download and quantization
# Pretrained checkpoint: https://huggingface.co/edadaltocg/resnet18_cifar10
# `detectors` is not referenced by name below; presumably it is imported so that
# "resnet18_cifar10" is known to timm -- TODO confirm.
import detectors
import timm
model = timm.create_model("resnet18_cifar10", pretrained=True)

# FIXME: the export below should work without putting the model in training mode.
# Batch norm used to cause a problem in the conversion; the workaround was to
# export with training=True, which is not the right fix.

# Abandoned experiment: remove the batch-norm running statistics,
# hoping that ONNX, having no values for them, falls back to per-batch statistics just like PyTorch itself does.
# def stop(m):
#   if isinstance(m, nn.BatchNorm2d):
#     m.track_running_stats = False
#     m.running_mean = None
#     m.running_var = None
# model.eval()
# model.apply(lambda m: stop(m))


# Warning observed:
# UserWarning: ONNX export mode is set to TrainingMode.EVAL, but operator 'batch_norm' is set to train=True. Exporting with train=True.

# Alternative model: pretrained ResNet-18 via torch hub
# model = torch.hub.load("pytorch/vision", "resnet18", weights="IMAGENET1K_V1")
# model.eval()

# Export to ONNX (training mode, see FIXME above), then statically quantize
# using the calibration loader.
converted_model_path = "onnx_res18.onnx"
torch_input = torch.randn(batch_size, 3, 32, 32) # has to match dataset and of course model
onnx_convert(model, converted_model_path, torch_input, training=True)

onnx_q_model_path = onnx_quantize(converted_model_path, "resnet18", calibration_data_loader)

Downloading: "https://huggingface.co/edadaltocg/resnet18_cifar10/resolve/main/pytorch_model.bin" to /root/.cache/torch/hub/checkpoints/resnet18_cifar10.pth
100%|██████████| 42.7M/42.7M [00:03<00:00, 13.1MB/s]
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module)
  _C._jit_pass_onnx_remove_inplace_

In [None]:
def to_numpy(tensor):
  """Return ``tensor`` as a NumPy array in host memory.

  Args:
    tensor: A torch tensor (on any device, with or without grad) or an
      ``np.ndarray``. Arrays are returned unchanged so callers that already
      hold NumPy data (e.g. pre-converted benchmark inputs) can use the same
      helper.

  Returns:
    The data as an ``np.ndarray``.
  """
  if isinstance(tensor, np.ndarray):
    return tensor
  # detach() is a no-op for tensors that do not require grad, so no branch is needed
  return tensor.detach().cpu().numpy()

In [None]:
#@title reload and log onnx models BROKEN
# have to log from an onnx object and conversion and quantization always gives paths so have to reload...
import logging

onnx_model = onnx.load(converted_model_path)
onnx_q_model = onnx.load(onnx_q_model_path)

# Verbose MLflow logging while the upload path is being debugged.
logging.getLogger("mlflow").setLevel(logging.DEBUG)
# Example input, kept for the optional `input_example=np_in` argument of log_model.
np_in = to_numpy(torch_input)

# log_model expects a *run-relative* artifact path, not a "mlflow-artifacts:/..."
# URI. Logging inside an explicit run also avoids leaving an implicitly started
# run active, which would make the next `mlflow.start_run()` fail.
mlflow.end_run()
with mlflow.start_run(run_name="log_onnx_models"):
  onnx_model_info = mlflow.onnx.log_model(onnx_model, "onnx_res18")
  onnx_q_model_info = mlflow.onnx.log_model(onnx_q_model, "onnx_res18_int8")

# These URIs (they embed the run id) are what mlflow.onnx.load_model needs later.
print(onnx_model_info.model_uri)
print(onnx_q_model_info.model_uri)



<function print>

In [None]:
#@title load onnx models from registry BROKEN
# Intent: fetch the two logged models back so the rest of the notebook can run
# without re-exporting. The author marks this cell as not working.
# should save to same file names
converted_model_path = "onnx_res18.onnx"
onnx_q_model_path = "onnx_res18_int8.onnx"

# pretty sure I need run ids for this to work
# NOTE(review): the return values are discarded, and nothing here writes files
# named like the two path variables above -- confirm what load_model places
# under dst_path before relying on those paths.
mlflow.onnx.load_model("mlflow-artifacts:/onnx_res18.onnx", dst_path="./")
mlflow.onnx.load_model("mlflow-artifacts:/onnx_res18_int8.onnx", dst_path="./")

In [None]:
#@title Eval code onnx model inits and helpers

def run_pytorch(model, data):
  """Run one inference forward pass of ``model`` on ``data`` and return the output.

  The pass runs under ``torch.no_grad()``: this helper is used for timing and
  profiling, and building the autograd graph would add work that the ONNX
  sessions it is compared against never do.
  """
  with torch.no_grad():
    return model(data)

def run_onnx(session, data):
  """Feed ``data`` to the first input of an ONNX Runtime session and return its first output."""
  first_input = session.get_inputs()[0]
  feed = {first_input.name: to_numpy(data)}
  outputs = session.run(None, feed)
  return outputs[0]

# Put the PyTorch model in eval mode, on the GPU when one is available.
model.eval()
if torch.cuda.is_available():
  model.to('cuda')

# ONNX Runtime sessions for the converted and the quantized model, using the
# matching execution provider.
ort_provider = ['CPUExecutionProvider']
if torch.cuda.is_available():
  ort_provider = ['CUDAExecutionProvider']
ort_converted_sess, ort_quantized_sess = (
    ort.InferenceSession(model_path, providers=ort_provider)
    for model_path in (converted_model_path, onnx_q_model_path)
)

# Thread count reused by the benchmark timers below.
num_threads = torch.get_num_threads()


In [None]:
#@title COMPARE ALL FORMS ACCURACY
from tqdm import tqdm

# Running totals over the whole test set.
correct_pt = 0
correct_converted_onnx = 0
correct_quantized_onnx = 0
pt_to_conv_tot_abs_error = 0
conv_to_quant_tot_abs_error = 0

for img_batch, label_batch in tqdm(test_data_loader, ascii=True, unit="batches"):

  # pytorch
  if torch.cuda.is_available():
    img_batch = img_batch.to('cuda')
    label_batch = label_batch.to('cuda')

  with torch.no_grad():
    pt_outs = model(img_batch)

  pt_preds = torch.argmax(pt_outs, dim=1)
  # .item() keeps the counter a plain int so the report prints a number, not tensor(...)
  correct_pt += torch.sum(pt_preds == label_batch).item()

  # Host copies, converted once and shared by all NumPy-side comparisons.
  labels_np = to_numpy(label_batch)
  pt_outs_np = to_numpy(pt_outs)

  # converted
  ort_converted_outs = run_onnx(ort_converted_sess, img_batch)
  ort_preds = np.argmax(ort_converted_outs, axis=1)
  correct_converted_onnx += np.sum(np.equal(ort_preds, labels_np))

  # quantized
  ort_quantized_outs = run_onnx(ort_quantized_sess, img_batch)
  ort_preds = np.argmax(ort_quantized_outs, axis=1)
  correct_quantized_onnx += np.sum(np.equal(ort_preds, labels_np))

  # abs errors
  # Subtract NumPy from NumPy: mixing a (possibly CUDA) torch tensor with an
  # ndarray in one expression is not reliable.
  pt_to_conv_tot_abs_error += np.sum(np.abs(pt_outs_np - ort_converted_outs))
  conv_to_quant_tot_abs_error += np.sum(np.abs(ort_converted_outs - ort_quantized_outs))

print("\n")

print(f"pytorch   top-1 acc = {100.0 * correct_pt/len(test_ds)} with {correct_pt} correct samples")
print(f"converted top-1 acc = {100.0 * correct_converted_onnx/len(test_ds)} with {correct_converted_onnx} correct samples")
print(f"quantized top-1 acc = {100.0 * correct_quantized_onnx/len(test_ds)} with {correct_quantized_onnx} correct samples")

# Note: these are absolute errors summed over all outputs of a sample and
# averaged per sample (divided by the number of samples, not of elements).
mae = pt_to_conv_tot_abs_error/(len(test_ds))
print(f"pt_to_conv: mean abs error = {mae} with total abs error {pt_to_conv_tot_abs_error}")

mae = conv_to_quant_tot_abs_error/(len(test_ds))
print(f"conv_to_quant: mean abs error = {mae} with total abs error {conv_to_quant_tot_abs_error}")

In [None]:
#@title timing with torch.benchmark
import torch.utils.benchmark as benchmark

results = []
label = "Average Inference Times"

num_runs = 100
# Other sizes worth sweeping: [1, 64, 128, 512]
batch_sizes = [32]


def timing(t):
  """Run the given benchmark.Timer and return its Measurement."""
  # return t.timeit(num_runs)
  return t.blocked_autorange(min_run_time=4)

# The eval-setup cell moves the model to the GPU when one is available, so the
# PyTorch input must live on the same device as the weights.
model_device = next(model.parameters()).device

with mlflow.start_run():
  # Params are write-once per run: log the whole sweep once instead of
  # re-logging "batch_size" with a different value on every iteration.
  mlflow.log_param("batch_sizes", batch_sizes)

  for batch_size in batch_sizes:
    data = torch.randn((batch_size, 3, 32, 32))
    sub_label = f"Batch Size: {batch_size}"

    # (description shown in the comparison table, metric name, stmt, setup, timer globals)
    backends = [
      ("Pytorch", "pytorch_median_time_s",
       'run_pytorch(model, data)', 'from __main__ import run_pytorch',
       {'model': model, 'data': data.to(model_device)}),
      ("Onnx Converted", "onnx_converted_median_time_s",
       'run_onnx(session, data)', 'from __main__ import run_onnx',
       {'session': ort_converted_sess, 'data': data}),
      ("Onnx Quantized", "onnx_quantized_median_time_s",
       'run_onnx(session, data)', 'from __main__ import run_onnx',
       {'session': ort_quantized_sess, 'data': data}),
    ]

    for description, metric_name, stmt, setup, timer_globals in backends:
      t = benchmark.Timer(
        stmt=stmt,
        setup=setup,
        globals=timer_globals,
        num_threads=num_threads,
        label=label,
        sub_label=sub_label,
        description=description,
      )
      results.append(timing(t))
      # log_metric needs a number (not the Measurement's string form); one
      # metric per backend, with the batch size as the step axis, keeps the
      # three series from overwriting each other.
      mlflow.log_metric(metric_name, results[-1].median, step=batch_size)

  compare = benchmark.Compare(results)
  compare.print()

In [None]:
#@title timing with perf_counter
import time

results = []
num_runs = 100
batch_size = 32
data = torch.zeros((batch_size, 3, 32, 32))

def perf_benchmark(model, input_data, eval_func, num_runs=num_runs):
    """Time ``eval_func(model, input_data)`` with ``time.perf_counter``.

    The call is repeated ``num_runs`` times; the default is the value the
    module-level ``num_runs`` had when this function was defined.

    Returns:
        ``(mean, std)`` of the per-call wall-clock durations, in seconds.
    """
    def _timed_call():
        # Wall-clock duration of a single evaluation.
        t0 = time.perf_counter()
        eval_func(model, input_data)
        t1 = time.perf_counter()
        return t1 - t0

    durations = [_timed_call() for _ in range(num_runs)]
    return np.mean(durations), np.std(durations)

print("perf_counter")

# The eval-setup cell may have moved the model to the GPU; the PyTorch input has
# to be on the same device as the weights. The ONNX sessions take the CPU tensor.
# NOTE(review): on CUDA, perf_counter around an un-synchronized call may
# under-report the PyTorch time -- confirm before comparing GPU numbers.
pytorch_data = data.to(next(model.parameters()).device)

# (printed name, object to evaluate, evaluation helper, input)
for backend_name, target, runner, inputs in (
    ("pytorch", model, run_pytorch, pytorch_data),
    ("converted", ort_converted_sess, run_onnx, data),
    ("quantized", ort_quantized_sess, run_onnx, data),
):
  print(backend_name)
  mean, std = perf_benchmark(target, inputs, runner)
  print('mean: {}ms, std: {}ms'.format(mean * 1000, std * 1000))

In [None]:
#@title callgrind collection with torch.benchmark
import os

# Collect callgrind statistics for the PyTorch model and both ONNX sessions at
# several batch sizes. `benchmark`, `num_threads`, `model`, the two sessions and
# the run_* helpers are not defined in this cell; they come from earlier cells.
results = []
label = "Average Inference Times"

# NOTE: num_runs is not used anywhere in this cell.
num_runs = 100
batch_sizes = [1, 64, 128, 512]
# batch_sizes = [1]

# NOTE: this rebinds the name `timing`, which the torch.benchmark timing cell
# also defines (there it wraps blocked_autorange).
def timing(t):
  """Run callgrind collection on the given benchmark.Timer and return the result."""
  return t.collect_callgrind()

for batch_size in batch_sizes:
  data = torch.randn((batch_size, 3, 32, 32))
  np_data = to_numpy(data)
  sub_label = f"Batch Size: {batch_size}"

  # pytorch
  t = benchmark.Timer(
    stmt = 'run_pytorch(model, data)',
    setup = 'from __main__ import run_pytorch',
    globals={'model': model, 'data': data},
    num_threads=num_threads,
    label=label,
    sub_label=sub_label,
    description="Pytorch",
  )
  results.append(timing(t))

  # converted
  # NOTE(review): `data` is bound to np_data here, and run_onnx passes it through
  # to_numpy, which reads `.requires_grad` -- that looks like an AttributeError
  # for a NumPy array; confirm (the timing cell passes the torch tensor instead).
  t = benchmark.Timer(
    stmt = 'run_onnx(session, data)',
    setup = 'from __main__ import run_onnx',
    globals={'session': ort_converted_sess, 'data': np_data},
    num_threads=num_threads,
    label=label,
    sub_label=sub_label,
    description="Onnx Converted",
  )
  results.append(timing(t))

  # quantized (same np_data caveat as the converted case)
  t = benchmark.Timer(
    stmt = 'run_onnx(session, data)',
    setup = 'from __main__ import run_onnx',
    globals={'session': ort_quantized_sess, 'data': np_data},
    num_threads=num_threads,
    label=label,
    sub_label=sub_label,
    description="Onnx Quantized"
  )
  results.append(timing(t))

# Print each collected result in collection order
# (pytorch, converted, quantized for every batch size).
for result in results:
  print(result)

In [None]:
#@title torch profiler
import torch.profiler

batch_size = 32
data = torch.zeros((batch_size, 3, 32, 32))

# Five profiler steps in total: 1 skipped (wait), 1 warm-up, 3 recorded.
profile_schedule = torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1)

with torch.profiler.profile(
    schedule=profile_schedule,
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
  for step_index in range(5):
    run_pytorch(model, data)
    # tell the profiler one step of the schedule has finished
    prof.step()

  # full per-operator table, no row limit
  print(prof.key_averages().table(row_limit=-1))
  # print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))


In [None]:
#@title onnx profiler
import onnxruntime as rt
import json
import pprint


ort_provider = ['CUDAExecutionProvider'] if torch.cuda.is_available() else ['CPUExecutionProvider']

batch_size = 32
data = torch.zeros((batch_size, 3, 32, 32))

# Profile each model in its own throw-away session:
#  * the "converted" profile previously loaded the quantized file (copy-paste
#    slip), so both profiles described the same model;
#  * the global ort_converted_sess / ort_quantized_sess are no longer rebound
#    to profiling sessions, so re-running a benchmark cell afterwards still
#    measures the sessions it was meant to measure.
for profile_name, model_path in (
    ("converted", converted_model_path),
    ("quantized", onnx_q_model_path),
):
  options = rt.SessionOptions()
  options.enable_profiling = True
  # distinct prefix so the two profile files can be told apart on disk
  options.profile_file_prefix = f"onnxruntime_profile_{profile_name}"

  profiling_sess = ort.InferenceSession(model_path, sess_options=options, providers=ort_provider)
  run_onnx(profiling_sess, data)
  # end_profiling() stops profiling and returns the name of the profile file
  profile_file = profiling_sess.end_profiling()
  print(profile_name)
  print(profile_file)
  # with open(profile_file, 'r') as f:
  #   pprint.pp(json.load(f))

In [None]:
# netron viewer
import os
import torch
import netron
import portpicker
from google.colab import output

# model should come from another block
# output_path = "/content/output.pth"
# torch.save(model.state_dict(), output_path)
# Pick a free port first, then point netron at the converted ONNX model.
port = portpicker.pick_unused_port()
output_path = converted_model_path

# Start the netron server for the model file; anything it prints while
# starting goes to a temporary output area.
with output.temporary():
  netron.start(output_path, port, browse=True)

# Embed the netron UI served on that port in the notebook.
output.serve_kernel_port_as_iframe(port, height='800')