In [1]:
import onnx 
import torch 

import brevitas.onnx as bo

import inspect
import netron
from IPython.display import IFrame

def showSrc(what):
    """Print the source code of ``what`` as reported by ``inspect``."""
    source_text = inspect.getsource(what)
    print(source_text)

def showInNetron(model_filename, port=8081, localhost_url="localhost"):
    """Serve ``model_filename`` with Netron and embed the viewer in the notebook.

    Args:
        model_filename: Path of the model file handed to ``netron.start``.
        port: TCP port the Netron server listens on (default 8081).
        localhost_url: Host name the *browser* uses to reach the server.

    Returns:
        An ``IFrame`` pointing at the running Netron server.
    """
    # Bind on all interfaces so the server stays reachable when the port is
    # forwarded (e.g. out of a container -- assumption, adjust if not needed).
    netron.start(model_filename, address=("0.0.0.0", port))
    # 0.0.0.0 is only meaningful as a bind address; the browser needs a real
    # host name to connect to, so the IFrame URL uses ``localhost_url`` instead.
    return IFrame(src=f"http://{localhost_url}:{port}/", width="100%", height=400)

The ONNX model exported from Brevitas is loaded with the following lines (adjust `build_dir` and the file name to point at your own export):

In [2]:
from qonnx.core.modelwrapper import ModelWrapper

# Directory holding the exported model; later paths are built as
# build_dir + "/<name>", so it must NOT end with a slash (otherwise every
# path becomes "./yolo//<name>").
build_dir = "./yolo"

# Model exported from Brevitas, wrapped for the graph transformations below.
model_file_path = build_dir + "/best.finn.onnx"
model_for_sim = ModelWrapper(model_file_path)

                i.e. domain=finn to domain=qonnx.custom_op.<general|fpgadataflow|...>


Graph-based transformations and optimizations are applied to the ONNX model, the input preprocessing is merged into it, and the final model is saved:

In [3]:
from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.infer_datatypes import InferDataTypes
from qonnx.transformation.fold_constants import FoldConstants

# Tidy-up passes, applied strictly in this order. Each pass is instantiated
# right before it runs, and the model returned by transform() replaces
# the previous one.
tidy_up_passes = (
    InferShapes,
    FoldConstants,
    GiveUniqueNodeNames,
    GiveReadableTensorNames,
    InferDataTypes,
    RemoveStaticGraphInputs,
)
for make_pass in tidy_up_passes:
    model_for_sim = model_for_sim.transform(make_pass())

In [4]:
from finn.util.pytorch import ToTensor, NormalizePreProc
from qonnx.transformation.merge_onnx_models import MergeONNXModels
from qonnx.core.datatype import DataType

# The preprocessing graph is exported with the same input shape as the core model.
global_inp_name = model_for_sim.graph.input[0].name
ishape = model_for_sim.get_tensor_shape(global_inp_name)

# preprocessing: ToTensor divides uint8 inputs by 255 (as torchvision's ToTensor does).
# NormalizePreProc stays imported for experimenting with mean/std normalization
# instead; it is not used in this flow.
totensor_pyt = ToTensor()

# Export the preprocessing to its own checkpoint so this intermediate file is
# not the same path as the final merged model saved at the end of the cell.
chkpt_preproc_name = build_dir + "/yolo_preproc.onnx"
bo.export_qonnx(totensor_pyt, export_path=chkpt_preproc_name, input_shape=ishape)

# join preprocessing and core model
pre_model = ModelWrapper(chkpt_preproc_name)
model_for_sim = model_for_sim.transform(MergeONNXModels(pre_model))

# Re-read the graph input name after the merge and annotate that tensor as
# UINT8, matching the uint8 input the preprocessing expects.
global_inp_name = model_for_sim.graph.input[0].name
model_for_sim.set_tensor_datatype(global_inp_name, DataType["UINT8"])

model_for_sim.save(build_dir + "/yolo_pre_post_tidy.onnx")
#showInNetron(build_dir+"/yolo_pre_post_tidy.onnx")



In [5]:
# Re-run the tidy-up passes on the merged model, strictly in this order.
# Each pass is instantiated right before it runs.
post_merge_passes = (
    InferShapes,
    FoldConstants,
    GiveUniqueNodeNames,
    GiveReadableTensorNames,
    InferDataTypes,
    RemoveStaticGraphInputs,
)
for make_pass in post_merge_passes:
    model_for_sim = model_for_sim.transform(make_pass())

# Persist the tidied model; this is the file handed to the build below.
tidy_model_path = build_dir+"/yolo_pre_post_tidy.onnx"
model_for_sim.save(tidy_model_path)
#showInNetron(build_dir+"/yolo_pre_post_tidy.onnx")

Now, the model is ready for bitstream synthesis!

Before synthesis, a “folding_config.json” file must be created in the build directory; it controls resource allocation and parallelism. The parameters in this file can be tuned to obtain lower latency and higher throughput. The file should look like this:

The following lines run the build that generates the .bit and .hwh files:

In [6]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg

# No trailing slash: paths below are built as build_dir + "/<name>", and a
# trailing slash here is what produced "./yolo//..." in the build log.
build_dir = "./yolo"

# Tidied model (preprocessing merged in) saved by the previous cells.
model_file = build_dir + "/yolo_pre_post_tidy.onnx"

final_output_dir = build_dir + "/output"

cfg = build.DataflowBuildConfig(
    output_dir          = final_output_dir,
    # Hand-written folding; the build log shows the folding config being
    # applied after the target_fps parallelization step.
    # NOTE(review): presumably its values take precedence over target_fps -- confirm in the FINN docs.
    folding_config_file = build_dir + "/folding_config.json",
    auto_fifo_depths    = False,
    target_fps          = 100000,
    synth_clk_period_ns = 10.0,  # 10 ns period, i.e. a 100 MHz clock
    board               = "KV260_SOM",
    shell_flow_type     = build_cfg.ShellFlowType.VIVADO_ZYNQ,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.BITFILE,
        build_cfg.DataflowOutputType.PYNQ_DRIVER,
        build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,
    ]
)

# Left as the cell's last expression so the build's return value is displayed.
build.build_dataflow_cfg(model_file, cfg)

Building dataflow accelerator from ./yolo//yolo_pre_post_tidy.onnx
Intermediate outputs will be generated in /tmp/finn_dev_hao
Final outputs will be generated in ./yolo//output
Build log is at ./yolo//output/build_dataflow.log
Running step: step_qonnx_to_finn [1/18]
Running step: step_tidy_up [2/18]
Running step: step_streamline [3/18]
Running step: step_convert_to_hls [4/18]
Running step: step_create_dataflow_partition [5/18]
Running step: step_target_fps_parallelization [6/18]
Running step: step_apply_folding_config [7/18]
Running step: step_minimize_bit_width [8/18]
Running step: step_generate_estimate_reports [9/18]
Running step: step_hls_codegen [10/18]
Running step: step_hls_ipgen [11/18]
Running step: step_set_fifo_depths [12/18]
Running step: step_create_stitched_ip [13/18]
Running step: step_measure_rtlsim_performance [14/18]
Running step: step_out_of_context_synthesis [15/18]
Running step: step_synthesize_bitfile [16/18]
Running step: step_make_pynq_driver [17/18]
Running ste

0