## ONNX-MLIR
本範例示範如何將鳶尾花邏輯迴歸分類器的 ONNX 模型透過 ONNX-MLIR 編譯為動態連結庫（.so），並使用 C++ 進行推論。

### 準備鳶尾花邏輯回歸分類器的 ONNX 模型

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Fetch the iris dataset and separate the feature matrix from the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing; the split is stratified on the
# labels and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Two-step pipeline: feature standardisation followed by logistic regression.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000)),
]
model = Pipeline(pipeline_steps)

# Fit both steps on the training split.
model.fit(X_train, y_train)

In [2]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Describe the single model input: a float tensor named 'float_input' with an
# unspecified first dimension and one column per feature of X.
n_features = X.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Export the fitted pipeline to ONNX, targeting opset 9.
onnx_model = convert_sklearn(
    model,
    initial_types=initial_type,
    target_opset=9,
)

### 使用 Hummingbird 將模型轉換為 ONNX 格式

In [3]:
# ! pip install onnxruntime==1.19.2 onnx==1.16.1

In [4]:
from hummingbird.ml import convert

# Pass the ONNX model produced by skl2onnx through Hummingbird, selecting its
# 'onnx' backend.
hb_model = convert(onnx_model, backend='onnx')

# Persist the converted model. save() is the last expression of the cell, so
# its return value is what the notebook displays.
hb_model.save('iris_logistic_regression_torch')

Model saved with digest: 1609dbcba26491d9bb02bec919f95901ba2140e5


'1609dbcba26491d9bb02bec919f95901ba2140e5'

### 使用 ONNX-MLIR 將模型編譯轉換為動態連結庫

In [22]:
# Unpack iris_logistic_regression_torch.zip into dist/ (-o overwrites files that
# already exist there).
!unzip -o iris_logistic_regression_torch.zip -d dist
# Compile the extracted ONNX model with onnx-mlir; --EmitLib requests a shared
# library as the output artifact.
!../onnx-mlir/Release/bin/onnx-mlir --EmitLib dist/deploy_model.onnx 

Archive:  iris_logistic_regression_torch.zip
  inflating: dist/container.pkl      
  inflating: dist/deploy_model.onnx  
  inflating: dist/model_configuration.txt  
  inflating: dist/model_type.txt     
[1/6] Sat Nov 16 11:50:02 2024 (0s) Importing ONNX Model to MLIR Module from "deploy_model.onnx"
[2/6] Sat Nov 16 11:50:02 2024 (0s) Compiling and Optimizing MLIR Module
[3/6] Sat Nov 16 11:50:02 2024 (0s) Translating MLIR Module to LLVM and Generating LLVM Optimized Bitcode
[4/6] Sat Nov 16 11:50:02 2024 (0s) Generating Object from LLVM Bitcode
[5/6] Sat Nov 16 11:50:03 2024 (1s) Linking and Generating the Output Shared Library
[6/6] Sat Nov 16 11:50:03 2024 (1s) Compilation completed


### 編寫 C++ 程式以載入並執行模型

In [34]:
# Build inference.cpp together with the shared library generated by onnx-mlir,
# adding the onnx-mlir headers to the include path, then run the resulting binary.
!g++ --std=c++17 inference.cpp dist/deploy_model.so -o main -I../onnx-mlir/include
!./main

模型输出：2.48873e-05 0.00856126 0.991414 


In [33]:
# Invoke the system linker directly on the generated shared library.
# NOTE(review): the recorded output is a linker error about a missing `_main`
# entry point; if the goal was to inspect the library, a different tool is
# probably intended — confirm.
!ld dist/deploy_model.so

Undefined symbols for architecture unknown:
  "_main", referenced from:
     implicit entry/start for main executable
ld: symbol(s) not found for architecture unknown


## TVM

In [None]:
!pip install apache-tvm

### 測試

In [164]:
import onnx
from onnx import helper, shape_inference

# Load the ONNX model that was extracted into dist/.
onnx_model = onnx.load('dist/deploy_model.onnx')

# Pin every graph input to the static shape [1, 4] (one sample, four features).
# The loop variable is deliberately not called `input`, so the builtin stays usable.
for graph_input in onnx_model.graph.input:
    dims = graph_input.type.tensor_type.shape.dim
    dims[0].dim_value = 1
    dims[1].dim_value = 4

# Re-run shape inference so the new input shape is propagated through the graph.
model = shape_inference.infer_shapes(onnx_model)

# Save the fixed-shape model next to the notebook.
onnx.save(model, 'iris_logistic_regression_fixed.onnx')


In [165]:
import onnx

# Reload the fixed-shape ONNX model and list the name and shape of each graph input.
model = onnx.load("iris_logistic_regression_fixed.onnx")
print("Model inputs:")
# The loop variable is named `graph_input` so the builtin `input` is not shadowed
# for the rest of the kernel session.
for graph_input in model.graph.input:
    # Dimensions whose dim_value is not positive are shown as '?'.
    dims = [dim.dim_value if dim.dim_value > 0 else '?' for dim in graph_input.type.tensor_type.shape.dim]
    print(f"Name: {graph_input.name}, Shape: {dims}")


Model inputs:
Name: float_input, Shape: [1, 4]


In [181]:
# Compile the fixed-shape ONNX model with the tvmc CLI using the "c" target and
# write the result to model.tar.
!tvmc compile iris_logistic_regression_fixed.onnx --target="c" --output model.tar 

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
[21:44:46] /Users/runner/work/tlcpack/tlcpack/tvm/src/te/schedule/bound.cc:119: not in feed graph consumer = compute(p0_red_temp, body=[reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lhs_1], rhs=[argmax_rhs_0, argmax_rhs_1], identity_element=[-1, -3.40282e+38f]), source=[k1, p0[ax0, k1]], init=[], axis=[iter_var(k1, range(min=0, ext=3))], where=(bool)1, value_index=0), reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lh

### 使用 Scikit-learn 訓練分類模型（隨機森林）

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the iris dataset.
iris = load_iris()
X, y = iris.data, iris.target

# Stratified 80/20 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Pipeline: feature standardisation followed by a random-forest classifier.
# The forest is seeded so that re-running the cell trains the same model.
# To train the logistic-regression variant instead, replace the classifier step
# with LogisticRegression(max_iter=1000).
model = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline on the training split.
model.fit(X_train, y_train)

In [5]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Single float input named 'float_input': unspecified first dimension, one
# column per feature of X.
n_features = X.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Export the fitted pipeline to ONNX, targeting opset 9.
onnx_model = convert_sklearn(
    model,
    initial_types=initial_type,
    target_opset=9,
)

# Serialise the model and write the bytes to disk.
onnx_bytes = onnx_model.SerializeToString()
with open("iris_logistic_regression.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_bytes)

### 使用 Hummingbird 將模型轉換為 ONNX/TVM 格式

In [6]:
from hummingbird.ml import convert

# Convert the ONNX model with Hummingbird's "tvm" backend; the full feature
# matrix X is supplied as the test input.
model_tvm = convert(onnx_model, "tvm", test_input=X)

# Display the resulting container as the cell output.
model_tvm

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
[21:55:16] /Users/runner/work/tlcpack/tlcpack/tvm/src/te/schedule/bound.cc:119: not in feed graph consumer = compute(p0_red_temp, body=[reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lhs_1], rhs=[argmax_rhs_0, argmax_rhs_1], identity_element=[-1, -3.40282e+38f]), source=[k1, p0[ax0, k1]], init=[], axis=[iter_var(k1, range(min=0, ext=3))], where=(bool)1, value_index=0), reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lh

<hummingbird.ml.containers.sklearn.tvm_containers.TVMSklearnContainerClassification at 0x1394000d0>

In [7]:
from hummingbird.ml import convert

# Pass the ONNX model produced by skl2onnx through Hummingbird, selecting its
# 'onnx' backend.
hb_model = convert(onnx_model, backend='onnx')

# Persist the converted model. save() is the last expression of the cell, so
# its return value is what the notebook displays.
hb_model.save('iris_logistic_regression_torch')

Model saved with digest: 7dc10ec35a1b17191d02c981934b6b41b8884137


'7dc10ec35a1b17191d02c981934b6b41b8884137'

## 使用 TVM 將 ONNX 模型編譯為 C 程式碼

In [1]:
!export CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=x86_64-linux-gnu-gcc

In [2]:
import tvm
from tvm import relay
from tvm.contrib import cc, utils
from tvm.contrib import graph_executor
import onnx
import sys

# Temporarily report the platform as Linux so that export_library does not add
# the macOS-only `-undefined dynamic_lookup` linker flag when it invokes a Linux
# cross compiler. The original value is restored in `finally`, so a failure in
# any step cannot leave sys.platform patched for the rest of the session.
original_platform = sys.platform
sys.platform = "linux"
try:
    # Load the ONNX model.
    onnx_model = onnx.load("dist/deploy_model.onnx")

    # Convert the ONNX model to a Relay module with a fixed (1, 4) input.
    input_name = 'float_input'  # must match the input name stored in the ONNX model
    shape_dict = {input_name: (1, 4)}
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

    # Build with the LLVM backend and an aarch64 Linux host triple.
    # Alternatives tried earlier: host triple x86_64-linux-gnu (with
    # x86_64-linux-gnu-gcc) and x86_64-apple-darwin (with clang).
    # NOTE(review): only `host` carries the aarch64 triple; confirm whether the
    # main target also needs `-mtriple=aarch64-linux-gnu` for cross-compilation.
    target = tvm.target.Target("llvm", host="llvm -mtriple=aarch64-linux-gnu")
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target, params=params)

    # Link the compiled module into a shared library using the cross compiler.
    lib.export_library("output.so", cc="aarch64-linux-gnu-gcc")
finally:
    # Restore the real platform string.
    sys.platform = original_platform

[21:12:10] /Users/runner/work/tlcpack/tlcpack/tvm/src/te/schedule/bound.cc:119: not in feed graph consumer = compute(p0_red_temp, body=[reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lhs_1], rhs=[argmax_rhs_0, argmax_rhs_1], identity_element=[-1, -3.40282e+38f]), source=[k1, p0[ax0, k1]], init=[], axis=[iter_var(k1, range(min=0, ext=3))], where=(bool)1, value_index=0), reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lhs_1], rhs=[argmax_rhs_0, argmax_rhs_1], identity_element=[-1, -3.40282e+38f]), source=[k1, p0[ax0, k1]], init=[], axis=[iter_var(k1, ra

若要在 macOS 上進行跨平台編譯，需注意：在 macOS 上呼叫 TVM 的 `export_library` 函數時，預設會加入 `-undefined dynamic_lookup` 連結器標誌，這會導致連結器報錯：
> Command line: x86_64-linux-gnu-gcc -shared -fPIC -undefined dynamic_lookup -o output.so 

遇到的錯誤是由於 macOS 特有的連結器標誌 `-undefined dynamic_lookup` 被傳遞給了 Linux 交叉編譯器，導致編譯失敗。為了解決這個問題，可以暫時修改 `sys.platform`，讓 TVM 認為正在 Linux 上執行：

```py
import sys

original_platform = sys.platform
sys.platform = "linux"
```

In [4]:
!g++ -std=c++17 -o main test.cpp \
    -I/Users/yilintsai/Documents/tvm/include \
    -I/Users/yilintsai/Documents/tvm/3rdparty/dlpack/include \
    -I/Users/yilintsai/Documents/tvm/3rdparty/dmlc-core/include \
    -ltvm_runtime -ldl -pthread \
    -Wno-macro-redefined

In [5]:
# Run the `main` binary from the current directory.
!./main

Prediction Label: 2
Prediction Probabilities: [2.48873e-05, 0.00856126, 0.991414]


In [7]:
import tvm

# Show where the tvm package used by this kernel is installed. Importing it
# in-process avoids `!python`, which resolves through PATH and may be a different
# interpreter than the one running the notebook.
print(tvm.__file__)

/Users/yilintsai/anaconda3/envs/onnx-mlir/lib/python3.10/site-packages/tvm/__init__.py


#### 無法順利產生 .so，因此直接使用 TVM 進行推論

In [1]:
import onnx
import tvm
from tvm import relay
from tvm.contrib import graph_executor

# Read the ONNX model from disk.
onnx_model = onnx.load("dist/deploy_model.onnx")

# Import it into Relay with a fixed (1, 4) shape bound to the 'float_input' input.
input_name = 'float_input'
shape_dict = {input_name: (1, 4)}
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

# Build for the "llvm" target at optimisation level 3.
target = "llvm"
pass_ctx = tvm.transform.PassContext(opt_level=3)
with pass_ctx:
    lib = relay.build(mod, target=target, params=params)

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.
[22:00:42] /Users/runner/work/tlcpack/tlcpack/tvm/src/te/schedule/bound.cc:119: not in feed graph consumer = compute(p0_red_temp, body=[reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lhs_1], rhs=[argmax_rhs_0, argmax_rhs_1], identity_element=[-1, -3.40282e+38f]), source=[k1, p0[ax0, k1]], init=[], axis=[iter_var(k1, range(min=0, ext=3))], where=(bool)1, value_index=0), reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lh

In [8]:
import numpy as np

# Wrap the compiled library in a graph executor that runs on CPU device 0.
module = graph_executor.GraphModule(lib["default"](tvm.cpu(0)))

# Bind a single sample (four float32 features) to the model input.
input_data = np.array([[6.3, 3.3, 6. , 2.5]], dtype=np.float32)
module.set_input(input_name, input_data)

# Run inference.
module.run()

# Fetch output index 1 as a NumPy array. In the recorded run this output is a
# (1, 3) array, presumably the per-class probabilities, while output index 0 has
# shape (1,).
tvm_output = module.get_output(1).asnumpy()

# Print the inference result.
print(tvm_output)

[[2.4887262e-05 8.5612610e-03 9.9141383e-01]]


In [9]:
# 打印輸出形狀
tvm_output = module.get_output(0).asnumpy()
print("Output shape:", tvm_output.shape)

Output shape: (1,)


### 使用 TVM 產生 C 程式碼

In [3]:
import tvm
from tvm import relay
from tvm.contrib import cc
import onnx

# Load the ONNX model
onnx_model = onnx.load("dist/deploy_model.onnx")

# Convert the ONNX model to a Relay module with a fixed (1, 4) input
input_name = 'float_input'
shape_dict = {input_name: (1, 4)}
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

# Select the "c" target
target = tvm.target.Target("c")

with tvm.transform.PassContext(opt_level=1):
    # Apply the SimplifyInference pass, then build the Relay module.
    # NOTE(review): the recorded output for this cell ends with
    # "TVMError: Unresolved call Op(tir.fabs)" raised from codegen_c.cc, so the
    # export below was presumably never reached in that run.
    mod = relay.transform.SimplifyInference()(mod)
    lib = relay.build(mod, target=target, params=params)

# Export the built module to "model.c"
# NOTE(review): confirm that tvm.contrib.cc actually provides create_ccompiler.
lib.export_library("model.c", cc.create_ccompiler())


[23:17:30] /Users/runner/work/tlcpack/tlcpack/tvm/src/te/schedule/bound.cc:119: not in feed graph consumer = compute(p0_red_temp, body=[reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lhs_1], rhs=[argmax_rhs_0, argmax_rhs_1], identity_element=[-1, -3.40282e+38f]), source=[k1, p0[ax0, k1]], init=[], axis=[iter_var(k1, range(min=0, ext=3))], where=(bool)1, value_index=0), reduce(combiner=comm_reducer(result=[select(((argmax_lhs_1 > argmax_rhs_1) || ((argmax_lhs_1 == argmax_rhs_1) && (argmax_lhs_0 < argmax_rhs_0))), argmax_lhs_0, argmax_rhs_0), select((argmax_lhs_1 > argmax_rhs_1), argmax_lhs_1, argmax_rhs_1)], lhs=[argmax_lhs_0, argmax_lhs_1], rhs=[argmax_rhs_0, argmax_rhs_1], identity_element=[-1, -3.40282e+38f]), source=[k1, p0[ax0, k1]], init=[], axis=[iter_var(k1, ra

TVMError: Traceback (most recent call last):
  [bt] (8) 9   libtvm.dylib                        0x00000001208f4ecf tvm::NodeFunctor<void (tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<void (tvm::tir::Stmt const&)>*)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<void (tvm::tir::Stmt const&)>*) const + 303
  [bt] (7) 8   libtvm.dylib                        0x000000012190b973 tvm::codegen::CodeGenC::VisitStmt_(tvm::tir::BufferStoreNode const*) + 563
  [bt] (6) 7   libtvm.dylib                        0x0000000121901c89 tvm::codegen::CodeGenC::PrintExpr(tvm::PrimExpr const&) + 201
  [bt] (5) 6   libtvm.dylib                        0x000000012190f768 tvm::NodeFunctor<void (tvm::runtime::ObjectRef const&, tvm::tir::ExprFunctor<void (tvm::PrimExpr const&, std::__1::basic_ostream<char, std::__1::char_traits<char>>&)>*, std::__1::basic_ostream<char, std::__1::char_traits<char>>&)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::ExprFunctor<void (tvm::PrimExpr const&, std::__1::basic_ostream<char, std::__1::char_traits<char>>&)>*, std::__1::basic_ostream<char, std::__1::char_traits<char>>&) const + 312
  [bt] (4) 5   libtvm.dylib                        0x0000000121903c26 void tvm::codegen::PrintBinaryExpr<tvm::tir::AddNode>(tvm::tir::AddNode const*, char const*, std::__1::basic_ostream<char, std::__1::char_traits<char>>&, tvm::codegen::CodeGenC*) + 614
  [bt] (3) 4   libtvm.dylib                        0x000000012190f768 tvm::NodeFunctor<void (tvm::runtime::ObjectRef const&, tvm::tir::ExprFunctor<void (tvm::PrimExpr const&, std::__1::basic_ostream<char, std::__1::char_traits<char>>&)>*, std::__1::basic_ostream<char, std::__1::char_traits<char>>&)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::ExprFunctor<void (tvm::PrimExpr const&, std::__1::basic_ostream<char, std::__1::char_traits<char>>&)>*, std::__1::basic_ostream<char, std::__1::char_traits<char>>&) const + 312
  [bt] (2) 3   libtvm.dylib                        0x0000000121907352 tvm::codegen::CodeGenC::VisitExpr_(tvm::tir::CallNode const*, std::__1::basic_ostream<char, std::__1::char_traits<char>>&) + 1314
  [bt] (1) 2   libtvm.dylib                        0x000000012089e3f9 tvm::runtime::detail::LogFatal::Entry::Finalize() + 89
  [bt] (0) 1   libtvm.dylib                        0x000000012265d038 tvm::runtime::Backtrace() + 24
  File "/Users/runner/work/tlcpack/tlcpack/tvm/src/target/source/codegen_c.cc", line 608
TVMError: Unresolved call Op(tir.fabs)

In [None]:
import tvm
from tvm import relay
from tvm.contrib import cc, utils, graph_executor
import onnx
import numpy as np

# Read the ONNX model from disk.
onnx_model = onnx.load("dist/deploy_model.onnx")

# Import it into Relay with a fixed (1, 4) shape bound to the 'float_input' input.
input_name = 'float_input'
shape_dict = {input_name: (1, 4)}
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

# Build for the "c" target at optimisation level 3. Nothing is exported here.
target = "c"
pass_ctx = tvm.transform.PassContext(opt_level=3)
with pass_ctx:
    lib = relay.build(mod, target=target, params=params)


In [55]:
# Export the built module to "c_model.tar", using cc.create_shared as the
# compile function and g++ as the compiler.
lib.export_library(
    "c_model.tar",
    fcompile=cc.create_shared,
    cc="g++",
)

## 使用 ONNX Runtime 推論並比對結果

In [12]:
import numpy as np
import onnxruntime as ort

# Open an inference session on the ONNX model.
session = ort.InferenceSession('dist/deploy_model.onnx')

# Prepare the input: the name of the model's first input, and one sample with
# four float32 features.
input_name = session.get_inputs()[0].name
input_data = np.array([[6.3, 3.3, 6. , 2.5]], dtype=np.float32)

# Run inference. Passing None requests every output; index 1 is kept (in the
# recorded run it is a (1, 3) array, presumably the per-class probabilities).
pred_onnx = session.run(None, {input_name: input_data})[1]

# Print the prediction for comparison with the other runtimes.
print(pred_onnx)

[[2.4887262e-05 8.5612610e-03 9.9141383e-01]]
