Skip to content

Commit

Permalink
Merge branch 'main' into qnn_tensor_v2_support
Browse files Browse the repository at this point in the history
  • Loading branch information
HectorSVC committed May 1, 2024
2 parents 73d3d6b + f9febc4 commit ab0867c
Show file tree
Hide file tree
Showing 40 changed files with 520 additions and 155 deletions.
4 changes: 2 additions & 2 deletions cmake/deps.txt
Expand Up @@ -37,8 +37,8 @@ mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d
#use the commit of Final DDS removal. DDS output is now supported by ORT TRT.
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26
#use the latest commit of 10.0-GA
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/eb43908b02a296ea0594432f06e9d3fac288d672.zip;94d07871810a36a5bc70a1def5c50504101c9bd1
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
Expand Down
24 changes: 24 additions & 0 deletions cmake/patches/protobuf/protobuf_cmake.patch
Expand Up @@ -29,3 +29,27 @@ index 04cb3303a..4025805cf 100644
# When building with "make", "lib" prefix will be added automatically by
# the build tool.
set(LIB_PREFIX)
diff --git a/src/google/protobuf/map.h b/src/google/protobuf/map.h
index 008c19225..cbab108c2 100644
--- a/src/google/protobuf/map.h
+++ b/src/google/protobuf/map.h
@@ -52,7 +52,8 @@
#endif // defined(__cpp_lib_string_view)

#if !defined(GOOGLE_PROTOBUF_NO_RDTSC) && defined(__APPLE__)
-#include <mach/mach_time.h>
+// apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
+#include <time.h>
#endif

#include <google/protobuf/stubs/common.h>
@@ -1154,7 +1155,8 @@ class Map {
#if defined(__APPLE__)
// Use a commpage-based fast time function on Apple environments (MacOS,
// iOS, tvOS, watchOS, etc).
- s += mach_absolute_time();
+ // apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
+ s += clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
#elif defined(__x86_64__) && defined(__GNUC__)
uint32_t hi, lo;
asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
2 changes: 2 additions & 0 deletions onnxruntime/python/tools/tensorrt/perf/build/build_image.py
Expand Up @@ -19,6 +19,8 @@
"8.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5",
"8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6",
"8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6",
"10.0.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0",
"10.0.cuda_12_4_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0",
"BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin",
}

Expand Down
11 changes: 5 additions & 6 deletions onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
Expand Up @@ -1198,7 +1198,7 @@ TEST(MathOpTest, Sum_6) {
#if defined(OPENVINO_CONFIG_GPU)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO EP: Disabled due to accuracy mismatch for FP16
#else
test.Run();
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TRT10: Disabled due to segfault caused by older opset 6
#endif
}

Expand All @@ -1225,7 +1225,7 @@ TEST(MathOpTest, Sum_6_double) {
#if defined(OPENVINO_CONFIG_GPU)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // OpenVINO EP: Disabled due to accuracy mismatch for FP16
#else
test.Run();
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TRT10: Disabled due to segfault caused by older opset 6
#endif
}

Expand Down Expand Up @@ -1452,7 +1452,7 @@ TEST(MathOpTest, Min_6) {
{1.0f, 0.0f, 1.0f,
-3.0f, 1.1f, -100.0f,
-5.4f, 0.01f, -10000.0f});
test.Run();
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TRT10: Disabled due to segfault caused by older opset 6
}

TEST(MathOpTest, Min_8) {
Expand Down Expand Up @@ -1708,7 +1708,7 @@ TEST(MathOpTest, Max_6) {
{1.0f, 0.0f, 3.0f,
-1.0f, 3.3f, 64.0f,
5.4f, 0.03f, 10000.0f});
test.Run();
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TRT10: Disabled due to segfault caused by older opset 6
}

TEST(MathOpTest, Max_8_Float) {
Expand Down Expand Up @@ -2718,8 +2718,7 @@ TEST(MathOpTest, Mean_6) {
{1.0f, 0.0f, 2.0f,
-2.0f, 2.2f, 10.0f,
-3.0f, 0.02f, -4.0f});
// OpenVINO: Disabled due to accuracy mismatch
test.Run();
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TRT10: Disabled due to segfault caused by older opset 6
}

TEST(MathOpTest, Mean_8) {
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/test/providers/cpu/model_tests.cc
Expand Up @@ -425,7 +425,7 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
// The other EPs can choose which opsets to test.
// If an EP doesn't have any CI build pipeline, then there is no need to specify any opset.
#ifdef USE_TENSORRT
// tensorrt: only enable opset 14 to 17 of onnx tests
// tensorrt: only enable opsets 12 and 14 to 17 of onnx tests (opset 13 is not enabled)
provider_names[provider_name_tensorrt] = {opset12, opset14, opset15, opset16, opset17};
#endif
#ifdef USE_MIGRAPHX
Expand Down
12 changes: 8 additions & 4 deletions onnxruntime/test/providers/cpu/tensor/isnan_test.cc
Expand Up @@ -10,11 +10,15 @@ namespace onnxruntime {
namespace test {

template <typename T>
void run_is_nan_test(int opset, const std::vector<int64_t>& dims, const std::initializer_list<T>& input, const std::initializer_list<bool>& output) {
void run_is_nan_test(int opset, const std::vector<int64_t>& dims, const std::initializer_list<T>& input, const std::initializer_list<bool>& output, bool skip_trt = false) {
OpTester test("IsNaN", opset, kOnnxDomain);
test.AddInput<T>("X", dims, input);
test.AddOutput<bool>("Y", dims, output);
test.Run();
if (skip_trt) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
} else {
test.Run();
}
}

TEST(IsNaNOpTest, IsNaNFloat9) {
Expand Down Expand Up @@ -56,7 +60,7 @@ TEST(IsNaNOpTest, IsNaNBFloat16_20) {
std::vector<int64_t> dims{2, 2};
std::initializer_list<BFloat16> input = {BFloat16::One, BFloat16::NaN, BFloat16(2.0f), BFloat16::NaN};
std::initializer_list<bool> output = {false, true, false, true};
run_is_nan_test(20, dims, input, output);
run_is_nan_test(20, dims, input, output, true); // Skip TRT: TRT10 supports BF16, but the T4 GPUs used by the TRT CIs do not
}

TEST(IsNaNOpTest, IsNaNDouble9) {
Expand All @@ -78,7 +82,7 @@ TEST(IsNaNOpTest, IsNaNFloat8E4M3FN) {
std::vector<int64_t> dims{2, 2};
std::initializer_list<Float8E4M3FN> input = {Float8E4M3FN(1.0f), Float8E4M3FN(-NAN), Float8E4M3FN(2.0f), Float8E4M3FN(NAN)};
std::initializer_list<bool> output = {false, true, false, true};
run_is_nan_test(20, dims, input, output);
run_is_nan_test(20, dims, input, output, true); // No direct FP8 usage allowed without QDQ. Skip since TRT10
}

TEST(IsNaNOpTest, IsNaN_Float8E4M3FNUZ) {
Expand Down
Expand Up @@ -121,7 +121,7 @@ TEST(DequantizeLinearOpTest, Without_Zero_Point) {
test.AddInput<int8_t>("x", {}, {100});
test.AddInput<float>("x_scale", {}, {2.0f});
test.AddOutput<float>("y", {}, {200.0f});
test.Run();
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // No DQ allowed without corresponding Q. Skip since TRT10
}

// 1d zero & scale with default axis
Expand Down
31 changes: 18 additions & 13 deletions onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
Expand Up @@ -166,62 +166,67 @@ void RunTestWrapper() {
// OpTester's AddInput and AddOutput do not support std::vector<bool>.
void RunTestForBool(std::initializer_list<bool> input_data, std::initializer_list<int64_t> input_dims,
std::initializer_list<int64_t> repeats, std::initializer_list<int64_t> repeats_dims,
std::initializer_list<bool> output_data, std::initializer_list<int64_t> output_dims) {
std::initializer_list<bool> output_data, std::initializer_list<int64_t> output_dims,
bool skip_trt = false) {
OpTester test("Tile");
test.AddInput<bool>("input", input_dims, input_data);
test.AddInput<int64_t>("repeats", repeats_dims, repeats);
test.AddOutput<bool>("output", output_dims, output_data);
test.Run();
if (skip_trt) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
} else {
test.Run();
}
}

void RunTestWrapperForBool() {
// Tile1DWithZeroRepeats
RunTestForBool({true, false, true}, {3}, {0}, {1}, {}, {0});
RunTestForBool({true, false, true}, {3}, {0}, {1}, {}, {0}, true);

// Tile2DWithZeroRepeats
RunTestForBool({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0});
RunTestForBool({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0}, true);

// Tile1D
RunTestForBool({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9});
RunTestForBool({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9}, true);

// Tile2D_1Axis
RunTestForBool({true, false, true, false}, {2, 2}, {2, 1}, {2}, {true, false, true, false, true, false, true, false},
{4, 2});
{4, 2}, true);

// Tile2D_2Axes
RunTestForBool(
{true, false, true, false}, {2, 2}, {2, 2}, {2},
{true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4});
{true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4}, true);

// Tile3D
RunTestForBool({true, false, true, false, true, false}, {2, 1, 3}, {1, 2, 1}, {3},
{true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3});
{true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3}, true);

// Tile1DWithOneRepeats
RunTestForBool({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3},
{true, false, true, false, true, true}, {2, 1, 3});
{true, false, true, false, true, true}, {2, 1, 3}, true);

// TileWhichIsBasicallyCopiesOfInputBuffer - 1
// This will trigger the MemCpy optimization path
RunTestForBool({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3},
{true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
{true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}, true);

// TileWhichIsBasicallyCopiesOfInputBuffer - 2
// This will trigger the MemCpy optimization path
RunTestForBool({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3},
{true, false, true, true, false, true, true, false, true}, {3, 1, 3});
{true, false, true, true, false, true, true, false, true}, {3, 1, 3}, true);

// TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1)
// This will trigger the (Batched) MemCpy optimization path
RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3},
{true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
{true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}, true);

// TileWhichIsBasicallyCopiesOfInputBuffer - 4 (batch > 1 and batch_repeat > 1)
// This will trigger the (Batched) MemCpy optimization path
RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3},
{true, false, true, true, false, true, true, false, true, true, false, true,
true, false, true, true, false, true, true, false, true, true, false, true},
{4, 2, 3});
{4, 2, 3}, true);
}

TEST(TensorOpTest, TileFloatType) { RunTestWrapper<float>(); }
Expand Down
2 changes: 2 additions & 0 deletions tools/ci_build/github/apple/build_host_protoc.sh
Expand Up @@ -22,9 +22,11 @@ pushd .
mkdir -p "$PROTOC_BUILD_PATH"
cd "$PROTOC_BUILD_PATH"
DEP_FILE_PATH="$ORT_REPO_ROOT/cmake/deps.txt"
PATCH_FILE_PATH="$ORT_REPO_ROOT/cmake/patches/protobuf/protobuf_cmake.patch"
protobuf_url=$(grep '^protobuf' "$DEP_FILE_PATH" | cut -d ';' -f 2 | sed 's/\.zip$/\.tar.gz/')
curl -sSL --retry 5 --retry-delay 10 --create-dirs --fail -L -o protobuf_src.tar.gz "$protobuf_url"
tar -zxf protobuf_src.tar.gz --strip=1
patch --binary --ignore-whitespace -p1 < "$PATCH_FILE_PATH"
# The second 'cmake' is a folder name
cmake cmake \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
Expand Down
10 changes: 7 additions & 3 deletions tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
Expand Up @@ -57,7 +57,7 @@ variables:
- name: docker_base_image
value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8
- name: linux_trt_version
value: 8.6.1.6-1.cuda11.8
value: 10.0.1.6-1.cuda11.8
- name: Repository
value: 'onnxruntimecuda11manylinuxbuild'

Expand Down Expand Up @@ -293,10 +293,14 @@ stages:

- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
Context: tools/ci_build/github/linux/docker/
ScriptName: tools/ci_build/get_docker_image.py
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
DockerBuildArgs: "
--build-arg BUILD_UID=$( id -u )
--build-arg BASEIMAGE=${{ variables.docker_base_image }}
--build-arg TRT_VERSION=${{ variables.linux_trt_version }}
"
Repository: onnxruntimeubi8packagestest
UpdateDepsTxt: false

Expand Down
Expand Up @@ -90,9 +90,14 @@ variables:
value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
- name: linux_trt_version
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: 8.6.1.6-1.cuda11.8
value: 10.0.1.6-1.cuda11.8
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: 8.6.1.6-1.cuda12.0
value: 10.0.1.6-1.cuda12.4
- name: win_trt_version
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: 11.8
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: 12.4

stages:
- stage: Setup
Expand Down Expand Up @@ -217,11 +222,11 @@ stages:
buildArch: x64
msbuildPlatform: x64
packageName: x64-cuda
buildparameter: --use_cuda --cuda_home=$(Agent.TempDirectory)\v11.8 --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" ${{parameters.AdditionalBuildFlag}}
buildparameter: --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.CudaVersion }} --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" ${{parameters.AdditionalBuildFlag}}
runTests: ${{ parameters.RunOnnxRuntimeTests }}
buildJava: true
java_artifact_id: onnxruntime_gpu
CudaVersion: 11.8
CudaVersion: ${{ parameters.CudaVersion }}
SpecificArtifact: ${{ parameters.SpecificArtifact }}
BuildId: ${{ parameters.BuildId }}

Expand All @@ -235,11 +240,11 @@ stages:
buildArch: x64
msbuildPlatform: x64
packageName: x64-tensorrt
buildparameter: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
buildparameter: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-${{ variables.win_trt_version }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
runTests: ${{ parameters.RunOnnxRuntimeTests }}
buildJava: true
java_artifact_id: onnxruntime_gpu
CudaVersion: 11.8
CudaVersion: ${{ parameters.CudaVersion }}
UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}

# ROCm
Expand Down Expand Up @@ -464,9 +469,13 @@ stages:

- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
Context: tools/ci_build/github/linux/docker/
DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
DockerBuildArgs: "
--build-arg BUILD_UID=$( id -u )
--build-arg BASEIMAGE=${{ variables.docker_base_image }}
--build-arg TRT_VERSION=${{ variables.linux_trt_version }}
"
Repository: onnxruntimeubi8packagestest
UpdateDepsTxt: false

Expand Down Expand Up @@ -1050,6 +1059,7 @@ stages:
ArtifactSuffix: 'GPU'
StageSuffix: 'GPU'
NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu'
CudaVersion: ${{ parameters.CudaVersion }}
SpecificArtifact: ${{ parameters.specificArtifact }}
BuildId: ${{ parameters.BuildId }}

Expand All @@ -1060,6 +1070,7 @@ stages:
StageSuffix: 'GPU'
MoreSuffix: '_Linux'
NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu.Linux'
CudaVersion: ${{ parameters.CudaVersion }}
SpecificArtifact: ${{ parameters.specificArtifact }}
BuildId: ${{ parameters.BuildId }}

Expand Down
Expand Up @@ -68,14 +68,14 @@ variables:
value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
- name: linux_trt_version
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: 8.6.1.6-1.cuda11.8
value: 10.0.1.6-1.cuda11.8
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: 8.6.1.6-1.cuda12.0
value: 10.0.1.6-1.cuda12.4
- name: win_trt_home
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8
value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0
value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4
- name: win_cuda_home
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: $(Agent.TempDirectory)\v11.8
Expand Down
Expand Up @@ -62,9 +62,9 @@ variables:

- name: linux_trt_version
${{ if eq(parameters.CudaVersion, '11.8') }}:
value: 8.6.1.6-1.cuda11.8
value: 10.0.1.6-1.cuda11.8
${{ if eq(parameters.CudaVersion, '12.2') }}:
value: 8.6.1.6-1.cuda12.0
value: 10.0.1.6-1.cuda12.4

- name: Repository
${{ if eq(parameters.CudaVersion, '11.8') }}:
Expand Down

0 comments on commit ab0867c

Please sign in to comment.