Merge branch 'main' into qnn_tensor_v2_support

microsoft · May 1, 2024 · ab0867c · ab0867c
2 parents 73d3d6b + f9febc4
commit ab0867c
Show file tree

Hide file tree

Showing 40 changed files with 520 additions and 155 deletions.
diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -37,8 +37,8 @@ mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851
 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d
-#use the commit of Final DDS removal. DDS output is now supported by ORT TRT.
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26
+#use the latest commit of 10.0-GA
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/eb43908b02a296ea0594432f06e9d3fac288d672.zip;94d07871810a36a5bc70a1def5c50504101c9bd1
 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
 protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
 protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874

diff --git a/cmake/patches/protobuf/protobuf_cmake.patch b/cmake/patches/protobuf/protobuf_cmake.patch
@@ -29,3 +29,27 @@ index 04cb3303a..4025805cf 100644
    # When building with "make", "lib" prefix will be added automatically by
    # the build tool.
    set(LIB_PREFIX)
+diff --git a/src/google/protobuf/map.h b/src/google/protobuf/map.h
+index 008c19225..cbab108c2 100644
+--- a/src/google/protobuf/map.h
++++ b/src/google/protobuf/map.h
+@@ -52,7 +52,8 @@
+ #endif  // defined(__cpp_lib_string_view)
+
+ #if !defined(GOOGLE_PROTOBUF_NO_RDTSC) && defined(__APPLE__)
+-#include <mach/mach_time.h>
++// apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
++#include <time.h>
+ #endif
+
+ #include <google/protobuf/stubs/common.h>
+@@ -1154,7 +1155,8 @@ class Map {
+ #if defined(__APPLE__)
+       // Use a commpage-based fast time function on Apple environments (MacOS,
+       // iOS, tvOS, watchOS, etc).
+-      s += mach_absolute_time();
++      // apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
++      s += clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
+ #elif defined(__x86_64__) && defined(__GNUC__)
+       uint32_t hi, lo;
+       asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
@@ -19,6 +19,8 @@
     "8.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5",
     "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6",
     "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6",
+    "10.0.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0",
+    "10.0.cuda_12_4_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0",
     "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin",
 }
 

diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -1198,7 +1198,7 @@ TEST(MathOpTest, Sum_6) {
 #if defined(OPENVINO_CONFIG_GPU)
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});  // OpenVINO EP: Disabled due to accuracy mismatch for FP16
 #else
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TRT10: Disabled due to segfault caused by older opset 6
 #endif
 }
 
@@ -1225,7 +1225,7 @@ TEST(MathOpTest, Sum_6_double) {
 #if defined(OPENVINO_CONFIG_GPU)
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});  // OpenVINO EP: Disabled due to accuracy mismatch for FP16
 #else
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TRT10: Disabled due to segfault caused by older opset 6
 #endif
 }
 
@@ -1452,7 +1452,7 @@ TEST(MathOpTest, Min_6) {
                         {1.0f, 0.0f, 1.0f,
                          -3.0f, 1.1f, -100.0f,
                          -5.4f, 0.01f, -10000.0f});
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TRT10: Disabled due to segfault caused by older opset 6
 }
 
 TEST(MathOpTest, Min_8) {
@@ -1708,7 +1708,7 @@ TEST(MathOpTest, Max_6) {
                         {1.0f, 0.0f, 3.0f,
                          -1.0f, 3.3f, 64.0f,
                          5.4f, 0.03f, 10000.0f});
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TRT10: Disabled due to segfault caused by older opset 6
 }
 
 TEST(MathOpTest, Max_8_Float) {
@@ -2718,8 +2718,7 @@ TEST(MathOpTest, Mean_6) {
                         {1.0f, 0.0f, 2.0f,
                          -2.0f, 2.2f, 10.0f,
                          -3.0f, 0.02f, -4.0f});
-  // OpenVINO: Disabled due to accuracy mismatch
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TRT10: Disabled due to segfault caused by older opset 6
 }
 
 TEST(MathOpTest, Mean_8) {

diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
@@ -425,7 +425,7 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
   // The other EPs can choose which opsets to test.
   // If an EP doesn't have any CI build pipeline, then there is no need to specify any opset.
 #ifdef USE_TENSORRT
-  // tensorrt: only enable opset 14 to 17 of onnx tests
+  // tensorrt: only enable opset 12 and opsets 14 to 17 of onnx tests (opset 13 is not enabled)
   provider_names[provider_name_tensorrt] = {opset12, opset14, opset15, opset16, opset17};
 #endif
 #ifdef USE_MIGRAPHX

diff --git a/onnxruntime/test/providers/cpu/tensor/isnan_test.cc b/onnxruntime/test/providers/cpu/tensor/isnan_test.cc
@@ -10,11 +10,15 @@ namespace onnxruntime {
 namespace test {
 
 template <typename T>
-void run_is_nan_test(int opset, const std::vector<int64_t>& dims, const std::initializer_list<T>& input, const std::initializer_list<bool>& output) {
+void run_is_nan_test(int opset, const std::vector<int64_t>& dims, const std::initializer_list<T>& input, const std::initializer_list<bool>& output, bool skip_trt = false) {
   OpTester test("IsNaN", opset, kOnnxDomain);
   test.AddInput<T>("X", dims, input);
   test.AddOutput<bool>("Y", dims, output);
-  test.Run();
+  if (skip_trt) {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  } else {
+    test.Run();
+  }
 }
 
 TEST(IsNaNOpTest, IsNaNFloat9) {
@@ -56,7 +60,7 @@ TEST(IsNaNOpTest, IsNaNBFloat16_20) {
   std::vector<int64_t> dims{2, 2};
   std::initializer_list<BFloat16> input = {BFloat16::One, BFloat16::NaN, BFloat16(2.0f), BFloat16::NaN};
   std::initializer_list<bool> output = {false, true, false, true};
-  run_is_nan_test(20, dims, input, output);
+  run_is_nan_test(20, dims, input, output, true);  // Skip TRT: TRT10 supports BF16, but the T4 GPUs used by the TRT CIs do not
 }
 
 TEST(IsNaNOpTest, IsNaNDouble9) {
@@ -78,7 +82,7 @@ TEST(IsNaNOpTest, IsNaNFloat8E4M3FN) {
   std::vector<int64_t> dims{2, 2};
   std::initializer_list<Float8E4M3FN> input = {Float8E4M3FN(1.0f), Float8E4M3FN(-NAN), Float8E4M3FN(2.0f), Float8E4M3FN(NAN)};
   std::initializer_list<bool> output = {false, true, false, true};
-  run_is_nan_test(20, dims, input, output);
+  run_is_nan_test(20, dims, input, output, true);  // No direct FP8 usage allowed without QDQ. Skip since TRT10
 }
 
 TEST(IsNaNOpTest, IsNaN_Float8E4M3FNUZ) {

diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
@@ -121,7 +121,7 @@ TEST(DequantizeLinearOpTest, Without_Zero_Point) {
   test.AddInput<int8_t>("x", {}, {100});
   test.AddInput<float>("x_scale", {}, {2.0f});
   test.AddOutput<float>("y", {}, {200.0f});
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // No DQ allowed without corresponding Q. Skip since TRT10
 }
 
 // 1d zero & scale with default axis

diff --git a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
@@ -166,62 +166,67 @@ void RunTestWrapper() {
 // OpTester's AddInput and AddOutput do not support std::vector<bool>.
 void RunTestForBool(std::initializer_list<bool> input_data, std::initializer_list<int64_t> input_dims,
                     std::initializer_list<int64_t> repeats, std::initializer_list<int64_t> repeats_dims,
-                    std::initializer_list<bool> output_data, std::initializer_list<int64_t> output_dims) {
+                    std::initializer_list<bool> output_data, std::initializer_list<int64_t> output_dims,
+                    bool skip_trt = false) {
   OpTester test("Tile");
   test.AddInput<bool>("input", input_dims, input_data);
   test.AddInput<int64_t>("repeats", repeats_dims, repeats);
   test.AddOutput<bool>("output", output_dims, output_data);
-  test.Run();
+  if (skip_trt) {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  } else {
+    test.Run();
+  }
 }
 
 void RunTestWrapperForBool() {
   // Tile1DWithZeroRepeats
-  RunTestForBool({true, false, true}, {3}, {0}, {1}, {}, {0});
+  RunTestForBool({true, false, true}, {3}, {0}, {1}, {}, {0}, true);
 
   // Tile2DWithZeroRepeats
-  RunTestForBool({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0});
+  RunTestForBool({true, false, true, false}, {2, 2}, {2, 0}, {2}, {}, {4, 0}, true);
 
   // Tile1D
-  RunTestForBool({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9});
+  RunTestForBool({true, false, true}, {3}, {3}, {1}, {true, false, true, true, false, true, true, false, true}, {9}, true);
 
   // Tile2D_1Axis
   RunTestForBool({true, false, true, false}, {2, 2}, {2, 1}, {2}, {true, false, true, false, true, false, true, false},
-                 {4, 2});
+                 {4, 2}, true);
 
   // Tile2D_2Axes
   RunTestForBool(
       {true, false, true, false}, {2, 2}, {2, 2}, {2},
-      {true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4});
+      {true, false, true, false, true, false, true, false, true, false, true, false, true, false, true, false}, {4, 4}, true);
 
   // Tile3D
   RunTestForBool({true, false, true, false, true, false}, {2, 1, 3}, {1, 2, 1}, {3},
-                 {true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3});
+                 {true, false, true, true, false, true, false, true, false, false, true, false}, {2, 2, 3}, true);
 
   // Tile1DWithOneRepeats
   RunTestForBool({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3},
-                 {true, false, true, false, true, true}, {2, 1, 3});
+                 {true, false, true, false, true, true}, {2, 1, 3}, true);
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 1
   // This will trigger the MemCpy optimization path
   RunTestForBool({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3},
-                 {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
+                 {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}, true);
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 2
   // This will trigger the MemCpy optimization path
   RunTestForBool({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3},
-                 {true, false, true, true, false, true, true, false, true}, {3, 1, 3});
+                 {true, false, true, true, false, true, true, false, true}, {3, 1, 3}, true);
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1)
   // This will trigger the (Batched) MemCpy optimization path
   RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3},
-                 {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
+                 {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}, true);
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 4 (batch > 1 and batch_repeat > 1)
   // This will trigger the (Batched) MemCpy optimization path
   RunTestForBool({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3},
                  {true, false, true, true, false, true, true, false, true, true, false, true,
                   true, false, true, true, false, true, true, false, true, true, false, true},
-                 {4, 2, 3});
+                 {4, 2, 3}, true);
 }
 
 TEST(TensorOpTest, TileFloatType) { RunTestWrapper<float>(); }

diff --git a/tools/ci_build/github/apple/build_host_protoc.sh b/tools/ci_build/github/apple/build_host_protoc.sh
@@ -22,9 +22,11 @@ pushd .
 mkdir -p "$PROTOC_BUILD_PATH"
 cd "$PROTOC_BUILD_PATH"
 DEP_FILE_PATH="$ORT_REPO_ROOT/cmake/deps.txt"
+PATCH_FILE_PATH="$ORT_REPO_ROOT/cmake/patches/protobuf/protobuf_cmake.patch"
 protobuf_url=$(grep '^protobuf' "$DEP_FILE_PATH" | cut -d ';' -f 2 | sed 's/\.zip$/\.tar.gz/')
 curl -sSL --retry 5 --retry-delay 10 --create-dirs --fail -L -o protobuf_src.tar.gz "$protobuf_url"
 tar -zxf protobuf_src.tar.gz --strip=1
+patch --binary --ignore-whitespace -p1 < "$PATCH_FILE_PATH"
 # The second 'cmake' is a folder name
 cmake cmake \
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON \

diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -57,7 +57,7 @@ variables:
   - name: docker_base_image
     value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8
   - name: linux_trt_version
-    value: 8.6.1.6-1.cuda11.8
+    value: 10.0.1.6-1.cuda11.8
   - name: Repository
     value: 'onnxruntimecuda11manylinuxbuild'
 
@@ -293,10 +293,14 @@ stages:
 
     - template: templates/get-docker-image-steps.yml
       parameters:
-        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6
+        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
         Context: tools/ci_build/github/linux/docker/
         ScriptName: tools/ci_build/get_docker_image.py
-        DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
+        DockerBuildArgs: "
+        --build-arg BUILD_UID=$( id -u )
+        --build-arg BASEIMAGE=${{ variables.docker_base_image }}
+        --build-arg TRT_VERSION=${{ variables.linux_trt_version }}
+        "
         Repository: onnxruntimeubi8packagestest
         UpdateDepsTxt: false
 

diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
@@ -90,9 +90,14 @@ variables:
     value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
 - name: linux_trt_version
   ${{ if eq(parameters.CudaVersion, '11.8') }}:
-    value: 8.6.1.6-1.cuda11.8
+    value: 10.0.1.6-1.cuda11.8
   ${{ if eq(parameters.CudaVersion, '12.2') }}:
-    value: 8.6.1.6-1.cuda12.0
+    value: 10.0.1.6-1.cuda12.4
+- name: win_trt_version
+  ${{ if eq(parameters.CudaVersion, '11.8') }}:
+    value: 11.8
+  ${{ if eq(parameters.CudaVersion, '12.2') }}:
+    value: 12.4
 
 stages:
 - stage: Setup
@@ -217,11 +222,11 @@ stages:
     buildArch: x64
     msbuildPlatform: x64
     packageName: x64-cuda
-    buildparameter: --use_cuda --cuda_home=$(Agent.TempDirectory)\v11.8 --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" ${{parameters.AdditionalBuildFlag}}
+    buildparameter: --use_cuda --cuda_home=$(Agent.TempDirectory)\v${{ parameters.CudaVersion }} --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" ${{parameters.AdditionalBuildFlag}}
     runTests: ${{ parameters.RunOnnxRuntimeTests }}
     buildJava: true
     java_artifact_id: onnxruntime_gpu
-    CudaVersion: 11.8
+    CudaVersion: ${{ parameters.CudaVersion }}
     SpecificArtifact: ${{ parameters.SpecificArtifact }}
     BuildId: ${{ parameters.BuildId }}
 
@@ -235,11 +240,11 @@ stages:
     buildArch: x64
     msbuildPlatform: x64
     packageName: x64-tensorrt
-    buildparameter: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8"  --cuda_home="$(Agent.TempDirectory)\v11.8"  --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
+    buildparameter: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-${{ variables.win_trt_version }}"  --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"  --enable_onnx_tests --enable_wcos --build_java --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
     runTests: ${{ parameters.RunOnnxRuntimeTests }}
     buildJava: true
     java_artifact_id: onnxruntime_gpu
-    CudaVersion: 11.8
+    CudaVersion: ${{ parameters.CudaVersion }}
     UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}
 
 # ROCm
@@ -464,9 +469,13 @@ stages:
 
     - template: templates/get-docker-image-steps.yml
       parameters:
-        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6
+        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
         Context: tools/ci_build/github/linux/docker/
-        DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
+        DockerBuildArgs: "
+        --build-arg BUILD_UID=$( id -u )
+        --build-arg BASEIMAGE=${{ variables.docker_base_image }}
+        --build-arg TRT_VERSION=${{ variables.linux_trt_version }}
+        "
         Repository: onnxruntimeubi8packagestest
         UpdateDepsTxt: false
 
@@ -1050,6 +1059,7 @@ stages:
     ArtifactSuffix: 'GPU'
     StageSuffix: 'GPU'
     NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu'
+    CudaVersion: ${{ parameters.CudaVersion }}
     SpecificArtifact: ${{ parameters.specificArtifact }}
     BuildId: ${{ parameters.BuildId }}
 
@@ -1060,6 +1070,7 @@ stages:
     StageSuffix: 'GPU'
     MoreSuffix: '_Linux'
     NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu.Linux'
+    CudaVersion: ${{ parameters.CudaVersion }}
     SpecificArtifact: ${{ parameters.specificArtifact }}
     BuildId: ${{ parameters.BuildId }}
 

diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
@@ -68,14 +68,14 @@ variables:
       value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: 8.6.1.6-1.cuda11.8
+      value: 10.0.1.6-1.cuda11.8
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: 8.6.1.6-1.cuda12.0
+      value: 10.0.1.6-1.cuda12.4
   - name: win_trt_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8
+      value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0
+      value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4
   - name: win_cuda_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: $(Agent.TempDirectory)\v11.8

diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -62,9 +62,9 @@ variables:
 
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: 8.6.1.6-1.cuda11.8
+      value: 10.0.1.6-1.cuda11.8
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: 8.6.1.6-1.cuda12.0
+      value: 10.0.1.6-1.cuda12.4
 
   - name: Repository
     ${{ if eq(parameters.CudaVersion, '11.8') }}: