[BLAS] SYCL-Graph integration for native-command

EwanC · EwanC · commit 8d153ca7c80b · 2025-05-15T12:24:24.000+01:00
In order to support applications calling the library
with a SYCL queue that is recording to a SYCL-Graph, check if
the `ext_codeplay_enqueue_native_command` command-group is being
recorded to a graph object. If so, use the native stream recording
APIs to add the BLAS calls as nodes in the graph.

In particular this fixes the llama.cpp unit test
`MUL_MAT(type_a=f16,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],v=0)`
on CUDA with SYCL-Graph enabled. Previously this would throw an error:

```sh
$ GGML_SYCL_DISABLE_GRAPH=0 ./bin/test-backend-ops -b SYCL0 -o MUL_MAT -p type_a=f16,type_b=f32,m=16,n=1,k=256,bs=\\[1,1\\],nr=\\[2

UR CUDA ERROR:
        Value:           700
        Name:            CUDA_ERROR_ILLEGAL_ADDRESS
        Description:     an illegal memory access was encountered
        Function:        operator()
        Source Location: $HOME/dpcpp/unified-runtime/source/adapters/cuda/queue.cpp:154

Native API failed. Native API returns: 2147483646 (UR_RESULT_ERROR_UNKNOWN)
Exception caught at file:$HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp, line:3598, func:operator()
SYCL error: CHECK_TRY_ERROR((stream)->wait()): Meet error in this line code!
  in function ggml_backend_sycl_synchronize at $HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp:3598
$HOME/llama.cpp/ggml/src/ggml-sycl/../ggml-sycl/common.hpp:118: SYCL error
Could not attach to process.  If your uid matches the uid of the target
process, check the setting of /proc/sys/kernel/yama/ptrace_scope, or try
again as the root user.  For more details, see /etc/sysctl.d/10-ptrace.conf
ptrace: Operation not permitted.
No stack.
The program is not being run.
```
diff --git a/src/blas/backends/cublas/cublas_scope_handle.cpp b/src/blas/backends/cublas/cublas_scope_handle.cpp
@@ -60,6 +60,56 @@ cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue& queue)
     return nativeHandle;
 }
 
+void CublasScopedContextHandler::begin_recording_if_graph(const sycl::queue& queue) {
+// The interop_handle graph queries require version 2 of the extension
+#if SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND >= 2
+    if (ih.ext_codeplay_has_graph()) {
+        CUstream native_stream = get_stream(queue);
+        CUresult status;
+#if CUDA_VERSION >= 12030
+        // From CUDA 12.3 onwards cuStreamBeginCaptureToGraph lets the stream be
+        // captured straight into the native graph, so the capture does not need
+        // to be instantiated as a separate graph.
+        CUgraph native_graph =
+            ih.ext_codeplay_get_native_graph<sycl::backend::ext_oneapi_cuda>();
+        CUDA_ERROR_FUNC(cuStreamBeginCaptureToGraph, status, native_stream, native_graph,
+                        nullptr, nullptr, 0, CU_STREAM_CAPTURE_MODE_GLOBAL);
+#else
+        CUDA_ERROR_FUNC(cuStreamBeginCapture, status, native_stream,
+                        CU_STREAM_CAPTURE_MODE_GLOBAL);
+#endif // CUDA_VERSION
+    }
+#endif // SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND >= 2
+}
+
+void CublasScopedContextHandler::end_recording_if_graph(const sycl::queue& queue) {
+// interop_handle graph methods only available from extension version 2
+#if SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND >= 2
+    if (!ih.ext_codeplay_has_graph()) {
+        return; // Not being added to a graph, so there is no capture to end.
+    }
+
+    auto graph = ih.ext_codeplay_get_native_graph<sycl::backend::ext_oneapi_cuda>();
+    auto stream = get_stream(queue);
+    CUresult err;
+#if CUDA_VERSION >= 12030
+    // Capture was begun directly into the native graph, so just end it.
+    CUDA_ERROR_FUNC(cuStreamEndCapture, err, stream, &graph);
+#else
+    // cuStreamEndCapture returns a new graph; overwriting "graph" would not be picked up
+    // by the SYCL runtime, as "ext_codeplay_get_native_graph" returns a by-value pointer.
+    CUgraph recorded_graph;
+    CUDA_ERROR_FUNC(cuStreamEndCapture, err, stream, &recorded_graph);
+
+    // Add the capture to the native graph as a child node; the node out-param can't be nullptr.
+    CUgraphNode node;
+    CUDA_ERROR_FUNC(cuGraphAddChildGraphNode, err, &node, graph, nullptr, 0, recorded_graph);
+    // The child node holds a clone of recorded_graph, so release the original to avoid a leak.
+    CUDA_ERROR_FUNC(cuGraphDestroy, err, recorded_graph);
+#endif // CUDA_VERSION
+#endif // SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND >= 2
+}
+
 CUstream CublasScopedContextHandler::get_stream(const sycl::queue& queue) {
     return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
 }
diff --git a/src/blas/backends/cublas/cublas_scope_handle.hpp b/src/blas/backends/cublas/cublas_scope_handle.hpp
@@ -69,6 +69,28 @@ class CublasScopedContextHandler {
 public:
     CublasScopedContextHandler(sycl::interop_handle& ih);
 
+    /**
+     * @brief Start recording cuBLAS calls to a graph.
+     * @details Checks if the command-group associated with \p ih is being added
+     * to a graph, and if so, begins stream recording of the native CUDA stream
+     * associated with \p queue to the native CUDA graph object.
+     * @param queue The SYCL queue whose backing native stream recording is
+     * started on.
+     */
+    void begin_recording_if_graph(const sycl::queue& queue);
+
+    /**
+     * @brief End recording cuBLAS calls to a graph.
+     * @details Checks if the command-group associated with \p ih is being added
+     * to a graph, and if so, ends stream recording of the native CUDA stream
+     * associated with \p queue to the native CUDA graph object, doing any
+     * extra work needed to ensure that the stream-recorded calls get added as
+     * nodes to the native graph object associated with \p ih.
+     * @param queue The SYCL queue whose backing native stream recording is
+     * ended on.
+     */
+    void end_recording_if_graph(const sycl::queue& queue);
+
     /**
    * @brief get_handle: creates the handle by implicitly impose the advice
    * given by nvidia for creating a cublas_handle. (e.g. one cuStream per device
diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp
@@ -61,7 +61,9 @@ static inline void host_task_internal(H& cgh, sycl::queue queue, F f) {
     cgh.host_task([f, queue](sycl::interop_handle ih) {
 #endif
         auto sc = CublasScopedContextHandler(ih);
+        sc.begin_recording_if_graph(queue);
         f(sc);
+        sc.end_recording_if_graph(queue);
     });
 }
 #endif
diff --git a/tests/unit_tests/blas/batch/gemm_batch_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp
@@ -48,7 +48,7 @@ extern std::vector<sycl::device*> devices;
 namespace {
 
 template <typename Ta, typename Tb, typename Tc, typename Ts>
-int test(device* dev, oneapi::math::layout layout, int64_t group_count) {
+int test(device* dev, oneapi::math::layout layout, int64_t group_count, bool graph_record = false) {
     // Catch asynchronous exceptions.
     auto exception_handler = [](exception_list exceptions) {
         for (std::exception_ptr const& e : exceptions) {
@@ -247,6 +247,13 @@ int test(device* dev, oneapi::math::layout layout, int64_t group_count) {
 
     try {
 #ifdef CALL_RT_API
+        namespace sycl_exp = sycl::ext::oneapi::experimental;
+        using modifiable_graph = sycl_exp::command_graph<sycl_exp::graph_state::modifiable>;
+        std::unique_ptr<modifiable_graph> graph;
+        if (graph_record) {
+            graph = std::make_unique<modifiable_graph>(main_queue);
+            graph->begin_recording(main_queue);
+        }
         switch (layout) {
             case oneapi::math::layout::col_major:
                 done = oneapi::math::blas::column_major::gemm_batch(
@@ -262,7 +269,15 @@ int test(device* dev, oneapi::math::layout layout, int64_t group_count) {
                 break;
             default: break;
         }
-        done.wait_and_throw();
+
+        if (graph_record) {
+            graph->end_recording(main_queue);
+            auto exec_graph = graph->finalize();
+            main_queue.ext_oneapi_graph(exec_graph).wait_and_throw();
+        }
+        else {
+            done.wait_and_throw();
+        }
 #else
         switch (layout) {
             case oneapi::math::layout::col_major:
@@ -419,4 +434,64 @@ INSTANTIATE_TEST_SUITE_P(GemmBatchUsmTestSuite, GemmBatchUsmTests,
                                                             oneapi::math::layout::row_major)),
                          ::LayoutDeviceNamePrint());
 
+// Fixture for the tests below, which record the operations submitted to a
+// sycl::queue into a sycl_ext_oneapi_graph graph and then execute that graph.
+class GraphRecordGemmBatchUsmTests
+        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::math::layout>> {
+    void SetUp() override { // skips the test when the device lacks graph support
+        CHECK_GRAPH_ON_DEVICE(std::get<0>(GetParam()));
+    }
+};
+
+TEST_P(GraphRecordGemmBatchUsmTests, RealHalfPrecision) {
+    const auto& [dev, layout] = GetParam();
+    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, sycl::half, sycl::half>(dev, layout, 5, true)));
+}
+
+TEST_P(GraphRecordGemmBatchUsmTests, HalfHalfFloatPrecision) {
+    const auto& [dev, layout] = GetParam();
+    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, float, float>(dev, layout, 5, true)));
+}
+
+TEST_P(GraphRecordGemmBatchUsmTests, Int8Int8SinglePrecision) {
+    const auto& [dev, layout] = GetParam();
+    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, float, float>(dev, layout, 5, true)));
+}
+
+TEST_P(GraphRecordGemmBatchUsmTests, Int8Int8Int32Precision) {
+    const auto& [dev, layout] = GetParam();
+    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, std::int32_t, float>(dev, layout, 5, true)));
+}
+
+TEST_P(GraphRecordGemmBatchUsmTests, RealSinglePrecision) {
+    const auto& [dev, layout] = GetParam();
+    EXPECT_TRUEORSKIP((test<float, float, float, float>(dev, layout, 5, true)));
+}
+
+TEST_P(GraphRecordGemmBatchUsmTests, RealDoublePrecision) {
+    const auto& [dev, layout] = GetParam();
+    CHECK_DOUBLE_ON_DEVICE(dev);
+
+    EXPECT_TRUEORSKIP((test<double, double, double, double>(dev, layout, 5, true)));
+}
+
+TEST_P(GraphRecordGemmBatchUsmTests, ComplexSinglePrecision) {
+    using T = std::complex<float>;
+    const auto& [dev, layout] = GetParam();
+    EXPECT_TRUEORSKIP((test<T, T, T, T>(dev, layout, 5, true)));
+}
+
+TEST_P(GraphRecordGemmBatchUsmTests, ComplexDoublePrecision) {
+    using T = std::complex<double>;
+    const auto& [dev, layout] = GetParam();
+    CHECK_DOUBLE_ON_DEVICE(dev);
+
+    EXPECT_TRUEORSKIP((test<T, T, T, T>(dev, layout, 5, true)));
+}
+
+INSTANTIATE_TEST_SUITE_P(GraphRecordGemmBatchUsmTestSuite, GraphRecordGemmBatchUsmTests,
+                         ::testing::Combine(::testing::ValuesIn(devices),
+                                            ::testing::Values(oneapi::math::layout::col_major,
+                                                              oneapi::math::layout::row_major)),
+                         ::LayoutDeviceNamePrint());
 } // anonymous namespace
diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp
@@ -73,6 +73,10 @@
     if (d->get_info<sycl::info::device::double_fp_config>().size() == 0) \
     GTEST_SKIP() << "Double precision is not supported on the device"
 
+#define CHECK_GRAPH_ON_DEVICE(d) /* skip the test if device d lacks graph support */ \
+    if (!(d)->has(sycl::aspect::ext_oneapi_limited_graph))                           \
+    GTEST_SKIP() << "SYCL-Graph is not supported on the device"
+
 #if defined(ONEMATH_ENABLE_MKLCPU_BACKEND) || defined(ONEMATH_ENABLE_NETLIB_BACKEND) || \
     defined(ONEMATH_ENABLE_ARMPL_BACKEND)
 #ifdef ONEMATH_ENABLE_MKLCPU_BACKEND