Anbeeld · nycdubliner · May 23, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -101,7 +101,6 @@ static bool common_dflash_log_contract_verbose() {
     }();
     return v;
 }
-
 static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_context * ctx_dft) {
     if (!common_dflash_gpu_ring_env_enabled()) {
         LOG_INF("dflash: GPU cross ring disabled by GGML_DFLASH_GPU_RING=0; using CPU hidden capture\n");
@@ -111,9 +110,14 @@ static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_contex
     const int32_t n_tgt_devices = ctx_tgt ? llama_model_n_devices(llama_get_model(ctx_tgt)) : 1;
     const int32_t n_dft_devices = ctx_dft ? llama_model_n_devices(llama_get_model(ctx_dft)) : 1;
     if (n_tgt_devices > 1 || n_dft_devices > 1) {
-        LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); disabling GPU cross ring and graph hidden capture\n",
-                n_tgt_devices, n_dft_devices);
-        return false;
+        if (llama_dflash_allow_multi_gpu_tape()) {
+            LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); enabling experimental GPU cross ring and graph hidden capture (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n",
+                    n_tgt_devices, n_dft_devices);
+        } else {
+            LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); disabling GPU cross ring and graph hidden capture\n",
+                    n_tgt_devices, n_dft_devices);
+            return false;
+        }
     }
 
     return true;

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
@@ -14,6 +14,7 @@
 #include "ggml-impl.h"
 
 #include <assert.h>
+#include <inttypes.h>
 #include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -1661,7 +1662,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 } else {
                     // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
                     // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
-                    if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    const bool async_ok = split_backend->iface.cpy_tensor_async && split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy);
+                    if (!async_ok) {
                         ggml_backend_synchronize(input_backend);
                         if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                             ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
@@ -1702,11 +1704,13 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     return ec;
                 }
 
-                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
-                ggml_backend_synchronize(split_backend);
+                if (need) {
+                    // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                    ggml_backend_synchronize(split_backend);
 
-                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
-                    break;
+                    if (!sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                        break;
+                    }
                 }
 
                 j0 = j1;

diff --git a/ggml/src/ggml-cuda/cross-ring-interleave.cu b/ggml/src/ggml-cuda/cross-ring-interleave.cu
@@ -7,6 +7,11 @@
 #include <cstdio>
 #include <cstring>
 
+#ifndef GGML_CUDA_MAX_DEVICES
+#define GGML_CUDA_MAX_DEVICES 16
+#endif
+
+
 // GPU cross-attention ring buffer for DFlash speculative decoding.
 
 static bool dflash_cuda_debug_enabled() {
@@ -194,11 +199,11 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d(
         return false;
     }
 #if CUDART_VERSION >= 10000 || defined(GGML_USE_HIP)
-    if (attr.type != cudaMemoryTypeDevice || attr.device != ring->device) {
+    if (attr.type != cudaMemoryTypeDevice) {
         return false;
     }
 #else
-    if (attr.memoryType != cudaMemoryTypeDevice || attr.device != ring->device) {
+    if (attr.memoryType != cudaMemoryTypeDevice) {
         return false;
     }
 #endif
@@ -219,14 +224,54 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d(
     }
 
     int first = ring->ring_size - pos;
-    if (first >= n_tokens) {
-        cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
-                        (size_t)n_tokens * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+    const bool is_peer = (attr.device != ring->device);
+    if (is_peer) {
+        static bool peer_enabled[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_DEVICES] = { { false } };
+        if (attr.device >= 0 && attr.device < GGML_CUDA_MAX_DEVICES &&
+            ring->device >= 0 && ring->device < GGML_CUDA_MAX_DEVICES) {
+            if (!peer_enabled[attr.device][ring->device]) {
+                int can_access = 0;
+                cudaDeviceCanAccessPeer(&can_access, attr.device, ring->device);
+                if (can_access) {
+                    (void)cudaSetDevice(attr.device);
+                    cudaDeviceEnablePeerAccess(ring->device, 0);
+                    cudaGetLastError();
+                }
+                peer_enabled[attr.device][ring->device] = true;
+            }
+            if (!peer_enabled[ring->device][attr.device]) {
+                int can_access = 0;
+                cudaDeviceCanAccessPeer(&can_access, ring->device, attr.device);
+                if (can_access) {
+                    (void)cudaSetDevice(ring->device);
+                    cudaDeviceEnablePeerAccess(attr.device, 0);
+                    cudaGetLastError();
+                }
+                peer_enabled[ring->device][attr.device] = true;
+            }
+            (void)cudaSetDevice(ring->device);
+        }
+
+        if (first >= n_tokens) {
+            cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device,
+                                (size_t)n_tokens * stride, cudaStreamPerThread);
+        } else {
+            cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device,
+                                (size_t)first * stride, cudaStreamPerThread);
+            cudaMemcpyPeerAsync(dst, ring->device, src + (size_t)first * stride, attr.device,
+                                (size_t)(n_tokens - first) * stride, cudaStreamPerThread);
+        }
     } else {
-        cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
-                        (size_t)first * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
-        cudaMemcpyAsync(dst, src + (size_t)first * stride,
-                        (size_t)(n_tokens - first) * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+        (void)cudaSetDevice(ring->device);
+        if (first >= n_tokens) {
+            cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
+                            (size_t)n_tokens * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+        } else {
+            cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
+                            (size_t)first * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+            cudaMemcpyAsync(dst, src + (size_t)first * stride,
+                            (size_t)(n_tokens - first) * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+        }
     }
 
     return cudaGetLastError() == cudaSuccess;

diff --git a/include/llama.h b/include/llama.h
@@ -1095,11 +1095,11 @@ extern "C" {
     // if layer_ids are configured.  GPU buffers and tape state are preserved.
     LLAMA_API void llama_set_dflash_capture_active(struct llama_context * ctx, bool active);
 
-    // DFlash: enable graph-embedded GPU hidden/tape capture for target decode.
-    // Disable this before decode when the drafter cannot consume GPU cross-ring
-    // tensors directly, so the eval callback keeps CPU hidden buffers populated.
     LLAMA_API void llama_set_dflash_gpu_capture(struct llama_context * ctx, bool enabled);
 
+    // DFlash: check whether multi-GPU recurrent tape allocation is allowed
+    LLAMA_API bool llama_dflash_allow_multi_gpu_tape();
+
     // DFlash: set drafter sampling temperature (Gumbel-max trick)
     // temp=0: greedy argmax (default), temp>0: sample from softmax(logits/temp)
     LLAMA_API void llama_set_dflash_sample_temp(struct llama_context * ctx, float temp);