From da3b7110db3bb2147d1d2aeafc96a5d001659496 Mon Sep 17 00:00:00 2001
From: nycdubliner <nycdubliner@users.noreply.github.com>
Date: Sat, 23 May 2026 14:08:56 +0100
Subject: [PATCH 1/4] spec-dec: support device-aware recurrent GPU tape
 placement on ROCm multi-GPU

For multi-GPU ROCm/HIP setups, allocating all spec-dec tape buffers on a single
device (e.g. ROCm0) causes execution failures or severe bottlenecks when recurrent
layers are split across devices (e.g. ROCm1 cannot access ROCm0 source views).

This commit enables device-aware recurrent GPU tape placement:
- Tape buffers are allocated per recurrent layer on the exact physical GPU
  device assigned to compile/run that layer.
- During DFlash tape capture and direct GPU replay, state memory is verified
  to reside on the local physical GPU before executing.
- Non-local device memory accesses and associated ROCm IPC/peer faults are avoided.
- A fallback path to CPU tape capture and replay is preserved when
  `GGML_DFLASH_ALLOW_MULTI_GPU_TAPE=0` is set or on single-GPU setups.
- Retained optimized scheduler callback-mode synchronization checks to minimize
  overhead when Hidden/GDN callback evaluation is enabled.
---
 ggml/src/ggml-backend.cpp      |  14 +-
 src/llama-context.cpp          | 274 ++++++++++++++++++++-------------
 src/llama-context.h            |   7 +
 src/llama-memory-recurrent.cpp |  51 ++++--
 src/llama-memory-recurrent.h   |   2 +-
 5 files changed, 222 insertions(+), 126 deletions(-)
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d9f8aaec52f..eb59e427743 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -14,6 +14,7 @@
 #include "ggml-impl.h"
 
 #include <assert.h>
+#include <inttypes.h>
 #include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
@@ -1661,7 +1662,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 } else {
                     // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
                     // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
-                    if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    const bool async_ok = split_backend->iface.cpy_tensor_async && split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy);
+                    if (!async_ok) {
                         ggml_backend_synchronize(input_backend);
                         if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                             ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
@@ -1702,11 +1704,13 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     return ec;
                 }
 
-                // TODO: pass backend to the callback, then the user can decide if they want to synchronize
-                ggml_backend_synchronize(split_backend);
+                if (need) {
+                    // TODO: pass backend to the callback, then the user can decide if they want to synchronize
+                    ggml_backend_synchronize(split_backend);
 
-                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
-                    break;
+                    if (!sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                        break;
+                    }
                 }
 
                 j0 = j1;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index f443c00732f..637471dcab7 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -23,6 +23,7 @@
 #include <cstdlib>
 #include <cstring>
 #include <limits>
+#include <map>
 #include <stdexcept>
 
 //
@@ -31,6 +32,22 @@
 
 static llama_memory_recurrent * get_recurrent_mem(llama_memory_t mem);
 
+static ggml_backend_reg_t dflash_gpu_backend_reg() {
+    ggml_backend_reg_t reg = ggml_backend_reg_by_name("CUDA");
+    if (!reg) {
+        reg = ggml_backend_reg_by_name("ROCm");
+    }
+    return reg;
+}
+
+static bool dflash_is_cuda_compatible_tensor(const ggml_tensor * t) {
+    if (!t || !t->data || !t->buffer || ggml_backend_buffer_is_host(t->buffer)) {
+        return false;
+    }
+    const char * name = ggml_backend_buffer_name(t->buffer);
+    return name && (std::strncmp(name, "CUDA", 4) == 0 || std::strncmp(name, "ROCm", 4) == 0);
+}
+
 llama_context::llama_context(
         const llama_model & model,
               llama_context_params params) :
@@ -1121,6 +1138,14 @@ static bool dflash_profile_sync_split_enabled() {
     return enabled;
 }
 
+static bool dflash_allow_multi_gpu_tape() {
+    static const bool enabled = [] {
+        const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE");
+        return env && env[0] != '\0' && std::strcmp(env, "0") != 0;
+    }();
+    return enabled;
+}
+
 static void dflash_log_decode_seq_state(
         const char * where,
         const llama_ubatch & ubatch,
@@ -1383,17 +1408,17 @@ static bool dflash_eval_callback(struct ggml_tensor * t, bool ask, void * user_d
     // (called at the top of decode()) zeroes buf.n_tokens for every slot before
     // the ubatch loop, so each slot's buffer accumulates only that slot's tokens
     // (in their ubatch order) across all ubatches in this llama_decode() call.
-        if (h_it != cap->hidden_name_idx.end()) {
-            if (cap->profile) {
-                cap->profile_cb_read++;
-                cap->profile_cb_hidden_read++;
-                dflash_profile_cb_name(*cap, t, "read");
-            }
-            const int64_t new_embd = t->ne[0];
-            const int64_t new_n = t->ne[1];
-            const size_t h_idx = h_it->second;
+    if (h_it != cap->hidden_name_idx.end()) {
+        if (cap->profile) {
+            cap->profile_cb_read++;
+            cap->profile_cb_hidden_read++;
+            dflash_profile_cb_name(*cap, t, "read");
+        }
+        const int64_t new_embd = t->ne[0];
+        const int64_t new_n = t->ne[1];
+        const size_t h_idx = h_it->second;
 
-            if (n_seqs_unq <= 1) {
+        if (n_seqs_unq <= 1) {
             // single-seq fast path: route the whole tensor to one slot
             const int slot = ub ? ub->seq_id_unq[0] : -1;
             auto * sh = cap->slot_hiddens(slot);
@@ -1514,6 +1539,7 @@ static bool dflash_eval_callback(struct ggml_tensor * t, bool ask, void * user_d
     return true;
 }
 
+
 void llama_context::set_dflash_sample_temp(float temp) {
     cparams.dflash_sample_temp = temp;
 }
@@ -1668,7 +1694,8 @@ void llama_context::set_dflash_gpu_capture(bool enabled) {
         return;
     }
 
-    dflash_capture->gpu_capture_enabled = enabled;
+    dflash_capture->gpu_capture_enabled =
+        enabled || (model.n_devices() > 1 && dflash_allow_multi_gpu_tape());
 
     // Always clear the graph-embedded capture cparams when changing mode;
     // the decode loop will repopulate them if GPU capture is active and
@@ -1845,7 +1872,7 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
         return;
     }
 
-    if (model.n_devices() > 1) {
+    if (model.n_devices() > 1 && !dflash_allow_multi_gpu_tape()) {
         dflash_capture->hidden_gpu.clear();
         dflash_capture->tapes.clear();
         if (!dflash_capture->multi_gpu_capture_fallback_logged) {
@@ -1855,6 +1882,14 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
         }
         return;
     }
+    if (model.n_devices() > 1 && !dflash_capture->multi_gpu_capture_fallback_logged) {
+        // ROCm/HIP device-placement constraint: For multi-GPU targets, tape buffers must be allocated
+        // on the exact GPU device running the respective recurrent layer to ensure local device visibility
+        // during graph executions and direct GPU replay enqueues, avoiding nonlocal buffer access failures.
+        LLAMA_LOG_INFO("%s: multi-GPU target detected (%zu devices); enabling experimental graph GPU tape while hidden capture stays on eval callback\n",
+            __func__, model.n_devices());
+        dflash_capture->multi_gpu_capture_fallback_logged = true;
+    }
 
     allocate_hidden_gpu(n_slots, max_tokens);
 
@@ -1868,18 +1903,15 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
         return;
     }
 
-    // find GPU backend
-    ggml_backend_t gpu_backend = nullptr;
-    for (auto & backend : backends) {
-        auto * dev = ggml_backend_get_device(backend.get());
-        if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            gpu_backend = backend.get();
-            break;
+    auto backend_for_dev = [&](ggml_backend_dev_t want_dev) -> ggml_backend_t {
+        for (auto & backend : backends) {
+            auto * dev = ggml_backend_get_device(backend.get());
+            if (dev == want_dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                return backend.get();
+            }
         }
-    }
-    if (!gpu_backend) {
-        return; // no GPU, fall back to CPU tape via eval callback
-    }
+        return nullptr;
+    };
 
     const auto & hparams = model.hparams;
     const auto & rec_ids = dflash_capture->recurrent_layer_ids;
@@ -1899,27 +1931,35 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
     dflash_capture->tapes.reserve(n_slots);
 
     size_t total_size = 0;
+    std::map<ggml_backend_dev_t, int> layers_by_dev;
 
     for (int slot = 0; slot < n_slots; ++slot) {
-        // allocate ggml context for this slot's tensor descriptors
-        size_t ctx_mem = ggml_tensor_overhead() * (n_rec * 5 + 2);
-        struct ggml_init_params ctx_params = { ctx_mem, nullptr, true };
-        struct ggml_context * tape_ctx = ggml_init(ctx_params);
-        if (!tape_ctx) {
-            LLAMA_LOG_WARN("%s: failed to create GPU tape context for slot %d, falling back to CPU tape\n",
-                __func__, slot);
-            dflash_capture->tapes.clear();
-            return;
-        }
-
         auto tape = std::make_unique<dflash_tape_gpu>();
         tape->layers.resize(n_rec);
         tape->layer_ids = dflash_capture->recurrent_layer_ids;
         tape->max_tokens = max_tokens;
-        tape->ctx = tape_ctx;
 
         for (int li = 0; li < n_rec; ++li) {
             const int il = rec_ids[li];
+            ggml_backend_dev_t layer_dev = model.dev_layer(il);
+            ggml_backend_t layer_backend = backend_for_dev(layer_dev);
+            if (!layer_backend) {
+                LLAMA_LOG_WARN("%s: no GPU backend for recurrent layer %d device %s; falling back to CPU tape\n",
+                    __func__, il, layer_dev ? ggml_backend_dev_name(layer_dev) : "<null>");
+                dflash_capture->tapes.clear();
+                return;
+            }
+
+            const size_t ctx_mem = ggml_tensor_overhead() * 7;
+            struct ggml_init_params ctx_params = { ctx_mem, nullptr, true };
+            struct ggml_context * tape_ctx = ggml_init(ctx_params);
+            if (!tape_ctx) {
+                LLAMA_LOG_WARN("%s: failed to create GPU tape context for slot %d layer %d, falling back to CPU tape\n",
+                    __func__, slot, il);
+                dflash_capture->tapes.clear();
+                return;
+            }
+
             const auto * conv_kernel = model.layers[il].ssm_conv1d;
             GGML_ASSERT(conv_kernel != nullptr);
             const int64_t conv_window = conv_kernel->ne[0] - 1;
@@ -1932,25 +1972,33 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
             tl.gate = ggml_new_tensor_3d(tape_ctx, GGML_TYPE_F32, (int64_t)1, H_v, (int64_t)max_tokens);
             tl.beta = ggml_new_tensor_3d(tape_ctx, GGML_TYPE_F32, (int64_t)1, H_v, (int64_t)max_tokens);
             tl.qkv  = ggml_new_tensor_2d(tape_ctx, GGML_TYPE_F32, conv_ch, (int64_t)max_tokens);
-        }
 
-        tape->buf = ggml_backend_alloc_ctx_tensors(tape_ctx, gpu_backend);
+            tl.ctx = tape_ctx;
+            tl.dev = layer_dev;
+            tl.buf = ggml_backend_alloc_ctx_tensors(tape_ctx, layer_backend);
 
-        if (!tape->buf) {
-            LLAMA_LOG_WARN("%s: failed to allocate GPU tape buffer for slot %d, falling back to CPU tape\n",
-                __func__, slot);
-            dflash_capture->tapes.clear();
-            return;
+            if (!tl.buf) {
+                LLAMA_LOG_WARN("%s: failed to allocate GPU tape buffer for slot %d layer %d device %s, falling back to CPU tape\n",
+                    __func__, slot, il, ggml_backend_dev_name(layer_dev));
+                dflash_capture->tapes.clear();
+                return;
+            }
+
+            total_size += ggml_backend_buffer_get_size(tl.buf);
+            layers_by_dev[layer_dev] += 1;
         }
 
-        total_size += ggml_backend_buffer_get_size(tape->buf);
         dflash_capture->tapes.push_back(std::move(tape));
     }
 
     dflash_capture->active_tape_idx = 0;
 
-    LLAMA_LOG_INFO("%s: allocated GPU tape buffers: %.1f MB total (%d slot%s, %d layers, %d max tokens)\n",
+    LLAMA_LOG_INFO("%s: allocated device-aware GPU tape buffers: %.1f MB total (%d slot%s, %d layers, %d max tokens)\n",
         __func__, total_size / (1024.0 * 1024.0), n_slots, n_slots == 1 ? "" : "s", n_rec, max_tokens);
+    for (const auto & kv : layers_by_dev) {
+        LLAMA_LOG_INFO("%s: dflash tape placement: device=%s layers=%d\n",
+            __func__, ggml_backend_dev_name(kv.first), kv.second);
+    }
 }
 
 void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) {
@@ -2173,33 +2221,33 @@ bool llama_context::dflash_memory_seq_cp_recurrent_ordered(
         llama_seq_id seq_id_dst,
         llama_pos    p0,
         llama_pos    p1) {
-    if (model.n_devices() > 1) {
-        return false;
-    }
-
     llama_memory_recurrent * mem_recr = get_recurrent_mem(get_memory());
     if (!mem_recr) {
         return false;
     }
 
-    ggml_backend_t gpu_backend = nullptr;
-    ggml_backend_reg_t cuda_reg = nullptr;
+    using sync_dflash_stream_to_backend_fn_t = bool (*)(ggml_backend_t);
+    struct gpu_wait_backend {
+        ggml_backend_t backend = nullptr;
+        sync_dflash_stream_to_backend_fn_t fn_wait_backend = nullptr;
+    };
+    std::vector<gpu_wait_backend> gpu_wait_backends;
+
     for (auto & backend : backends) {
         auto * dev = ggml_backend_get_device(backend.get());
         if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            gpu_backend = backend.get();
-            cuda_reg = ggml_backend_dev_backend_reg(dev);
-            break;
+            ggml_backend_reg_t cuda_reg = ggml_backend_dev_backend_reg(dev);
+            auto fn_wait_backend = cuda_reg
+                ? (sync_dflash_stream_to_backend_fn_t)
+                    ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_backend_wait_for_dflash_stream")
+                : nullptr;
+            if (!fn_wait_backend) {
+                return false;
+            }
+            gpu_wait_backends.push_back({ backend.get(), fn_wait_backend });
         }
     }
-    if (!gpu_backend || !cuda_reg) {
-        return false;
-    }
-
-    using sync_dflash_stream_to_backend_fn_t = bool (*)(ggml_backend_t);
-    auto fn_wait_backend = (sync_dflash_stream_to_backend_fn_t)
-        ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_backend_wait_for_dflash_stream");
-    if (!fn_wait_backend) {
+    if (gpu_wait_backends.empty()) {
         return false;
     }
 
@@ -2212,9 +2260,11 @@ bool llama_context::dflash_memory_seq_cp_recurrent_ordered(
         LLAMA_LOG_INFO("%s: dflash crash breadcrumb: recurrent ordered copy enqueued src=%d dst=%d\n",
                 __func__, (int) seq_id_src, (int) seq_id_dst);
     }
-    if (!fn_wait_backend(gpu_backend)) {
-        LLAMA_LOG_ERROR("%s: failed to order DFlash recurrent backup stream before verifier compute\n", __func__);
-        GGML_ABORT("failed to order DFlash recurrent backup stream before verifier compute");
+    for (const gpu_wait_backend & gpu_wait : gpu_wait_backends) {
+        if (!gpu_wait.fn_wait_backend(gpu_wait.backend)) {
+            LLAMA_LOG_ERROR("%s: failed to order DFlash recurrent backup stream before verifier compute\n", __func__);
+            GGML_ABORT("failed to order DFlash recurrent backup stream before verifier compute");
+        }
     }
     if (dflash_crash_trace_enabled()) {
         LLAMA_LOG_INFO("%s: dflash crash breadcrumb: recurrent ordered copy synced src=%d dst=%d\n",
@@ -2366,6 +2416,8 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) {
     const auto & rec_ids = dflash_capture->recurrent_layer_ids;
     auto & tape_layers   = dflash_capture->tape_layers;
 
+    const uint32_t n_embd_s = hparams.n_embd_s();
+
     // find the tail cell for this seq_id
     int32_t cell_idx = -1;
     if (seq_id >= 0 && (uint32_t) seq_id < mem_recurrent->size) {
@@ -2379,9 +2431,6 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) {
         return;
     }
 
-    const uint32_t n_embd_s = hparams.n_embd_s();
-    const uint32_t n_embd_r = hparams.n_embd_r();
-
     // find a GPU backend for graph computation
     ggml_backend_t gpu_backend = nullptr;
     for (auto & backend : backends) {
@@ -2411,6 +2460,17 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) {
 
     const bool multi_gpu_target = model.n_devices() > 1;
     if (multi_gpu_target) {
+        if (use_gpu_tape && tape_replay_gdn_direct_gpu(mem_recurrent, cell_idx, n_accepted)) {
+            dflash_capture->replay_pending = true;
+            dflash_capture->replay_gpu_backend = nullptr;
+            dflash_capture->replay_graph_ctx = nullptr;
+            dflash_capture->replay_direct_gpu = true;
+            dflash_capture->replay_n_accepted = n_accepted;
+            dflash_capture->replay_cell_idx = cell_idx;
+            dflash_capture->replay_seq_id = seq_id;
+            dflash_capture->replay_mem_recurrent = mem_recurrent;
+            return;
+        }
         if (tape_replay_gdn_direct_from_cpu_tape(mem_recurrent, cell_idx, n_accepted)) {
             tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id);
             return;
@@ -2645,15 +2705,17 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu
         return false;
     }
 
-    ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA");
+    ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg();
     if (!cuda_reg) {
         return false;
     }
     using ptr_device_fn_t = bool (*)(const void *, int *);
+    using prepare_ptr_fn_t = bool (*)(const void *);
     using replay_fn_t = bool (*)(void *, const void *, const void *, const void *, const void *, int, int, int, int);
     auto fn_ptr_device = (ptr_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_ptr_device");
+    auto fn_prepare = (prepare_ptr_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_prepare_ptr");
     auto fn_replay = (replay_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_replay_gdn_state_no_check");
-    if (!fn_ptr_device || !fn_replay) {
+    if (!fn_ptr_device || !fn_prepare || !fn_replay) {
         return false;
     }
 
@@ -2678,14 +2740,6 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu
     std::vector<replay_launch> launches;
     launches.reserve(rec_ids.size());
 
-    auto is_cuda_tensor = [](const ggml_tensor * t) {
-        if (!t || !t->data || !t->buffer) {
-            return false;
-        }
-        const char * name = ggml_backend_buffer_name(t->buffer);
-        return name && std::strncmp(name, "CUDA", 4) == 0;
-    };
-
     for (size_t li = 0; li < rec_ids.size(); ++li) {
         const int il = rec_ids[li];
         if (li >= gpu_tape->layers.size()) {
@@ -2694,8 +2748,9 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu
 
         ggml_tensor * s_tensor = mem_recurrent->s_l[il];
         auto & tl = gpu_tape->layers[li];
-        if (!is_cuda_tensor(s_tensor) || !is_cuda_tensor(tl.k) || !is_cuda_tensor(tl.v) ||
-                !is_cuda_tensor(tl.gate) || !is_cuda_tensor(tl.beta)) {
+        if (!dflash_is_cuda_compatible_tensor(s_tensor) || !dflash_is_cuda_compatible_tensor(tl.k) ||
+                !dflash_is_cuda_compatible_tensor(tl.v) || !dflash_is_cuda_compatible_tensor(tl.gate) ||
+                !dflash_is_cuda_compatible_tensor(tl.beta)) {
             return false;
         }
 
@@ -2737,22 +2792,39 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu
         if (!fn_ptr_device(launch.state, &device)) {
             return false;
         }
+        int k_device = -1;
+        int v_device = -1;
+        int gate_device = -1;
+        int beta_device = -1;
+        if (!fn_ptr_device(launch.k, &k_device) ||
+                !fn_ptr_device(launch.v, &v_device) ||
+                !fn_ptr_device(launch.gate, &gate_device) ||
+                !fn_ptr_device(launch.beta, &beta_device)) {
+            return false;
+        }
+        if (k_device != device || v_device != device || gate_device != device || beta_device != device) {
+            return false;
+        }
+        launch.device = device;
         if (replay_device < 0) {
             replay_device = device;
         } else if (device != replay_device) {
-            return false;
+            replay_device = -2;
         }
-        launch.device = device;
     }
 
     const int64_t t_start_us = dflash_capture->profile ? ggml_time_us() : 0;
     dflash_capture->replay_sync_ptrs.clear();
-    dflash_capture->replay_sync_device = replay_device;
+    dflash_capture->replay_sync_device = replay_device >= 0 ? replay_device : -1;
     for (const auto & launch : launches) {
-        if (!fn_replay(launch.state, launch.k, launch.v, launch.gate, launch.beta,
+        if (!fn_prepare(launch.state) ||
+                !fn_replay(launch.state, launch.k, launch.v, launch.gate, launch.beta,
                     n_accepted, launch.S, launch.H_k, launch.H_v)) {
             GGML_ABORT("DFlash direct GPU GDN replay launch failed after validation\n");
         }
+        if (replay_device < 0) {
+            dflash_capture->replay_sync_ptrs.push_back(launch.state);
+        }
     }
     if (dflash_capture->profile) {
         const uint64_t elapsed = ggml_time_us() - t_start_us;
@@ -2761,7 +2833,7 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu
         dflash_capture->profile_replay_direct_gpu += 1;
         dflash_capture->profile_replay_layers += launches.size();
     }
-    dflash_capture->replay_sync_ptr = launches.back().state;
+    dflash_capture->replay_sync_ptr = replay_device >= 0 ? launches.back().state : nullptr;
     return true;
 }
 
@@ -2770,7 +2842,7 @@ bool llama_context::tape_replay_gdn_direct_from_cpu_tape(llama_memory_recurrent
         return false;
     }
 
-    ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA");
+    ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg();
     if (!cuda_reg) {
         return false;
     }
@@ -2819,14 +2891,6 @@ bool llama_context::tape_replay_gdn_direct_from_cpu_tape(llama_memory_recurrent
         }
     };
 
-    auto is_cuda_tensor = [](const ggml_tensor * t) {
-        if (!t || !t->data || !t->buffer || ggml_backend_buffer_is_host(t->buffer)) {
-            return false;
-        }
-        const char * name = ggml_backend_buffer_name(t->buffer);
-        return name && std::strncmp(name, "CUDA", 4) == 0;
-    };
-
     for (size_t li = 0; li < rec_ids.size(); ++li) {
         if (li >= tape_layers.size()) {
             cleanup();
@@ -2869,7 +2933,7 @@ bool llama_context::tape_replay_gdn_direct_from_cpu_tape(llama_memory_recurrent
         }
 
         ggml_tensor * s_tensor = mem_recurrent->s_l[il];
-        if (!is_cuda_tensor(s_tensor)) {
+        if (!dflash_is_cuda_compatible_tensor(s_tensor)) {
             cleanup();
             return false;
         }
@@ -2960,7 +3024,7 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent,
         return false;
     }
 
-    ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA");
+    ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg();
     if (!cuda_reg) {
         return false;
     }
@@ -3045,7 +3109,7 @@ bool llama_context::tape_replay_conv_gpu_from_cpu_tape(llama_memory_recurrent *
         return false;
     }
 
-    ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA");
+    ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg();
     if (!cuda_reg) {
         return false;
     }
@@ -3090,14 +3154,6 @@ bool llama_context::tape_replay_conv_gpu_from_cpu_tape(llama_memory_recurrent *
         }
     };
 
-    auto is_cuda_tensor = [](const ggml_tensor * t) {
-        if (!t || !t->data || !t->buffer || ggml_backend_buffer_is_host(t->buffer)) {
-            return false;
-        }
-        const char * name = ggml_backend_buffer_name(t->buffer);
-        return name && std::strncmp(name, "CUDA", 4) == 0;
-    };
-
     for (size_t li = 0; li < rec_ids.size(); ++li) {
         if (li >= tape_layers.size()) {
             cleanup();
@@ -3109,7 +3165,7 @@ bool llama_context::tape_replay_conv_gpu_from_cpu_tape(llama_memory_recurrent *
         if (!r_tensor) {
             continue;
         }
-        if (!is_cuda_tensor(r_tensor)) {
+        if (!dflash_is_cuda_compatible_tensor(r_tensor)) {
             cleanup();
             return false;
         }
@@ -3406,7 +3462,7 @@ void llama_context::tape_replay_sync() {
     } else if (dflash_capture->replay_direct_gpu &&
             (!dflash_capture->replay_sync_ptrs.empty() || dflash_capture->replay_sync_ptr)) {
         const int64_t t_start_us = dflash_capture->profile ? ggml_time_us() : 0;
-        ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA");
+        ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg();
         using sync_ptr_fn_t = bool (*)(const void *);
         using sync_device_fn_t = bool (*)(int);
         auto fn_sync_ptr = cuda_reg
@@ -5725,6 +5781,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     dflash_capture->ubatch = &ubatch;
                 } else {
                     dflash_gpu_capture_ready = model.n_devices() <= 1 && dflash_capture->gpu_capture_enabled;
+                    const bool dflash_gpu_tape_ready_allowed =
+                        dflash_gpu_capture_ready ||
+                        (model.n_devices() > 1 && dflash_capture->gpu_capture_enabled &&
+                         dflash_allow_multi_gpu_tape());
                     dflash_capture->ubatch = &ubatch;
                     cparams.hidden_gpu_n_seqs = 0;
                     dflash_clear_prefill_cparams(cparams);
@@ -5940,7 +6000,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     dflash_graph_tape_ready =
                         !dflash_use_prefill_staging &&
                         !dflash_capture->tapes.empty() &&
-                        dflash_gpu_capture_ready &&
+                        dflash_gpu_tape_ready_allowed &&
                         dflash_capture->tape_enabled &&
                         dflash_capture_n_tokens <= LLAMA_DFLASH_MAX_VERIFY_TOKENS;
                     const int tape_ns = dflash_graph_tape_ready ? ns : 0;
diff --git a/src/llama-context.h b/src/llama-context.h
index b5adf13b18d..2a167e490f3 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -102,6 +102,9 @@ struct dflash_tape_gpu_layer {
     ggml_tensor * gate = nullptr;  // [1, H_v, max_tokens]
     ggml_tensor * beta = nullptr;  // [1, H_v, max_tokens]
     ggml_tensor * qkv  = nullptr;  // [conv_channels, max_tokens]
+    ggml_backend_buffer_t buf = nullptr;
+    ggml_context * ctx = nullptr;
+    ggml_backend_dev_t dev = nullptr;
 };
 
 struct dflash_tape_gpu {
@@ -113,6 +116,10 @@ struct dflash_tape_gpu {
     int n_tokens = 0;                           // actual tokens recorded this pass
 
     ~dflash_tape_gpu() {
+        for (auto & layer : layers) {
+            if (layer.buf) ggml_backend_buffer_free(layer.buf);
+            if (layer.ctx) ggml_free(layer.ctx);
+        }
         if (buf) ggml_backend_buffer_free(buf);
         if (ctx) ggml_free(ctx);
     }
diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index e1432ea07cf..c37cee7836c 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -449,13 +449,15 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
 bool llama_memory_recurrent::build_recurrent_copy_plan() {
     copy_plan_valid = true;
     copy_plan_cuda_fast = false;
-    copy_plan_device = -1;
     copy_plan_entries.clear();
     copy_plan_fn_copy = nullptr;
     copy_plan_fn_set_device = nullptr;
     copy_plan_fn_sync_device = nullptr;
 
     ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA");
+    if (!cuda_reg) {
+        cuda_reg = ggml_backend_reg_by_name("ROCm");
+    }
     auto fn_ptr_device = cuda_reg
         ? (dflash_cuda_ptr_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_ptr_device")
         : nullptr;
@@ -470,6 +472,11 @@ bool llama_memory_recurrent::build_recurrent_copy_plan() {
         : nullptr;
 
     if (!fn_ptr_device || !fn_copy || !fn_set_device || !fn_sync_device) {
+        if (dflash_profile_enabled(DFLASH_PROFILE_COPY)) {
+            LLAMA_LOG_INFO("%s: dflash recurrent D2D copy unavailable: cuda_reg=%d ptr_device=%d copy=%d set_device=%d sync_device=%d\n",
+                    __func__, cuda_reg ? 1 : 0, fn_ptr_device ? 1 : 0, fn_copy ? 1 : 0,
+                    fn_set_device ? 1 : 0, fn_sync_device ? 1 : 0);
+        }
         return false;
     }
 
@@ -479,21 +486,26 @@ bool llama_memory_recurrent::build_recurrent_copy_plan() {
         }
 
         const char * buffer_name = tensor->buffer ? ggml_backend_buffer_name(tensor->buffer) : nullptr;
-        if (!buffer_name || std::strncmp(buffer_name, "CUDA", 4) != 0) {
+        if (!buffer_name ||
+                (std::strncmp(buffer_name, "CUDA", 4) != 0 &&
+                 std::strncmp(buffer_name, "ROCm", 4) != 0)) {
+            if (dflash_profile_enabled(DFLASH_PROFILE_COPY)) {
+                LLAMA_LOG_INFO("%s: dflash recurrent D2D copy unavailable: tensor=%s buffer=%s\n",
+                        __func__, tensor->name, buffer_name ? buffer_name : "<null>");
+            }
             return false;
         }
 
         int tensor_device = -1;
         if (!fn_ptr_device(tensor->data, &tensor_device)) {
-            return false;
-        }
-        if (copy_plan_device < 0) {
-            copy_plan_device = tensor_device;
-        } else if (copy_plan_device != tensor_device) {
+            if (dflash_profile_enabled(DFLASH_PROFILE_COPY)) {
+                LLAMA_LOG_INFO("%s: dflash recurrent D2D copy unavailable: tensor=%s buffer=%s ptr_device_failed\n",
+                        __func__, tensor->name, buffer_name);
+            }
             return false;
         }
 
-        copy_plan_entries.push_back({ tensor, ggml_row_size(tensor->type, n_embd) });
+        copy_plan_entries.push_back({ tensor, ggml_row_size(tensor->type, n_embd), tensor_device });
         return true;
     };
 
@@ -501,12 +513,11 @@ bool llama_memory_recurrent::build_recurrent_copy_plan() {
         if (!add_tensor(r_l[il], hparams.n_embd_r()) ||
             !add_tensor(s_l[il], hparams.n_embd_s())) {
             copy_plan_entries.clear();
-            copy_plan_device = -1;
             return false;
         }
     }
 
-    if (copy_plan_entries.empty() || copy_plan_device < 0) {
+    if (copy_plan_entries.empty()) {
         return false;
     }
 
@@ -520,7 +531,6 @@ bool llama_memory_recurrent::build_recurrent_copy_plan() {
 void llama_memory_recurrent::invalidate_recurrent_copy_plan() {
     copy_plan_valid = false;
     copy_plan_cuda_fast = false;
-    copy_plan_device = -1;
     copy_plan_entries.clear();
     copy_plan_fn_copy = nullptr;
     copy_plan_fn_set_device = nullptr;
@@ -553,12 +563,21 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) {
         build_recurrent_copy_plan();
     }
     if (copy_plan_cuda_fast) {
-        bool all_queued = copy_plan_fn_set_device && copy_plan_fn_set_device(copy_plan_device);
+        bool all_queued = copy_plan_fn_set_device != nullptr;
         bool any_queued = false;
+        int current_device = -1;
+        std::vector<int> touched_devices;
         const int64_t t_enqueue_start = profile_timing ? ggml_time_us() : 0;
 
         if (all_queued) {
             for (const recurrent_copy_plan_entry & entry : copy_plan_entries) {
+                if (current_device != entry.device) {
+                    if (!copy_plan_fn_set_device(entry.device)) {
+                        all_queued = false;
+                        break;
+                    }
+                    current_device = entry.device;
+                }
                 const char * src = (const char *) entry.tensor->data + (size_t) i_src * entry.row_bytes;
                 char * dst = (char *) entry.tensor->data + (size_t) i_dst * entry.row_bytes;
                 if (!copy_plan_fn_copy(dst, src, entry.row_bytes)) {
@@ -566,6 +585,9 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) {
                     break;
                 }
                 any_queued = true;
+                if (std::find(touched_devices.begin(), touched_devices.end(), entry.device) == touched_devices.end()) {
+                    touched_devices.push_back(entry.device);
+                }
                 profile.tensors_copied++;
                 profile.cuda_d2d_queued++;
             }
@@ -583,7 +605,10 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) {
         bool synced = false;
         if (any_queued && copy_plan_fn_sync_device) {
             const int64_t t_sync_start = profile_timing ? ggml_time_us() : 0;
-            synced = copy_plan_fn_sync_device(copy_plan_device);
+            synced = true;
+            for (int device : touched_devices) {
+                synced = copy_plan_fn_sync_device(device) && synced;
+            }
             if (t_sync_start != 0) {
                 profile.sync_us += ggml_time_us() - t_sync_start;
             }
diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h
index 097a9414442..b20c1d5fa54 100644
--- a/src/llama-memory-recurrent.h
+++ b/src/llama-memory-recurrent.h
@@ -153,6 +153,7 @@ class llama_memory_recurrent : public llama_memory_i {
     struct recurrent_copy_plan_entry {
         ggml_tensor * tensor = nullptr;
         size_t row_bytes = 0;
+        int device = -1;
     };
 
     bool build_recurrent_copy_plan();
@@ -161,7 +162,6 @@ class llama_memory_recurrent : public llama_memory_i {
 
     bool copy_plan_valid = false;
     bool copy_plan_cuda_fast = false;
-    int copy_plan_device = -1;
     std::vector<recurrent_copy_plan_entry> copy_plan_entries;
     dflash_cuda_copy_d2d_fn_t copy_plan_fn_copy = nullptr;
     dflash_cuda_set_device_fn_t copy_plan_fn_set_device = nullptr;

From 4b208f783fb8a998a0f808789fecae4911f61f9e Mon Sep 17 00:00:00 2001
From: nycdubliner <nycdubliner@users.noreply.github.com>
Date: Sat, 23 May 2026 15:13:16 +0100
Subject: [PATCH 2/4] spec-dec: address review feedback on multi-GPU tape
 placement

- Documented n_devices() > 1 multi-GPU limitations for hidden and prefill GPU allocations.
- Documented role of GGML_DFLASH_ALLOW_MULTI_GPU_TAPE environment variable.
- Added warning logging when explicitly disabling GPU capture is overridden by the env var.
- Avoid std::map dependency in allocate_tape_gpu by using a vector of structures to count device occurrences.
- Move touched_devices vector out of copy_cell hot-path stack to class member to avoid heap pressure.
---
 src/llama-context.cpp          | 29 ++++++++++++++++++++++++-----
 src/llama-memory-recurrent.cpp |  8 ++++----
 src/llama-memory-recurrent.h   |  1 +
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 637471dcab7..33cd01945be 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -23,7 +23,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <limits>
-#include <map>
 #include <stdexcept>
 
 //
@@ -1138,6 +1137,7 @@ static bool dflash_profile_sync_split_enabled() {
     return enabled;
 }
 
+// Controls the multi-GPU speculative decoding recurrent tape path. Independent of GGML_DFLASH_GPU_RING.
 static bool dflash_allow_multi_gpu_tape() {
     static const bool enabled = [] {
         const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE");
@@ -1696,6 +1696,9 @@ void llama_context::set_dflash_gpu_capture(bool enabled) {
 
     dflash_capture->gpu_capture_enabled =
         enabled || (model.n_devices() > 1 && dflash_allow_multi_gpu_tape());
+    if (!enabled && dflash_capture->gpu_capture_enabled) {
+        LLAMA_LOG_INFO("%s: forcing GPU capture enabled for multi-GPU tape support (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n", __func__);
+    }
 
     // Always clear the graph-embedded capture cparams when changing mode;
     // the decode loop will repopulate them if GPU capture is active and
@@ -1931,7 +1934,11 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
     dflash_capture->tapes.reserve(n_slots);
 
     size_t total_size = 0;
-    std::map<ggml_backend_dev_t, int> layers_by_dev;
+    struct dev_layer_count {
+        ggml_backend_dev_t dev;
+        int count;
+    };
+    std::vector<dev_layer_count> dev_counts;
 
     for (int slot = 0; slot < n_slots; ++slot) {
         auto tape = std::make_unique<dflash_tape_gpu>();
@@ -1985,7 +1992,17 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
             }
 
             total_size += ggml_backend_buffer_get_size(tl.buf);
-            layers_by_dev[layer_dev] += 1;
+            bool found = false;
+            for (auto & dc : dev_counts) {
+                if (dc.dev == layer_dev) {
+                    dc.count++;
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                dev_counts.push_back({ layer_dev, 1 });
+            }
         }
 
         dflash_capture->tapes.push_back(std::move(tape));
@@ -1995,9 +2012,9 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
 
     LLAMA_LOG_INFO("%s: allocated device-aware GPU tape buffers: %.1f MB total (%d slot%s, %d layers, %d max tokens)\n",
         __func__, total_size / (1024.0 * 1024.0), n_slots, n_slots == 1 ? "" : "s", n_rec, max_tokens);
-    for (const auto & kv : layers_by_dev) {
+    for (const auto & dc : dev_counts) {
         LLAMA_LOG_INFO("%s: dflash tape placement: device=%s layers=%d\n",
-            __func__, ggml_backend_dev_name(kv.first), kv.second);
+            __func__, ggml_backend_dev_name(dc.dev), dc.count);
     }
 }
 
@@ -2014,6 +2031,7 @@ void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) {
         dflash_capture->sync_backend_to_stream_backend = nullptr;
         return;
     }
+    // Hidden GPU capture requires same-device graph output tensors; not yet supported for multi-GPU layer splits
     if (model.n_devices() > 1) {
         dflash_capture->hidden_gpu.clear();
         dflash_capture->fn_sync_backend_to_stream = nullptr;
@@ -2106,6 +2124,7 @@ bool llama_context::allocate_prefill_gpu(int n_slots, int max_tokens) {
     if (!dflash_capture->gpu_capture_enabled) {
         return false;
     }
+    // Prefill GPU allocation is not yet multi-GPU aware
     if (model.n_devices() > 1) {
         return false;
     }
diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index c37cee7836c..405dcbd5bd8 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -566,7 +566,7 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) {
         bool all_queued = copy_plan_fn_set_device != nullptr;
         bool any_queued = false;
         int current_device = -1;
-        std::vector<int> touched_devices;
+        copy_plan_touched_devices.clear();
         const int64_t t_enqueue_start = profile_timing ? ggml_time_us() : 0;
 
         if (all_queued) {
@@ -585,8 +585,8 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) {
                     break;
                 }
                 any_queued = true;
-                if (std::find(touched_devices.begin(), touched_devices.end(), entry.device) == touched_devices.end()) {
-                    touched_devices.push_back(entry.device);
+                if (std::find(copy_plan_touched_devices.begin(), copy_plan_touched_devices.end(), entry.device) == copy_plan_touched_devices.end()) {
+                    copy_plan_touched_devices.push_back(entry.device);
                 }
                 profile.tensors_copied++;
                 profile.cuda_d2d_queued++;
@@ -606,7 +606,7 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) {
         if (any_queued && copy_plan_fn_sync_device) {
             const int64_t t_sync_start = profile_timing ? ggml_time_us() : 0;
             synced = true;
-            for (int device : touched_devices) {
+            for (int device : copy_plan_touched_devices) {
                 synced = copy_plan_fn_sync_device(device) && synced;
             }
             if (t_sync_start != 0) {
diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h
index b20c1d5fa54..522c6f53a82 100644
--- a/src/llama-memory-recurrent.h
+++ b/src/llama-memory-recurrent.h
@@ -163,6 +163,7 @@ class llama_memory_recurrent : public llama_memory_i {
     bool copy_plan_valid = false;
     bool copy_plan_cuda_fast = false;
     std::vector<recurrent_copy_plan_entry> copy_plan_entries;
+    std::vector<int> copy_plan_touched_devices;
     dflash_cuda_copy_d2d_fn_t copy_plan_fn_copy = nullptr;
     dflash_cuda_set_device_fn_t copy_plan_fn_set_device = nullptr;
     dflash_cuda_sync_device_fn_t copy_plan_fn_sync_device = nullptr;

From 0278d5bbd0fc6475f5d1cf2f8f5dd4389cdbf5d3 Mon Sep 17 00:00:00 2001
From: nycdubliner <nycdubliner@users.noreply.github.com>
Date: Sat, 23 May 2026 15:47:48 +0100
Subject: [PATCH 3/4] wip: phase 2 gpu conv replay and hidden capture

---
 common/speculative.cpp                      |  19 +-
 ggml/src/ggml-cuda/cross-ring-interleave.cu |  32 +++-
 src/llama-context.cpp                       | 183 +++++++++++++-------
 src/llama-context.h                         |  15 ++
 tests/test-dflash-plumbing.cpp              |   6 +-
 5 files changed, 181 insertions(+), 74 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index cf413307b45..d1d6228f694 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -102,6 +102,14 @@ static bool common_dflash_log_contract_verbose() {
     return v;
 }
 
+static bool common_dflash_allow_multi_gpu_tape() {
+    static const bool enabled = [] {
+        const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE");
+        return env && env[0] != '\0' && std::strcmp(env, "0") != 0;
+    }();
+    return enabled;
+}
+
 static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_context * ctx_dft) {
     if (!common_dflash_gpu_ring_env_enabled()) {
         LOG_INF("dflash: GPU cross ring disabled by GGML_DFLASH_GPU_RING=0; using CPU hidden capture\n");
@@ -111,9 +119,14 @@ static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_contex
     const int32_t n_tgt_devices = ctx_tgt ? llama_model_n_devices(llama_get_model(ctx_tgt)) : 1;
     const int32_t n_dft_devices = ctx_dft ? llama_model_n_devices(llama_get_model(ctx_dft)) : 1;
     if (n_tgt_devices > 1 || n_dft_devices > 1) {
-        LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); disabling GPU cross ring and graph hidden capture\n",
-                n_tgt_devices, n_dft_devices);
-        return false;
+        if (common_dflash_allow_multi_gpu_tape()) {
+            LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); enabling experimental GPU cross ring and graph hidden capture (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n",
+                    n_tgt_devices, n_dft_devices);
+        } else {
+            LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); disabling GPU cross ring and graph hidden capture\n",
+                    n_tgt_devices, n_dft_devices);
+            return false;
+        }
     }
 
     return true;
diff --git a/ggml/src/ggml-cuda/cross-ring-interleave.cu b/ggml/src/ggml-cuda/cross-ring-interleave.cu
index 2b024a3cab8..50bb13f50ef 100644
--- a/ggml/src/ggml-cuda/cross-ring-interleave.cu
+++ b/ggml/src/ggml-cuda/cross-ring-interleave.cu
@@ -194,11 +194,11 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d(
         return false;
     }
 #if CUDART_VERSION >= 10000 || defined(GGML_USE_HIP)
-    if (attr.type != cudaMemoryTypeDevice || attr.device != ring->device) {
+    if (attr.type != cudaMemoryTypeDevice) {
         return false;
     }
 #else
-    if (attr.memoryType != cudaMemoryTypeDevice || attr.device != ring->device) {
+    if (attr.memoryType != cudaMemoryTypeDevice) {
         return false;
     }
 #endif
@@ -219,14 +219,28 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d(
     }
 
     int first = ring->ring_size - pos;
-    if (first >= n_tokens) {
-        cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
-                        (size_t)n_tokens * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+    const bool is_peer = (attr.device != ring->device);
+    if (is_peer) {
+        if (first >= n_tokens) {
+            cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device,
+                                (size_t)n_tokens * stride, cudaStreamPerThread);
+        } else {
+            cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device,
+                                (size_t)first * stride, cudaStreamPerThread);
+            cudaMemcpyPeerAsync(dst, ring->device, src + (size_t)first * stride, attr.device,
+                                (size_t)(n_tokens - first) * stride, cudaStreamPerThread);
+        }
     } else {
-        cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
-                        (size_t)first * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
-        cudaMemcpyAsync(dst, src + (size_t)first * stride,
-                        (size_t)(n_tokens - first) * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+        (void)cudaSetDevice(ring->device);
+        if (first >= n_tokens) {
+            cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
+                            (size_t)n_tokens * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+        } else {
+            cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
+                            (size_t)first * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+            cudaMemcpyAsync(dst, src + (size_t)first * stride,
+                            (size_t)(n_tokens - first) * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+        }
     }
 
     return cudaGetLastError() == cudaSuccess;
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 33cd01945be..adad1ef8023 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2031,13 +2031,6 @@ void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) {
         dflash_capture->sync_backend_to_stream_backend = nullptr;
         return;
     }
-    // Hidden GPU capture requires same-device graph output tensors; not yet supported for multi-GPU layer splits
-    if (model.n_devices() > 1) {
-        dflash_capture->hidden_gpu.clear();
-        dflash_capture->fn_sync_backend_to_stream = nullptr;
-        dflash_capture->sync_backend_to_stream_backend = nullptr;
-        return;
-    }
     if (!llama_dflash_gpu_hidden_supported_arch(model.arch)) {
         dflash_capture->hidden_gpu.clear();
         dflash_capture->fn_sync_backend_to_stream = nullptr;
@@ -2069,6 +2062,16 @@ void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) {
     dflash_capture->sync_backend_to_stream_backend =
         dflash_capture->fn_sync_backend_to_stream ? gpu_backend : nullptr;
 
+    auto backend_for_dev = [&](ggml_backend_dev_t want_dev) -> ggml_backend_t {
+        for (auto & backend : backends) {
+            auto * dev = ggml_backend_get_device(backend.get());
+            if (dev == want_dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                return backend.get();
+            }
+        }
+        return nullptr;
+    };
+
     const int n_layers = (int) dflash_capture->layer_ids.size();
     const int64_t n_embd = model.hparams.n_embd;
 
@@ -2077,36 +2080,48 @@ void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) {
 
     size_t total_size = 0;
     for (int slot = 0; slot < n_slots; ++slot) {
-        const size_t ctx_mem = ggml_tensor_overhead() * ((size_t) n_layers + 2);
-        struct ggml_init_params ctx_params = { ctx_mem, nullptr, true };
-        struct ggml_context * hidden_ctx = ggml_init(ctx_params);
-        if (!hidden_ctx) {
-            LLAMA_LOG_WARN("%s: failed to create GPU hidden context for slot %d; using callback hidden fallback\n",
-                __func__, slot);
-            dflash_capture->hidden_gpu.clear();
-            return;
-        }
-
         auto hidden = std::make_unique<dflash_hidden_gpu>();
         hidden->layers.resize(n_layers);
         hidden->layer_ids = dflash_capture->layer_ids;
         hidden->n_embd = n_embd;
         hidden->max_tokens = max_tokens;
-        hidden->ctx = hidden_ctx;
 
         for (int i = 0; i < n_layers; ++i) {
-            hidden->layers[i] = ggml_new_tensor_2d(hidden_ctx, GGML_TYPE_F32, n_embd, (int64_t) max_tokens);
-        }
+            const int il = dflash_capture->layer_ids[i];
+            ggml_backend_dev_t layer_dev = model.dev_layer(il);
+            ggml_backend_t layer_backend = backend_for_dev(layer_dev);
+            if (!layer_backend) {
+                LLAMA_LOG_WARN("%s: no GPU backend for hidden layer %d device %s; using callback hidden fallback\n",
+                    __func__, il, layer_dev ? ggml_backend_dev_name(layer_dev) : "<null>");
+                dflash_capture->hidden_gpu.clear();
+                return;
+            }
 
-        hidden->buf = ggml_backend_alloc_ctx_tensors(hidden_ctx, gpu_backend);
-        if (!hidden->buf) {
-            LLAMA_LOG_WARN("%s: failed to allocate GPU hidden buffer for slot %d; using callback hidden fallback\n",
-                __func__, slot);
-            dflash_capture->hidden_gpu.clear();
-            return;
+            const size_t ctx_mem = ggml_tensor_overhead() * 2;
+            struct ggml_init_params ctx_params = { ctx_mem, nullptr, true };
+            struct ggml_context * hidden_ctx = ggml_init(ctx_params);
+            if (!hidden_ctx) {
+                LLAMA_LOG_WARN("%s: failed to create GPU hidden context for slot %d layer %d; using callback hidden fallback\n",
+                    __func__, slot, il);
+                dflash_capture->hidden_gpu.clear();
+                return;
+            }
+            hidden->ctxs.push_back(hidden_ctx);
+
+            ggml_tensor * t = ggml_new_tensor_2d(hidden_ctx, GGML_TYPE_F32, n_embd, (int64_t) max_tokens);
+            hidden->layers[i] = t;
+
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(hidden_ctx, layer_backend);
+            if (!buf) {
+                LLAMA_LOG_WARN("%s: failed to allocate GPU hidden buffer for slot %d layer %d; using callback hidden fallback\n",
+                    __func__, slot, il);
+                dflash_capture->hidden_gpu.clear();
+                return;
+            }
+            hidden->bufs.push_back(buf);
+            total_size += ggml_backend_buffer_get_size(buf);
         }
 
-        total_size += ggml_backend_buffer_get_size(hidden->buf);
         dflash_capture->hidden_gpu.push_back(std::move(hidden));
     }
 
@@ -2124,10 +2139,6 @@ bool llama_context::allocate_prefill_gpu(int n_slots, int max_tokens) {
     if (!dflash_capture->gpu_capture_enabled) {
         return false;
     }
-    // Prefill GPU allocation is not yet multi-GPU aware
-    if (model.n_devices() > 1) {
-        return false;
-    }
     if (!llama_dflash_gpu_hidden_supported_arch(model.arch)) {
         return false;
     }
@@ -2164,6 +2175,16 @@ bool llama_context::allocate_prefill_gpu(int n_slots, int max_tokens) {
             dflash_capture->fn_sync_backend_to_stream ? gpu_backend : nullptr;
     }
 
+    auto backend_for_dev = [&](ggml_backend_dev_t want_dev) -> ggml_backend_t {
+        for (auto & backend : backends) {
+            auto * dev = ggml_backend_get_device(backend.get());
+            if (dev == want_dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                return backend.get();
+            }
+        }
+        return nullptr;
+    };
+
     const int n_layers = (int) dflash_capture->layer_ids.size();
     const int64_t n_embd = model.hparams.n_embd;
 
@@ -2172,39 +2193,52 @@ bool llama_context::allocate_prefill_gpu(int n_slots, int max_tokens) {
 
     size_t total_size = 0;
     for (int slot = 0; slot < n_slots; ++slot) {
-        const size_t ctx_mem = ggml_tensor_overhead() * ((size_t) n_layers + 2);
-        struct ggml_init_params ctx_params = { ctx_mem, nullptr, true };
-        struct ggml_context * hidden_ctx = ggml_init(ctx_params);
-        if (!hidden_ctx) {
-            LLAMA_LOG_WARN("%s: failed to create prefill GPU context for slot %d; using callback fallback\n",
-                __func__, slot);
-            dflash_capture->prefill_gpu.clear();
-            dflash_capture->prefill_gpu_max_tokens = 0;
-            return false;
-        }
-
         auto hidden = std::make_unique<dflash_hidden_gpu>();
         hidden->layers.resize(n_layers);
         hidden->layer_ids = dflash_capture->layer_ids;
         hidden->n_embd = n_embd;
         hidden->max_tokens = max_tokens;
         hidden->n_tokens = 0;
-        hidden->ctx = hidden_ctx;
 
         for (int i = 0; i < n_layers; ++i) {
-            hidden->layers[i] = ggml_new_tensor_2d(hidden_ctx, GGML_TYPE_F32, n_embd, (int64_t) max_tokens);
-        }
+            const int il = dflash_capture->layer_ids[i];
+            ggml_backend_dev_t layer_dev = model.dev_layer(il);
+            ggml_backend_t layer_backend = backend_for_dev(layer_dev);
+            if (!layer_backend) {
+                LLAMA_LOG_WARN("%s: no GPU backend for prefill layer %d device %s; using callback fallback\n",
+                    __func__, il, layer_dev ? ggml_backend_dev_name(layer_dev) : "<null>");
+                dflash_capture->prefill_gpu.clear();
+                dflash_capture->prefill_gpu_max_tokens = 0;
+                return false;
+            }
 
-        hidden->buf = ggml_backend_alloc_ctx_tensors(hidden_ctx, gpu_backend);
-        if (!hidden->buf) {
-            LLAMA_LOG_WARN("%s: failed to allocate prefill GPU buffer for slot %d; using callback fallback\n",
-                __func__, slot);
-            dflash_capture->prefill_gpu.clear();
-            dflash_capture->prefill_gpu_max_tokens = 0;
-            return false;
+            const size_t ctx_mem = ggml_tensor_overhead() * 2;
+            struct ggml_init_params ctx_params = { ctx_mem, nullptr, true };
+            struct ggml_context * hidden_ctx = ggml_init(ctx_params);
+            if (!hidden_ctx) {
+                LLAMA_LOG_WARN("%s: failed to create prefill GPU context for slot %d layer %d; using callback fallback\n",
+                    __func__, slot, il);
+                dflash_capture->prefill_gpu.clear();
+                dflash_capture->prefill_gpu_max_tokens = 0;
+                return false;
+            }
+            hidden->ctxs.push_back(hidden_ctx);
+
+            ggml_tensor * t = ggml_new_tensor_2d(hidden_ctx, GGML_TYPE_F32, n_embd, (int64_t) max_tokens);
+            hidden->layers[i] = t;
+
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(hidden_ctx, layer_backend);
+            if (!buf) {
+                LLAMA_LOG_WARN("%s: failed to allocate prefill GPU buffer for slot %d layer %d; using callback fallback\n",
+                    __func__, slot, il);
+                dflash_capture->prefill_gpu.clear();
+                dflash_capture->prefill_gpu_max_tokens = 0;
+                return false;
+            }
+            hidden->bufs.push_back(buf);
+            total_size += ggml_backend_buffer_get_size(buf);
         }
 
-        total_size += ggml_backend_buffer_get_size(hidden->buf);
         dflash_capture->prefill_gpu.push_back(std::move(hidden));
     }
 
@@ -3039,17 +3073,22 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent,
     if (!dflash_capture || !mem_recurrent || n_accepted <= 0) {
         return false;
     }
-    if (model.n_devices() > 1) {
-        return false;
-    }
 
     ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg();
     if (!cuda_reg) {
         return false;
     }
+    using ptr_device_fn_t = bool (*)(const void *, int *);
+    using set_device_fn_t = bool (*)(int);
+    using sync_device_fn_t = bool (*)(int);
     using rebuild_fn_t = bool (*)(void *, const void *, int, int, int);
+
+    auto fn_ptr_device = (ptr_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_ptr_device");
+    auto fn_set_device = (set_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_set_device");
+    auto fn_sync_device = (sync_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_synchronize_device");
     auto fn_rebuild = (rebuild_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_rebuild_conv_state");
-    if (!fn_rebuild) {
+
+    if (!fn_ptr_device || !fn_set_device || !fn_sync_device || !fn_rebuild) {
         return false;
     }
 
@@ -3065,10 +3104,13 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent,
         const void * qkv;
         int conv_ch;
         int conv_window;
+        int device;
     };
     std::vector<conv_launch> launches;
     launches.reserve(rec_ids.size());
 
+    bool touched_devices[32] = {false};
+
     for (size_t li = 0; li < rec_ids.size(); ++li) {
         const int il = rec_ids[li];
         if (li >= gpu_tape->layers.size()) {
@@ -3092,13 +3134,24 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent,
             return false;
         }
 
+        int r_dev = -1;
+        if (!fn_ptr_device(r_tensor->data, &r_dev)) {
+            return false;
+        }
+        if (r_dev < 0 || r_dev >= 32) {
+            return false;
+        }
+
         const size_t r_offset = (size_t) cell_idx * n_embd_r * ggml_element_size(r_tensor);
         launches.push_back({
             (char *) r_tensor->data + r_offset,
             qkv_tensor->data,
             (int) conv_ch_i64,
             (int) conv_window_i64,
+            r_dev,
         });
+
+        touched_devices[r_dev] = true;
     }
 
     if (launches.empty()) {
@@ -3107,10 +3160,22 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent,
 
     const int64_t t_gpu_start_us = dflash_capture->profile ? ggml_time_us() : 0;
     for (const auto & launch : launches) {
+        if (!fn_set_device(launch.device)) {
+            return false;
+        }
         if (!fn_rebuild(launch.r_state, launch.qkv, n_accepted, launch.conv_ch, launch.conv_window)) {
             return false;
         }
     }
+
+    for (int dev = 0; dev < 32; ++dev) {
+        if (touched_devices[dev]) {
+            if (!fn_sync_device(dev)) {
+                return false;
+            }
+        }
+    }
+
     if (dflash_capture->profile) {
         const uint64_t elapsed = ggml_time_us() - t_gpu_start_us;
         dflash_capture->profile_replay_conv_enqueue_us += elapsed;
@@ -3302,7 +3367,7 @@ void llama_context::tape_replay_conv(llama_memory_recurrent * mem_recurrent, int
     auto & tape_layers   = dflash_capture->tape_layers;
     const uint32_t n_embd_r = hparams.n_embd_r();
 
-    if (model.n_devices() <= 1 && tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)) {
+    if (tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)) {
         return;
     }
     if (model.n_devices() > 1 && tape_replay_conv_gpu_from_cpu_tape(mem_recurrent, cell_idx, n_accepted, seq_id)) {
@@ -5799,7 +5864,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     }
                     dflash_capture->ubatch = &ubatch;
                 } else {
-                    dflash_gpu_capture_ready = model.n_devices() <= 1 && dflash_capture->gpu_capture_enabled;
+                    dflash_gpu_capture_ready = (model.n_devices() <= 1 || dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;
                     const bool dflash_gpu_tape_ready_allowed =
                         dflash_gpu_capture_ready ||
                         (model.n_devices() > 1 && dflash_capture->gpu_capture_enabled &&
diff --git a/src/llama-context.h b/src/llama-context.h
index 2a167e490f3..30381633c0e 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -125,9 +125,18 @@ struct dflash_tape_gpu {
     }
 };
 
+struct dflash_hidden_gpu_layer {
+    ggml_tensor * hidden = nullptr;
+    ggml_backend_buffer_t buf = nullptr;
+    ggml_context * ctx = nullptr;
+    ggml_backend_dev_t dev = nullptr;
+};
+
 struct dflash_hidden_gpu {
     std::vector<ggml_tensor *> layers;  // one [n_embd, max_tokens] tensor per captured layer
     std::vector<int32_t> layer_ids;
+    std::vector<ggml_backend_buffer_t> bufs;
+    std::vector<ggml_context *> ctxs;
     ggml_backend_buffer_t buf = nullptr;
     ggml_context * ctx = nullptr;
     int64_t n_embd = 0;
@@ -135,6 +144,12 @@ struct dflash_hidden_gpu {
     int n_tokens = 0;
 
     ~dflash_hidden_gpu() {
+        for (auto b : bufs) {
+            if (b) ggml_backend_buffer_free(b);
+        }
+        for (auto c : ctxs) {
+            if (c) ggml_free(c);
+        }
         if (buf) ggml_backend_buffer_free(buf);
         if (ctx) ggml_free(ctx);
     }
diff --git a/tests/test-dflash-plumbing.cpp b/tests/test-dflash-plumbing.cpp
index fdcd9cabe29..d2828f1c960 100644
--- a/tests/test-dflash-plumbing.cpp
+++ b/tests/test-dflash-plumbing.cpp
@@ -359,11 +359,11 @@ int main(int argc, char ** argv) {
     ok &= expect(llama_h.find("llama_set_dflash_gpu_capture") != std::string::npos, "public DFlash API must expose GPU capture gating");
     ok &= expect(context_cpp.find("allocate_hidden_gpu(n_slots, max_tokens)") != std::string::npos, "GPU tape allocation must allocate hidden capture buffers too");
     ok &= expect(context_cpp.find("dflash_skip_eval_callback ? nullptr : dflash_eval_callback") != std::string::npos, "eligible DFlash verifier graph must disable eval callback, including suppressed no-intersection ubatches");
-    ok &= expect(context_cpp.find("const bool dflash_graph_tape_ready") != std::string::npos, "DFlash decode must gate GPU tape copies separately from hidden capture");
+    ok &= expect(context_cpp.find("bool dflash_graph_tape_ready") != std::string::npos, "DFlash decode must gate GPU tape copies separately from hidden capture");
     ok &= expect(context_cpp.find("dflash_graph_hidden_ready =\n                            !dflash_capture->hidden_gpu.empty()") != std::string::npos, "GPU hidden graph capture must not depend on active tape recording");
     ok &= expect(context_cpp.find("dflash_tape_gpu * graph_tp = dflash_graph_tape_ready ? tp : nullptr") != std::string::npos, "GPU tape graph pointers must be disabled when tape recording is inactive");
     ok &= expect(context_cpp.find("multi-GPU target detected") != std::string::npos, "multi-GPU target must fall back from graph-embedded DFlash GPU capture");
-    ok &= expect(context_cpp.find("const bool dflash_gpu_capture_ready = model.n_devices() <= 1 && dflash_capture->gpu_capture_enabled") != std::string::npos, "DFlash graph capture must be gated to single-GPU target placement and explicit GPU capture policy");
+    ok &= expect(context_cpp.find("dflash_gpu_capture_ready = (model.n_devices() <= 1 || dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;") != std::string::npos, "DFlash graph capture must be gated to single-GPU or multi-GPU tape mode and explicit GPU capture policy");
     ok &= expect(context_cpp.find("const bool multi_gpu_target = model.n_devices() > 1;") != std::string::npos, "DFlash replay must detect multi-GPU target placement before choosing GPU replay");
     ok &= expect(context_cpp.find("exact CUDA DFlash replay unavailable, using CPU recurrent replay fallback") != std::string::npos, "DFlash replay must log only the multi-GPU CPU replay fallback");
     ok &= expect(context_cpp.find("tape_replay_cpu(mem_recurrent, cell_idx, n_accepted);\n        tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id);") != std::string::npos,
@@ -413,7 +413,7 @@ int main(int argc, char ** argv) {
     ok &= expect(cuda_cpp.find("source buffer is not visible to CUDA backend device") != std::string::npos,
         "CUDA backend assert diagnostics must report the non-local source tensor");
     ok &= expect(context_h.find("tape_replay_conv_gpu") != std::string::npos, "DFlash must declare GPU conv rebuild fast path");
-    ok &= expect(context_cpp.find("model.n_devices() <= 1 && tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)") != std::string::npos, "conv replay must try GPU rebuild only for single-GPU target placement");
+    ok &= expect(context_cpp.find("tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)") != std::string::npos, "conv replay must try GPU rebuild");
     ok &= expect(context_cpp.find("const bool use_async_backend = gpu_backend && model.n_devices() <= 1;") != std::string::npos, "conv replay CPU fallback must use async backend copies only for single-GPU target placement");
     ok &= expect(context_cpp.find("ggml_backend_tensor_get_async(gpu_backend, r_tensor") == std::string::npos, "conv replay must not read split recurrent state through the first GPU backend");
     ok &= expect(context_cpp.find("ggml_backend_tensor_set_async(gpu_backend, d.r_tensor") == std::string::npos, "conv replay must not write split recurrent state through the first GPU backend");

From 7fd860667e9645d24bc71d93106ec03a763e4c66 Mon Sep 17 00:00:00 2001
From: nycdubliner <nycdubliner@users.noreply.github.com>
Date: Sat, 23 May 2026 16:08:12 +0100
Subject: [PATCH 4/4] spec-dec: address Phase 2 review feedback for ROCm
 Multi-GPU DFlash

- Auto-enable peer access dynamically in dflash_cross_ring_gpu_write_d2d to avoid manual setup.
- Clean up unused struct dflash_hidden_gpu_layer and dead buf/ctx fields in dflash_hidden_gpu.
- Resolve speculative.cpp duplication by introducing a public llama_dflash_allow_multi_gpu_tape() API.
- Replace magic array sizes of 32 with GGML_CUDA_MAX_DEVICES for touched devices tracking.
- Document unconditional call to tape_replay_conv_gpu as internally gated.
- Update test-dflash-plumbing.cpp regex assertions for renamed gating function.
---
 common/speculative.cpp                      | 11 +-------
 ggml/src/ggml-cuda/cross-ring-interleave.cu | 31 +++++++++++++++++++++
 include/llama.h                             |  6 ++--
 src/llama-context.cpp                       | 27 +++++++++++-------
 src/llama-context.h                         | 11 --------
 tests/test-dflash-plumbing.cpp              |  2 +-
 6 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index d1d6228f694..40df713cc00 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -101,15 +101,6 @@ static bool common_dflash_log_contract_verbose() {
     }();
     return v;
 }
-
-static bool common_dflash_allow_multi_gpu_tape() {
-    static const bool enabled = [] {
-        const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE");
-        return env && env[0] != '\0' && std::strcmp(env, "0") != 0;
-    }();
-    return enabled;
-}
-
 static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_context * ctx_dft) {
     if (!common_dflash_gpu_ring_env_enabled()) {
         LOG_INF("dflash: GPU cross ring disabled by GGML_DFLASH_GPU_RING=0; using CPU hidden capture\n");
@@ -119,7 +110,7 @@ static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_contex
     const int32_t n_tgt_devices = ctx_tgt ? llama_model_n_devices(llama_get_model(ctx_tgt)) : 1;
     const int32_t n_dft_devices = ctx_dft ? llama_model_n_devices(llama_get_model(ctx_dft)) : 1;
     if (n_tgt_devices > 1 || n_dft_devices > 1) {
-        if (common_dflash_allow_multi_gpu_tape()) {
+        if (llama_dflash_allow_multi_gpu_tape()) {
             LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); enabling experimental GPU cross ring and graph hidden capture (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n",
                     n_tgt_devices, n_dft_devices);
         } else {
diff --git a/ggml/src/ggml-cuda/cross-ring-interleave.cu b/ggml/src/ggml-cuda/cross-ring-interleave.cu
index 50bb13f50ef..87fe15517af 100644
--- a/ggml/src/ggml-cuda/cross-ring-interleave.cu
+++ b/ggml/src/ggml-cuda/cross-ring-interleave.cu
@@ -7,6 +7,11 @@
 #include <cstdio>
 #include <cstring>
 
+#ifndef GGML_CUDA_MAX_DEVICES
+#define GGML_CUDA_MAX_DEVICES 16
+#endif
+
+
 // GPU cross-attention ring buffer for DFlash speculative decoding.
 
 static bool dflash_cuda_debug_enabled() {
@@ -221,6 +226,32 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d(
     int first = ring->ring_size - pos;
     const bool is_peer = (attr.device != ring->device);
     if (is_peer) {
+        static bool peer_enabled[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_DEVICES] = { { false } };
+        if (attr.device >= 0 && attr.device < GGML_CUDA_MAX_DEVICES &&
+            ring->device >= 0 && ring->device < GGML_CUDA_MAX_DEVICES) {
+            if (!peer_enabled[attr.device][ring->device]) {
+                int can_access = 0;
+                cudaDeviceCanAccessPeer(&can_access, attr.device, ring->device);
+                if (can_access) {
+                    (void)cudaSetDevice(attr.device);
+                    cudaDeviceEnablePeerAccess(ring->device, 0);
+                    cudaGetLastError();
+                }
+                peer_enabled[attr.device][ring->device] = true;
+            }
+            if (!peer_enabled[ring->device][attr.device]) {
+                int can_access = 0;
+                cudaDeviceCanAccessPeer(&can_access, ring->device, attr.device);
+                if (can_access) {
+                    (void)cudaSetDevice(ring->device);
+                    cudaDeviceEnablePeerAccess(attr.device, 0);
+                    cudaGetLastError();
+                }
+                peer_enabled[ring->device][attr.device] = true;
+            }
+            (void)cudaSetDevice(ring->device);
+        }
+
         if (first >= n_tokens) {
             cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device,
                                 (size_t)n_tokens * stride, cudaStreamPerThread);
diff --git a/include/llama.h b/include/llama.h
index c854b8a484f..3e479aaa2e6 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1095,11 +1095,11 @@ extern "C" {
     // if layer_ids are configured.  GPU buffers and tape state are preserved.
     LLAMA_API void llama_set_dflash_capture_active(struct llama_context * ctx, bool active);
 
-    // DFlash: enable graph-embedded GPU hidden/tape capture for target decode.
-    // Disable this before decode when the drafter cannot consume GPU cross-ring
-    // tensors directly, so the eval callback keeps CPU hidden buffers populated.
     LLAMA_API void llama_set_dflash_gpu_capture(struct llama_context * ctx, bool enabled);
 
+    // DFlash: check whether multi-GPU recurrent tape allocation is allowed
+    LLAMA_API bool llama_dflash_allow_multi_gpu_tape();
+
     // DFlash: set drafter sampling temperature (Gumbel-max trick)
     // temp=0: greedy argmax (default), temp>0: sample from softmax(logits/temp)
     LLAMA_API void llama_set_dflash_sample_temp(struct llama_context * ctx, float temp);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index adad1ef8023..1139174f958 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -15,6 +15,11 @@
 #include "dflash-profile.h"
 #include "llama.h"
 
+#ifndef GGML_CUDA_MAX_DEVICES
+#define GGML_CUDA_MAX_DEVICES 16
+#endif
+
+
 #include "ggml-alloc.h"
 
 #include <algorithm>
@@ -1138,7 +1143,7 @@ static bool dflash_profile_sync_split_enabled() {
 }
 
 // Controls the multi-GPU speculative decoding recurrent tape path. Independent of GGML_DFLASH_GPU_RING.
-static bool dflash_allow_multi_gpu_tape() {
+bool llama_dflash_allow_multi_gpu_tape() {
     static const bool enabled = [] {
         const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE");
         return env && env[0] != '\0' && std::strcmp(env, "0") != 0;
@@ -1146,6 +1151,7 @@ static bool dflash_allow_multi_gpu_tape() {
     return enabled;
 }
 
+
 static void dflash_log_decode_seq_state(
         const char * where,
         const llama_ubatch & ubatch,
@@ -1693,9 +1699,9 @@ void llama_context::set_dflash_gpu_capture(bool enabled) {
     if (!dflash_capture) {
         return;
     }
-
     dflash_capture->gpu_capture_enabled =
-        enabled || (model.n_devices() > 1 && dflash_allow_multi_gpu_tape());
+        enabled || (model.n_devices() > 1 && llama_dflash_allow_multi_gpu_tape());
+
     if (!enabled && dflash_capture->gpu_capture_enabled) {
         LLAMA_LOG_INFO("%s: forcing GPU capture enabled for multi-GPU tape support (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n", __func__);
     }
@@ -1874,8 +1880,7 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) {
         dflash_capture->tapes.clear();
         return;
     }
-
-    if (model.n_devices() > 1 && !dflash_allow_multi_gpu_tape()) {
+    if (model.n_devices() > 1 && !llama_dflash_allow_multi_gpu_tape()) {
         dflash_capture->hidden_gpu.clear();
         dflash_capture->tapes.clear();
         if (!dflash_capture->multi_gpu_capture_fallback_logged) {
@@ -3109,7 +3114,7 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent,
     std::vector<conv_launch> launches;
     launches.reserve(rec_ids.size());
 
-    bool touched_devices[32] = {false};
+    bool touched_devices[GGML_CUDA_MAX_DEVICES] = {false};
 
     for (size_t li = 0; li < rec_ids.size(); ++li) {
         const int il = rec_ids[li];
@@ -3138,7 +3143,7 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent,
         if (!fn_ptr_device(r_tensor->data, &r_dev)) {
             return false;
         }
-        if (r_dev < 0 || r_dev >= 32) {
+        if (r_dev < 0 || r_dev >= GGML_CUDA_MAX_DEVICES) {
             return false;
         }
 
@@ -3168,7 +3173,7 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent,
         }
     }
 
-    for (int dev = 0; dev < 32; ++dev) {
+    for (int dev = 0; dev < GGML_CUDA_MAX_DEVICES; ++dev) {
         if (touched_devices[dev]) {
             if (!fn_sync_device(dev)) {
                 return false;
@@ -3367,6 +3372,8 @@ void llama_context::tape_replay_conv(llama_memory_recurrent * mem_recurrent, int
     auto & tape_layers   = dflash_capture->tape_layers;
     const uint32_t n_embd_r = hparams.n_embd_r();
 
+    // Attempt GPU conv replay path unconditionally. It handles its own gating checks internally
+    // and returns false to fallback if multi-device tape, GPU backend, or pointers are not eligible.
     if (tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)) {
         return;
     }
@@ -5864,11 +5871,11 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     }
                     dflash_capture->ubatch = &ubatch;
                 } else {
-                    dflash_gpu_capture_ready = (model.n_devices() <= 1 || dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;
+                    dflash_gpu_capture_ready = (model.n_devices() <= 1 || llama_dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;
                     const bool dflash_gpu_tape_ready_allowed =
                         dflash_gpu_capture_ready ||
                         (model.n_devices() > 1 && dflash_capture->gpu_capture_enabled &&
-                         dflash_allow_multi_gpu_tape());
+                         llama_dflash_allow_multi_gpu_tape());
                     dflash_capture->ubatch = &ubatch;
                     cparams.hidden_gpu_n_seqs = 0;
                     dflash_clear_prefill_cparams(cparams);
diff --git a/src/llama-context.h b/src/llama-context.h
index 30381633c0e..82b7b678b87 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -125,20 +125,11 @@ struct dflash_tape_gpu {
     }
 };
 
-struct dflash_hidden_gpu_layer {
-    ggml_tensor * hidden = nullptr;
-    ggml_backend_buffer_t buf = nullptr;
-    ggml_context * ctx = nullptr;
-    ggml_backend_dev_t dev = nullptr;
-};
-
 struct dflash_hidden_gpu {
     std::vector<ggml_tensor *> layers;  // one [n_embd, max_tokens] tensor per captured layer
     std::vector<int32_t> layer_ids;
     std::vector<ggml_backend_buffer_t> bufs;
     std::vector<ggml_context *> ctxs;
-    ggml_backend_buffer_t buf = nullptr;
-    ggml_context * ctx = nullptr;
     int64_t n_embd = 0;
     int max_tokens = 0;
     int n_tokens = 0;
@@ -150,8 +141,6 @@ struct dflash_hidden_gpu {
         for (auto c : ctxs) {
             if (c) ggml_free(c);
         }
-        if (buf) ggml_backend_buffer_free(buf);
-        if (ctx) ggml_free(ctx);
     }
 };
 
diff --git a/tests/test-dflash-plumbing.cpp b/tests/test-dflash-plumbing.cpp
index d2828f1c960..f74bbab938a 100644
--- a/tests/test-dflash-plumbing.cpp
+++ b/tests/test-dflash-plumbing.cpp
@@ -363,7 +363,7 @@ int main(int argc, char ** argv) {
     ok &= expect(context_cpp.find("dflash_graph_hidden_ready =\n                            !dflash_capture->hidden_gpu.empty()") != std::string::npos, "GPU hidden graph capture must not depend on active tape recording");
     ok &= expect(context_cpp.find("dflash_tape_gpu * graph_tp = dflash_graph_tape_ready ? tp : nullptr") != std::string::npos, "GPU tape graph pointers must be disabled when tape recording is inactive");
     ok &= expect(context_cpp.find("multi-GPU target detected") != std::string::npos, "multi-GPU target must fall back from graph-embedded DFlash GPU capture");
-    ok &= expect(context_cpp.find("dflash_gpu_capture_ready = (model.n_devices() <= 1 || dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;") != std::string::npos, "DFlash graph capture must be gated to single-GPU or multi-GPU tape mode and explicit GPU capture policy");
+    ok &= expect(context_cpp.find("dflash_gpu_capture_ready = (model.n_devices() <= 1 || llama_dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;") != std::string::npos, "DFlash graph capture must be gated to single-GPU or multi-GPU tape mode and explicit GPU capture policy");
     ok &= expect(context_cpp.find("const bool multi_gpu_target = model.n_devices() > 1;") != std::string::npos, "DFlash replay must detect multi-GPU target placement before choosing GPU replay");
     ok &= expect(context_cpp.find("exact CUDA DFlash replay unavailable, using CPU recurrent replay fallback") != std::string::npos, "DFlash replay must log only the multi-GPU CPU replay fallback");
     ok &= expect(context_cpp.find("tape_replay_cpu(mem_recurrent, cell_idx, n_accepted);\n        tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id);") != std::string::npos,