From da3b7110db3bb2147d1d2aeafc96a5d001659496 Mon Sep 17 00:00:00 2001 From: nycdubliner Date: Sat, 23 May 2026 14:08:56 +0100 Subject: [PATCH 1/4] spec-dec: support device-aware recurrent GPU tape placement on ROCm multi-GPU For multi-GPU ROCm/HIP setups, allocating all spec-dec tape buffers on a single device (e.g. ROCm0) causes execution failures or severe bottlenecks when recurrent layers are split across devices (e.g. ROCm1 cannot access ROCm0 source views). This commit enables device-aware recurrent GPU tape placement: - Tape buffers are allocated per recurrent layer on the exact physical GPU device assigned to compile/run that layer. - During DFlash tape capture and direct GPU replay, state memory is verified to reside on the local physical GPU before executing. - Non-local device memory accesses and associated ROCm IPC/peer faults are avoided. - A fallback path to CPU tape capture and replay is preserved when `GGML_DFLASH_ALLOW_MULTI_GPU_TAPE=0` is set or on single-GPU setups. - Retained optimized scheduler callback-mode synchronization checks to minimize overhead when Hidden/GDN callback evaluation is enabled. --- ggml/src/ggml-backend.cpp | 14 +- src/llama-context.cpp | 274 ++++++++++++++++++++------------- src/llama-context.h | 7 + src/llama-memory-recurrent.cpp | 51 ++++-- src/llama-memory-recurrent.h | 2 +- 5 files changed, 222 insertions(+), 126 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index d9f8aaec52f..eb59e427743 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -14,6 +14,7 @@ #include "ggml-impl.h" #include +#include #include #include #include @@ -1661,7 +1662,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface - if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { + const bool async_ok = split_backend->iface.cpy_tensor_async && split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy); + if (!async_ok) { ggml_backend_synchronize(input_backend); if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); @@ -1702,11 +1704,13 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s return ec; } - // TODO: pass backend to the callback, then the user can decide if they want to synchronize - ggml_backend_synchronize(split_backend); + if (need) { + // TODO: pass backend to the callback, then the user can decide if they want to synchronize + ggml_backend_synchronize(split_backend); - if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { - break; + if (!sched->callback_eval(t, false, sched->callback_eval_user_data)) { + break; + } } j0 = j1; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f443c00732f..637471dcab7 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include // @@ -31,6 +32,22 @@ static llama_memory_recurrent * get_recurrent_mem(llama_memory_t mem); +static ggml_backend_reg_t dflash_gpu_backend_reg() { + ggml_backend_reg_t reg = ggml_backend_reg_by_name("CUDA"); + if (!reg) { + reg = ggml_backend_reg_by_name("ROCm"); + } + return reg; +} + +static bool dflash_is_cuda_compatible_tensor(const ggml_tensor * t) { + if (!t || !t->data || !t->buffer || ggml_backend_buffer_is_host(t->buffer)) { + return false; + } + const char * name = ggml_backend_buffer_name(t->buffer); + return name && (std::strncmp(name, "CUDA", 4) == 0 || std::strncmp(name, "ROCm", 4) == 0); +} + llama_context::llama_context( const llama_model & model, llama_context_params params) : @@ -1121,6 +1138,14 @@ static bool dflash_profile_sync_split_enabled() { return enabled; } +static bool dflash_allow_multi_gpu_tape() { + static const bool enabled = [] { + const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE"); + return env && env[0] != '\0' && std::strcmp(env, "0") != 0; + }(); + return enabled; +} + static void dflash_log_decode_seq_state( const char * where, const llama_ubatch & ubatch, @@ -1383,17 +1408,17 @@ static bool dflash_eval_callback(struct ggml_tensor * t, bool ask, void * user_d // (called at the top of decode()) zeroes buf.n_tokens for every slot before // the ubatch loop, so each slot's buffer accumulates only that slot's tokens // (in their ubatch order) across all ubatches in this llama_decode() call. - if (h_it != cap->hidden_name_idx.end()) { - if (cap->profile) { - cap->profile_cb_read++; - cap->profile_cb_hidden_read++; - dflash_profile_cb_name(*cap, t, "read"); - } - const int64_t new_embd = t->ne[0]; - const int64_t new_n = t->ne[1]; - const size_t h_idx = h_it->second; + if (h_it != cap->hidden_name_idx.end()) { + if (cap->profile) { + cap->profile_cb_read++; + cap->profile_cb_hidden_read++; + dflash_profile_cb_name(*cap, t, "read"); + } + const int64_t new_embd = t->ne[0]; + const int64_t new_n = t->ne[1]; + const size_t h_idx = h_it->second; - if (n_seqs_unq <= 1) { + if (n_seqs_unq <= 1) { // single-seq fast path: route the whole tensor to one slot const int slot = ub ? ub->seq_id_unq[0] : -1; auto * sh = cap->slot_hiddens(slot); @@ -1514,6 +1539,7 @@ static bool dflash_eval_callback(struct ggml_tensor * t, bool ask, void * user_d return true; } + void llama_context::set_dflash_sample_temp(float temp) { cparams.dflash_sample_temp = temp; } @@ -1668,7 +1694,8 @@ void llama_context::set_dflash_gpu_capture(bool enabled) { return; } - dflash_capture->gpu_capture_enabled = enabled; + dflash_capture->gpu_capture_enabled = + enabled || (model.n_devices() > 1 && dflash_allow_multi_gpu_tape()); // Always clear the graph-embedded capture cparams when changing mode; // the decode loop will repopulate them if GPU capture is active and @@ -1845,7 +1872,7 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { return; } - if (model.n_devices() > 1) { + if (model.n_devices() > 1 && !dflash_allow_multi_gpu_tape()) { dflash_capture->hidden_gpu.clear(); dflash_capture->tapes.clear(); if (!dflash_capture->multi_gpu_capture_fallback_logged) { @@ -1855,6 +1882,14 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { } return; } + if (model.n_devices() > 1 && !dflash_capture->multi_gpu_capture_fallback_logged) { + // ROCm/HIP device-placement constraint: For multi-GPU targets, tape buffers must be allocated + // on the exact GPU device running the respective recurrent layer to ensure local device visibility + // during graph executions and direct GPU replay enqueues, avoiding nonlocal buffer access failures. + LLAMA_LOG_INFO("%s: multi-GPU target detected (%zu devices); enabling experimental graph GPU tape while hidden capture stays on eval callback\n", + __func__, model.n_devices()); + dflash_capture->multi_gpu_capture_fallback_logged = true; + } allocate_hidden_gpu(n_slots, max_tokens); @@ -1868,18 +1903,15 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { return; } - // find GPU backend - ggml_backend_t gpu_backend = nullptr; - for (auto & backend : backends) { - auto * dev = ggml_backend_get_device(backend.get()); - if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { - gpu_backend = backend.get(); - break; + auto backend_for_dev = [&](ggml_backend_dev_t want_dev) -> ggml_backend_t { + for (auto & backend : backends) { + auto * dev = ggml_backend_get_device(backend.get()); + if (dev == want_dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + return backend.get(); + } } - } - if (!gpu_backend) { - return; // no GPU, fall back to CPU tape via eval callback - } + return nullptr; + }; const auto & hparams = model.hparams; const auto & rec_ids = dflash_capture->recurrent_layer_ids; @@ -1899,27 +1931,35 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { dflash_capture->tapes.reserve(n_slots); size_t total_size = 0; + std::map layers_by_dev; for (int slot = 0; slot < n_slots; ++slot) { - // allocate ggml context for this slot's tensor descriptors - size_t ctx_mem = ggml_tensor_overhead() * (n_rec * 5 + 2); - struct ggml_init_params ctx_params = { ctx_mem, nullptr, true }; - struct ggml_context * tape_ctx = ggml_init(ctx_params); - if (!tape_ctx) { - LLAMA_LOG_WARN("%s: failed to create GPU tape context for slot %d, falling back to CPU tape\n", - __func__, slot); - dflash_capture->tapes.clear(); - return; - } - auto tape = std::make_unique(); tape->layers.resize(n_rec); tape->layer_ids = dflash_capture->recurrent_layer_ids; tape->max_tokens = max_tokens; - tape->ctx = tape_ctx; for (int li = 0; li < n_rec; ++li) { const int il = rec_ids[li]; + ggml_backend_dev_t layer_dev = model.dev_layer(il); + ggml_backend_t layer_backend = backend_for_dev(layer_dev); + if (!layer_backend) { + LLAMA_LOG_WARN("%s: no GPU backend for recurrent layer %d device %s; falling back to CPU tape\n", + __func__, il, layer_dev ? ggml_backend_dev_name(layer_dev) : ""); + dflash_capture->tapes.clear(); + return; + } + + const size_t ctx_mem = ggml_tensor_overhead() * 7; + struct ggml_init_params ctx_params = { ctx_mem, nullptr, true }; + struct ggml_context * tape_ctx = ggml_init(ctx_params); + if (!tape_ctx) { + LLAMA_LOG_WARN("%s: failed to create GPU tape context for slot %d layer %d, falling back to CPU tape\n", + __func__, slot, il); + dflash_capture->tapes.clear(); + return; + } + const auto * conv_kernel = model.layers[il].ssm_conv1d; GGML_ASSERT(conv_kernel != nullptr); const int64_t conv_window = conv_kernel->ne[0] - 1; @@ -1932,25 +1972,33 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { tl.gate = ggml_new_tensor_3d(tape_ctx, GGML_TYPE_F32, (int64_t)1, H_v, (int64_t)max_tokens); tl.beta = ggml_new_tensor_3d(tape_ctx, GGML_TYPE_F32, (int64_t)1, H_v, (int64_t)max_tokens); tl.qkv = ggml_new_tensor_2d(tape_ctx, GGML_TYPE_F32, conv_ch, (int64_t)max_tokens); - } - tape->buf = ggml_backend_alloc_ctx_tensors(tape_ctx, gpu_backend); + tl.ctx = tape_ctx; + tl.dev = layer_dev; + tl.buf = ggml_backend_alloc_ctx_tensors(tape_ctx, layer_backend); - if (!tape->buf) { - LLAMA_LOG_WARN("%s: failed to allocate GPU tape buffer for slot %d, falling back to CPU tape\n", - __func__, slot); - dflash_capture->tapes.clear(); - return; + if (!tl.buf) { + LLAMA_LOG_WARN("%s: failed to allocate GPU tape buffer for slot %d layer %d device %s, falling back to CPU tape\n", + __func__, slot, il, ggml_backend_dev_name(layer_dev)); + dflash_capture->tapes.clear(); + return; + } + + total_size += ggml_backend_buffer_get_size(tl.buf); + layers_by_dev[layer_dev] += 1; } - total_size += ggml_backend_buffer_get_size(tape->buf); dflash_capture->tapes.push_back(std::move(tape)); } dflash_capture->active_tape_idx = 0; - LLAMA_LOG_INFO("%s: allocated GPU tape buffers: %.1f MB total (%d slot%s, %d layers, %d max tokens)\n", + LLAMA_LOG_INFO("%s: allocated device-aware GPU tape buffers: %.1f MB total (%d slot%s, %d layers, %d max tokens)\n", __func__, total_size / (1024.0 * 1024.0), n_slots, n_slots == 1 ? "" : "s", n_rec, max_tokens); + for (const auto & kv : layers_by_dev) { + LLAMA_LOG_INFO("%s: dflash tape placement: device=%s layers=%d\n", + __func__, ggml_backend_dev_name(kv.first), kv.second); + } } void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) { @@ -2173,33 +2221,33 @@ bool llama_context::dflash_memory_seq_cp_recurrent_ordered( llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (model.n_devices() > 1) { - return false; - } - llama_memory_recurrent * mem_recr = get_recurrent_mem(get_memory()); if (!mem_recr) { return false; } - ggml_backend_t gpu_backend = nullptr; - ggml_backend_reg_t cuda_reg = nullptr; + using sync_dflash_stream_to_backend_fn_t = bool (*)(ggml_backend_t); + struct gpu_wait_backend { + ggml_backend_t backend = nullptr; + sync_dflash_stream_to_backend_fn_t fn_wait_backend = nullptr; + }; + std::vector gpu_wait_backends; + for (auto & backend : backends) { auto * dev = ggml_backend_get_device(backend.get()); if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { - gpu_backend = backend.get(); - cuda_reg = ggml_backend_dev_backend_reg(dev); - break; + ggml_backend_reg_t cuda_reg = ggml_backend_dev_backend_reg(dev); + auto fn_wait_backend = cuda_reg + ? (sync_dflash_stream_to_backend_fn_t) + ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_backend_wait_for_dflash_stream") + : nullptr; + if (!fn_wait_backend) { + return false; + } + gpu_wait_backends.push_back({ backend.get(), fn_wait_backend }); } } - if (!gpu_backend || !cuda_reg) { - return false; - } - - using sync_dflash_stream_to_backend_fn_t = bool (*)(ggml_backend_t); - auto fn_wait_backend = (sync_dflash_stream_to_backend_fn_t) - ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_backend_wait_for_dflash_stream"); - if (!fn_wait_backend) { + if (gpu_wait_backends.empty()) { return false; } @@ -2212,9 +2260,11 @@ bool llama_context::dflash_memory_seq_cp_recurrent_ordered( LLAMA_LOG_INFO("%s: dflash crash breadcrumb: recurrent ordered copy enqueued src=%d dst=%d\n", __func__, (int) seq_id_src, (int) seq_id_dst); } - if (!fn_wait_backend(gpu_backend)) { - LLAMA_LOG_ERROR("%s: failed to order DFlash recurrent backup stream before verifier compute\n", __func__); - GGML_ABORT("failed to order DFlash recurrent backup stream before verifier compute"); + for (const gpu_wait_backend & gpu_wait : gpu_wait_backends) { + if (!gpu_wait.fn_wait_backend(gpu_wait.backend)) { + LLAMA_LOG_ERROR("%s: failed to order DFlash recurrent backup stream before verifier compute\n", __func__); + GGML_ABORT("failed to order DFlash recurrent backup stream before verifier compute"); + } } if (dflash_crash_trace_enabled()) { LLAMA_LOG_INFO("%s: dflash crash breadcrumb: recurrent ordered copy synced src=%d dst=%d\n", @@ -2366,6 +2416,8 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { const auto & rec_ids = dflash_capture->recurrent_layer_ids; auto & tape_layers = dflash_capture->tape_layers; + const uint32_t n_embd_s = hparams.n_embd_s(); + // find the tail cell for this seq_id int32_t cell_idx = -1; if (seq_id >= 0 && (uint32_t) seq_id < mem_recurrent->size) { @@ -2379,9 +2431,6 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { return; } - const uint32_t n_embd_s = hparams.n_embd_s(); - const uint32_t n_embd_r = hparams.n_embd_r(); - // find a GPU backend for graph computation ggml_backend_t gpu_backend = nullptr; for (auto & backend : backends) { @@ -2411,6 +2460,17 @@ void llama_context::tape_replay(llama_seq_id seq_id, int n_accepted) { const bool multi_gpu_target = model.n_devices() > 1; if (multi_gpu_target) { + if (use_gpu_tape && tape_replay_gdn_direct_gpu(mem_recurrent, cell_idx, n_accepted)) { + dflash_capture->replay_pending = true; + dflash_capture->replay_gpu_backend = nullptr; + dflash_capture->replay_graph_ctx = nullptr; + dflash_capture->replay_direct_gpu = true; + dflash_capture->replay_n_accepted = n_accepted; + dflash_capture->replay_cell_idx = cell_idx; + dflash_capture->replay_seq_id = seq_id; + dflash_capture->replay_mem_recurrent = mem_recurrent; + return; + } if (tape_replay_gdn_direct_from_cpu_tape(mem_recurrent, cell_idx, n_accepted)) { tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id); return; @@ -2645,15 +2705,17 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu return false; } - ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA"); + ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg(); if (!cuda_reg) { return false; } using ptr_device_fn_t = bool (*)(const void *, int *); + using prepare_ptr_fn_t = bool (*)(const void *); using replay_fn_t = bool (*)(void *, const void *, const void *, const void *, const void *, int, int, int, int); auto fn_ptr_device = (ptr_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_ptr_device"); + auto fn_prepare = (prepare_ptr_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_prepare_ptr"); auto fn_replay = (replay_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_replay_gdn_state_no_check"); - if (!fn_ptr_device || !fn_replay) { + if (!fn_ptr_device || !fn_prepare || !fn_replay) { return false; } @@ -2678,14 +2740,6 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu std::vector launches; launches.reserve(rec_ids.size()); - auto is_cuda_tensor = [](const ggml_tensor * t) { - if (!t || !t->data || !t->buffer) { - return false; - } - const char * name = ggml_backend_buffer_name(t->buffer); - return name && std::strncmp(name, "CUDA", 4) == 0; - }; - for (size_t li = 0; li < rec_ids.size(); ++li) { const int il = rec_ids[li]; if (li >= gpu_tape->layers.size()) { @@ -2694,8 +2748,9 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu ggml_tensor * s_tensor = mem_recurrent->s_l[il]; auto & tl = gpu_tape->layers[li]; - if (!is_cuda_tensor(s_tensor) || !is_cuda_tensor(tl.k) || !is_cuda_tensor(tl.v) || - !is_cuda_tensor(tl.gate) || !is_cuda_tensor(tl.beta)) { + if (!dflash_is_cuda_compatible_tensor(s_tensor) || !dflash_is_cuda_compatible_tensor(tl.k) || + !dflash_is_cuda_compatible_tensor(tl.v) || !dflash_is_cuda_compatible_tensor(tl.gate) || + !dflash_is_cuda_compatible_tensor(tl.beta)) { return false; } @@ -2737,22 +2792,39 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu if (!fn_ptr_device(launch.state, &device)) { return false; } + int k_device = -1; + int v_device = -1; + int gate_device = -1; + int beta_device = -1; + if (!fn_ptr_device(launch.k, &k_device) || + !fn_ptr_device(launch.v, &v_device) || + !fn_ptr_device(launch.gate, &gate_device) || + !fn_ptr_device(launch.beta, &beta_device)) { + return false; + } + if (k_device != device || v_device != device || gate_device != device || beta_device != device) { + return false; + } + launch.device = device; if (replay_device < 0) { replay_device = device; } else if (device != replay_device) { - return false; + replay_device = -2; } - launch.device = device; } const int64_t t_start_us = dflash_capture->profile ? ggml_time_us() : 0; dflash_capture->replay_sync_ptrs.clear(); - dflash_capture->replay_sync_device = replay_device; + dflash_capture->replay_sync_device = replay_device >= 0 ? replay_device : -1; for (const auto & launch : launches) { - if (!fn_replay(launch.state, launch.k, launch.v, launch.gate, launch.beta, + if (!fn_prepare(launch.state) || + !fn_replay(launch.state, launch.k, launch.v, launch.gate, launch.beta, n_accepted, launch.S, launch.H_k, launch.H_v)) { GGML_ABORT("DFlash direct GPU GDN replay launch failed after validation\n"); } + if (replay_device < 0) { + dflash_capture->replay_sync_ptrs.push_back(launch.state); + } } if (dflash_capture->profile) { const uint64_t elapsed = ggml_time_us() - t_start_us; @@ -2761,7 +2833,7 @@ bool llama_context::tape_replay_gdn_direct_gpu(llama_memory_recurrent * mem_recu dflash_capture->profile_replay_direct_gpu += 1; dflash_capture->profile_replay_layers += launches.size(); } - dflash_capture->replay_sync_ptr = launches.back().state; + dflash_capture->replay_sync_ptr = replay_device >= 0 ? launches.back().state : nullptr; return true; } @@ -2770,7 +2842,7 @@ bool llama_context::tape_replay_gdn_direct_from_cpu_tape(llama_memory_recurrent return false; } - ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA"); + ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg(); if (!cuda_reg) { return false; } @@ -2819,14 +2891,6 @@ bool llama_context::tape_replay_gdn_direct_from_cpu_tape(llama_memory_recurrent } }; - auto is_cuda_tensor = [](const ggml_tensor * t) { - if (!t || !t->data || !t->buffer || ggml_backend_buffer_is_host(t->buffer)) { - return false; - } - const char * name = ggml_backend_buffer_name(t->buffer); - return name && std::strncmp(name, "CUDA", 4) == 0; - }; - for (size_t li = 0; li < rec_ids.size(); ++li) { if (li >= tape_layers.size()) { cleanup(); @@ -2869,7 +2933,7 @@ bool llama_context::tape_replay_gdn_direct_from_cpu_tape(llama_memory_recurrent } ggml_tensor * s_tensor = mem_recurrent->s_l[il]; - if (!is_cuda_tensor(s_tensor)) { + if (!dflash_is_cuda_compatible_tensor(s_tensor)) { cleanup(); return false; } @@ -2960,7 +3024,7 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, return false; } - ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA"); + ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg(); if (!cuda_reg) { return false; } @@ -3045,7 +3109,7 @@ bool llama_context::tape_replay_conv_gpu_from_cpu_tape(llama_memory_recurrent * return false; } - ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA"); + ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg(); if (!cuda_reg) { return false; } @@ -3090,14 +3154,6 @@ bool llama_context::tape_replay_conv_gpu_from_cpu_tape(llama_memory_recurrent * } }; - auto is_cuda_tensor = [](const ggml_tensor * t) { - if (!t || !t->data || !t->buffer || ggml_backend_buffer_is_host(t->buffer)) { - return false; - } - const char * name = ggml_backend_buffer_name(t->buffer); - return name && std::strncmp(name, "CUDA", 4) == 0; - }; - for (size_t li = 0; li < rec_ids.size(); ++li) { if (li >= tape_layers.size()) { cleanup(); @@ -3109,7 +3165,7 @@ bool llama_context::tape_replay_conv_gpu_from_cpu_tape(llama_memory_recurrent * if (!r_tensor) { continue; } - if (!is_cuda_tensor(r_tensor)) { + if (!dflash_is_cuda_compatible_tensor(r_tensor)) { cleanup(); return false; } @@ -3406,7 +3462,7 @@ void llama_context::tape_replay_sync() { } else if (dflash_capture->replay_direct_gpu && (!dflash_capture->replay_sync_ptrs.empty() || dflash_capture->replay_sync_ptr)) { const int64_t t_start_us = dflash_capture->profile ? ggml_time_us() : 0; - ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA"); + ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg(); using sync_ptr_fn_t = bool (*)(const void *); using sync_device_fn_t = bool (*)(int); auto fn_sync_ptr = cuda_reg @@ -5725,6 +5781,10 @@ int llama_context::decode(const llama_batch & batch_inp) { dflash_capture->ubatch = &ubatch; } else { dflash_gpu_capture_ready = model.n_devices() <= 1 && dflash_capture->gpu_capture_enabled; + const bool dflash_gpu_tape_ready_allowed = + dflash_gpu_capture_ready || + (model.n_devices() > 1 && dflash_capture->gpu_capture_enabled && + dflash_allow_multi_gpu_tape()); dflash_capture->ubatch = &ubatch; cparams.hidden_gpu_n_seqs = 0; dflash_clear_prefill_cparams(cparams); @@ -5940,7 +6000,7 @@ int llama_context::decode(const llama_batch & batch_inp) { dflash_graph_tape_ready = !dflash_use_prefill_staging && !dflash_capture->tapes.empty() && - dflash_gpu_capture_ready && + dflash_gpu_tape_ready_allowed && dflash_capture->tape_enabled && dflash_capture_n_tokens <= LLAMA_DFLASH_MAX_VERIFY_TOKENS; const int tape_ns = dflash_graph_tape_ready ? ns : 0; diff --git a/src/llama-context.h b/src/llama-context.h index b5adf13b18d..2a167e490f3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -102,6 +102,9 @@ struct dflash_tape_gpu_layer { ggml_tensor * gate = nullptr; // [1, H_v, max_tokens] ggml_tensor * beta = nullptr; // [1, H_v, max_tokens] ggml_tensor * qkv = nullptr; // [conv_channels, max_tokens] + ggml_backend_buffer_t buf = nullptr; + ggml_context * ctx = nullptr; + ggml_backend_dev_t dev = nullptr; }; struct dflash_tape_gpu { @@ -113,6 +116,10 @@ struct dflash_tape_gpu { int n_tokens = 0; // actual tokens recorded this pass ~dflash_tape_gpu() { + for (auto & layer : layers) { + if (layer.buf) ggml_backend_buffer_free(layer.buf); + if (layer.ctx) ggml_free(layer.ctx); + } if (buf) ggml_backend_buffer_free(buf); if (ctx) ggml_free(ctx); } diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index e1432ea07cf..c37cee7836c 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -449,13 +449,15 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const { bool llama_memory_recurrent::build_recurrent_copy_plan() { copy_plan_valid = true; copy_plan_cuda_fast = false; - copy_plan_device = -1; copy_plan_entries.clear(); copy_plan_fn_copy = nullptr; copy_plan_fn_set_device = nullptr; copy_plan_fn_sync_device = nullptr; ggml_backend_reg_t cuda_reg = ggml_backend_reg_by_name("CUDA"); + if (!cuda_reg) { + cuda_reg = ggml_backend_reg_by_name("ROCm"); + } auto fn_ptr_device = cuda_reg ? (dflash_cuda_ptr_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_ptr_device") : nullptr; @@ -470,6 +472,11 @@ bool llama_memory_recurrent::build_recurrent_copy_plan() { : nullptr; if (!fn_ptr_device || !fn_copy || !fn_set_device || !fn_sync_device) { + if (dflash_profile_enabled(DFLASH_PROFILE_COPY)) { + LLAMA_LOG_INFO("%s: dflash recurrent D2D copy unavailable: cuda_reg=%d ptr_device=%d copy=%d set_device=%d sync_device=%d\n", + __func__, cuda_reg ? 1 : 0, fn_ptr_device ? 1 : 0, fn_copy ? 1 : 0, + fn_set_device ? 1 : 0, fn_sync_device ? 1 : 0); + } return false; } @@ -479,21 +486,26 @@ bool llama_memory_recurrent::build_recurrent_copy_plan() { } const char * buffer_name = tensor->buffer ? ggml_backend_buffer_name(tensor->buffer) : nullptr; - if (!buffer_name || std::strncmp(buffer_name, "CUDA", 4) != 0) { + if (!buffer_name || + (std::strncmp(buffer_name, "CUDA", 4) != 0 && + std::strncmp(buffer_name, "ROCm", 4) != 0)) { + if (dflash_profile_enabled(DFLASH_PROFILE_COPY)) { + LLAMA_LOG_INFO("%s: dflash recurrent D2D copy unavailable: tensor=%s buffer=%s\n", + __func__, tensor->name, buffer_name ? buffer_name : ""); + } return false; } int tensor_device = -1; if (!fn_ptr_device(tensor->data, &tensor_device)) { - return false; - } - if (copy_plan_device < 0) { - copy_plan_device = tensor_device; - } else if (copy_plan_device != tensor_device) { + if (dflash_profile_enabled(DFLASH_PROFILE_COPY)) { + LLAMA_LOG_INFO("%s: dflash recurrent D2D copy unavailable: tensor=%s buffer=%s ptr_device_failed\n", + __func__, tensor->name, buffer_name); + } return false; } - copy_plan_entries.push_back({ tensor, ggml_row_size(tensor->type, n_embd) }); + copy_plan_entries.push_back({ tensor, ggml_row_size(tensor->type, n_embd), tensor_device }); return true; }; @@ -501,12 +513,11 @@ bool llama_memory_recurrent::build_recurrent_copy_plan() { if (!add_tensor(r_l[il], hparams.n_embd_r()) || !add_tensor(s_l[il], hparams.n_embd_s())) { copy_plan_entries.clear(); - copy_plan_device = -1; return false; } } - if (copy_plan_entries.empty() || copy_plan_device < 0) { + if (copy_plan_entries.empty()) { return false; } @@ -520,7 +531,6 @@ bool llama_memory_recurrent::build_recurrent_copy_plan() { void llama_memory_recurrent::invalidate_recurrent_copy_plan() { copy_plan_valid = false; copy_plan_cuda_fast = false; - copy_plan_device = -1; copy_plan_entries.clear(); copy_plan_fn_copy = nullptr; copy_plan_fn_set_device = nullptr; @@ -553,12 +563,21 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) { build_recurrent_copy_plan(); } if (copy_plan_cuda_fast) { - bool all_queued = copy_plan_fn_set_device && copy_plan_fn_set_device(copy_plan_device); + bool all_queued = copy_plan_fn_set_device != nullptr; bool any_queued = false; + int current_device = -1; + std::vector touched_devices; const int64_t t_enqueue_start = profile_timing ? ggml_time_us() : 0; if (all_queued) { for (const recurrent_copy_plan_entry & entry : copy_plan_entries) { + if (current_device != entry.device) { + if (!copy_plan_fn_set_device(entry.device)) { + all_queued = false; + break; + } + current_device = entry.device; + } const char * src = (const char *) entry.tensor->data + (size_t) i_src * entry.row_bytes; char * dst = (char *) entry.tensor->data + (size_t) i_dst * entry.row_bytes; if (!copy_plan_fn_copy(dst, src, entry.row_bytes)) { @@ -566,6 +585,9 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) { break; } any_queued = true; + if (std::find(touched_devices.begin(), touched_devices.end(), entry.device) == touched_devices.end()) { + touched_devices.push_back(entry.device); + } profile.tensors_copied++; profile.cuda_d2d_queued++; } @@ -583,7 +605,10 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) { bool synced = false; if (any_queued && copy_plan_fn_sync_device) { const int64_t t_sync_start = profile_timing ? ggml_time_us() : 0; - synced = copy_plan_fn_sync_device(copy_plan_device); + synced = true; + for (int device : touched_devices) { + synced = copy_plan_fn_sync_device(device) && synced; + } if (t_sync_start != 0) { profile.sync_us += ggml_time_us() - t_sync_start; } diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index 097a9414442..b20c1d5fa54 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -153,6 +153,7 @@ class llama_memory_recurrent : public llama_memory_i { struct recurrent_copy_plan_entry { ggml_tensor * tensor = nullptr; size_t row_bytes = 0; + int device = -1; }; bool build_recurrent_copy_plan(); @@ -161,7 +162,6 @@ class llama_memory_recurrent : public llama_memory_i { bool copy_plan_valid = false; bool copy_plan_cuda_fast = false; - int copy_plan_device = -1; std::vector copy_plan_entries; dflash_cuda_copy_d2d_fn_t copy_plan_fn_copy = nullptr; dflash_cuda_set_device_fn_t copy_plan_fn_set_device = nullptr; From 4b208f783fb8a998a0f808789fecae4911f61f9e Mon Sep 17 00:00:00 2001 From: nycdubliner Date: Sat, 23 May 2026 15:13:16 +0100 Subject: [PATCH 2/4] spec-dec: address review feedback on multi-GPU tape placement - Documented n_devices() > 1 multi-GPU limitations for hidden and prefill GPU allocations. - Documented role of GGML_DFLASH_ALLOW_MULTI_GPU_TAPE environment variable. - Added warning logging when explicitly disabling GPU capture is overridden by the env var. - Avoid std::map dependency in allocate_tape_gpu by using a vector of structures to count device occurrences. - Move touched_devices vector out of copy_cell hot-path stack to class member to avoid heap pressure. --- src/llama-context.cpp | 29 ++++++++++++++++++++++++----- src/llama-memory-recurrent.cpp | 8 ++++---- src/llama-memory-recurrent.h | 1 + 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 637471dcab7..33cd01945be 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include // @@ -1138,6 +1137,7 @@ static bool dflash_profile_sync_split_enabled() { return enabled; } +// Controls the multi-GPU speculative decoding recurrent tape path. Independent of GGML_DFLASH_GPU_RING. static bool dflash_allow_multi_gpu_tape() { static const bool enabled = [] { const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE"); @@ -1696,6 +1696,9 @@ void llama_context::set_dflash_gpu_capture(bool enabled) { dflash_capture->gpu_capture_enabled = enabled || (model.n_devices() > 1 && dflash_allow_multi_gpu_tape()); + if (!enabled && dflash_capture->gpu_capture_enabled) { + LLAMA_LOG_INFO("%s: forcing GPU capture enabled for multi-GPU tape support (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n", __func__); + } // Always clear the graph-embedded capture cparams when changing mode; // the decode loop will repopulate them if GPU capture is active and @@ -1931,7 +1934,11 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { dflash_capture->tapes.reserve(n_slots); size_t total_size = 0; - std::map layers_by_dev; + struct dev_layer_count { + ggml_backend_dev_t dev; + int count; + }; + std::vector dev_counts; for (int slot = 0; slot < n_slots; ++slot) { auto tape = std::make_unique(); @@ -1985,7 +1992,17 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { } total_size += ggml_backend_buffer_get_size(tl.buf); - layers_by_dev[layer_dev] += 1; + bool found = false; + for (auto & dc : dev_counts) { + if (dc.dev == layer_dev) { + dc.count++; + found = true; + break; + } + } + if (!found) { + dev_counts.push_back({ layer_dev, 1 }); + } } dflash_capture->tapes.push_back(std::move(tape)); @@ -1995,9 +2012,9 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { LLAMA_LOG_INFO("%s: allocated device-aware GPU tape buffers: %.1f MB total (%d slot%s, %d layers, %d max tokens)\n", __func__, total_size / (1024.0 * 1024.0), n_slots, n_slots == 1 ? "" : "s", n_rec, max_tokens); - for (const auto & kv : layers_by_dev) { + for (const auto & dc : dev_counts) { LLAMA_LOG_INFO("%s: dflash tape placement: device=%s layers=%d\n", - __func__, ggml_backend_dev_name(kv.first), kv.second); + __func__, ggml_backend_dev_name(dc.dev), dc.count); } } @@ -2014,6 +2031,7 @@ void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) { dflash_capture->sync_backend_to_stream_backend = nullptr; return; } + // Hidden GPU capture requires same-device graph output tensors; not yet supported for multi-GPU layer splits if (model.n_devices() > 1) { dflash_capture->hidden_gpu.clear(); dflash_capture->fn_sync_backend_to_stream = nullptr; @@ -2106,6 +2124,7 @@ bool llama_context::allocate_prefill_gpu(int n_slots, int max_tokens) { if (!dflash_capture->gpu_capture_enabled) { return false; } + // Prefill GPU allocation is not yet multi-GPU aware if (model.n_devices() > 1) { return false; } diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index c37cee7836c..405dcbd5bd8 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -566,7 +566,7 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) { bool all_queued = copy_plan_fn_set_device != nullptr; bool any_queued = false; int current_device = -1; - std::vector touched_devices; + copy_plan_touched_devices.clear(); const int64_t t_enqueue_start = profile_timing ? ggml_time_us() : 0; if (all_queued) { @@ -585,8 +585,8 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) { break; } any_queued = true; - if (std::find(touched_devices.begin(), touched_devices.end(), entry.device) == touched_devices.end()) { - touched_devices.push_back(entry.device); + if (std::find(copy_plan_touched_devices.begin(), copy_plan_touched_devices.end(), entry.device) == copy_plan_touched_devices.end()) { + copy_plan_touched_devices.push_back(entry.device); } profile.tensors_copied++; profile.cuda_d2d_queued++; @@ -606,7 +606,7 @@ void llama_memory_recurrent::copy_cell(int32_t i_src, int32_t i_dst) { if (any_queued && copy_plan_fn_sync_device) { const int64_t t_sync_start = profile_timing ? ggml_time_us() : 0; synced = true; - for (int device : touched_devices) { + for (int device : copy_plan_touched_devices) { synced = copy_plan_fn_sync_device(device) && synced; } if (t_sync_start != 0) { diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index b20c1d5fa54..522c6f53a82 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -163,6 +163,7 @@ class llama_memory_recurrent : public llama_memory_i { bool copy_plan_valid = false; bool copy_plan_cuda_fast = false; std::vector copy_plan_entries; + std::vector copy_plan_touched_devices; dflash_cuda_copy_d2d_fn_t copy_plan_fn_copy = nullptr; dflash_cuda_set_device_fn_t copy_plan_fn_set_device = nullptr; dflash_cuda_sync_device_fn_t copy_plan_fn_sync_device = nullptr; From 0278d5bbd0fc6475f5d1cf2f8f5dd4389cdbf5d3 Mon Sep 17 00:00:00 2001 From: nycdubliner Date: Sat, 23 May 2026 15:47:48 +0100 Subject: [PATCH 3/4] wip: phase 2 gpu conv replay and hidden capture --- common/speculative.cpp | 19 +- ggml/src/ggml-cuda/cross-ring-interleave.cu | 32 +++- src/llama-context.cpp | 183 +++++++++++++------- src/llama-context.h | 15 ++ tests/test-dflash-plumbing.cpp | 6 +- 5 files changed, 181 insertions(+), 74 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index cf413307b45..d1d6228f694 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -102,6 +102,14 @@ static bool common_dflash_log_contract_verbose() { return v; } +static bool common_dflash_allow_multi_gpu_tape() { + static const bool enabled = [] { + const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE"); + return env && env[0] != '\0' && std::strcmp(env, "0") != 0; + }(); + return enabled; +} + static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_context * ctx_dft) { if (!common_dflash_gpu_ring_env_enabled()) { LOG_INF("dflash: GPU cross ring disabled by GGML_DFLASH_GPU_RING=0; using CPU hidden capture\n"); @@ -111,9 +119,14 @@ static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_contex const int32_t n_tgt_devices = ctx_tgt ? llama_model_n_devices(llama_get_model(ctx_tgt)) : 1; const int32_t n_dft_devices = ctx_dft ? llama_model_n_devices(llama_get_model(ctx_dft)) : 1; if (n_tgt_devices > 1 || n_dft_devices > 1) { - LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); disabling GPU cross ring and graph hidden capture\n", - n_tgt_devices, n_dft_devices); - return false; + if (common_dflash_allow_multi_gpu_tape()) { + LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); enabling experimental GPU cross ring and graph hidden capture (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n", + n_tgt_devices, n_dft_devices); + } else { + LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); disabling GPU cross ring and graph hidden capture\n", + n_tgt_devices, n_dft_devices); + return false; + } } return true; diff --git a/ggml/src/ggml-cuda/cross-ring-interleave.cu b/ggml/src/ggml-cuda/cross-ring-interleave.cu index 2b024a3cab8..50bb13f50ef 100644 --- a/ggml/src/ggml-cuda/cross-ring-interleave.cu +++ b/ggml/src/ggml-cuda/cross-ring-interleave.cu @@ -194,11 +194,11 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d( return false; } #if CUDART_VERSION >= 10000 || defined(GGML_USE_HIP) - if (attr.type != cudaMemoryTypeDevice || attr.device != ring->device) { + if (attr.type != cudaMemoryTypeDevice) { return false; } #else - if (attr.memoryType != cudaMemoryTypeDevice || attr.device != ring->device) { + if (attr.memoryType != cudaMemoryTypeDevice) { return false; } #endif @@ -219,14 +219,28 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d( } int first = ring->ring_size - pos; - if (first >= n_tokens) { - cudaMemcpyAsync(dst + (size_t)pos * n_embd, src, - (size_t)n_tokens * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread); + const bool is_peer = (attr.device != ring->device); + if (is_peer) { + if (first >= n_tokens) { + cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device, + (size_t)n_tokens * stride, cudaStreamPerThread); + } else { + cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device, + (size_t)first * stride, cudaStreamPerThread); + cudaMemcpyPeerAsync(dst, ring->device, src + (size_t)first * stride, attr.device, + (size_t)(n_tokens - first) * stride, cudaStreamPerThread); + } } else { - cudaMemcpyAsync(dst + (size_t)pos * n_embd, src, - (size_t)first * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread); - cudaMemcpyAsync(dst, src + (size_t)first * stride, - (size_t)(n_tokens - first) * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread); + (void)cudaSetDevice(ring->device); + if (first >= n_tokens) { + cudaMemcpyAsync(dst + (size_t)pos * n_embd, src, + (size_t)n_tokens * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread); + } else { + cudaMemcpyAsync(dst + (size_t)pos * n_embd, src, + (size_t)first * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread); + cudaMemcpyAsync(dst, src + (size_t)first * stride, + (size_t)(n_tokens - first) * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread); + } } return cudaGetLastError() == cudaSuccess; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 33cd01945be..adad1ef8023 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2031,13 +2031,6 @@ void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) { dflash_capture->sync_backend_to_stream_backend = nullptr; return; } - // Hidden GPU capture requires same-device graph output tensors; not yet supported for multi-GPU layer splits - if (model.n_devices() > 1) { - dflash_capture->hidden_gpu.clear(); - dflash_capture->fn_sync_backend_to_stream = nullptr; - dflash_capture->sync_backend_to_stream_backend = nullptr; - return; - } if (!llama_dflash_gpu_hidden_supported_arch(model.arch)) { dflash_capture->hidden_gpu.clear(); dflash_capture->fn_sync_backend_to_stream = nullptr; @@ -2069,6 +2062,16 @@ void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) { dflash_capture->sync_backend_to_stream_backend = dflash_capture->fn_sync_backend_to_stream ? gpu_backend : nullptr; + auto backend_for_dev = [&](ggml_backend_dev_t want_dev) -> ggml_backend_t { + for (auto & backend : backends) { + auto * dev = ggml_backend_get_device(backend.get()); + if (dev == want_dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + return backend.get(); + } + } + return nullptr; + }; + const int n_layers = (int) dflash_capture->layer_ids.size(); const int64_t n_embd = model.hparams.n_embd; @@ -2077,36 +2080,48 @@ void llama_context::allocate_hidden_gpu(int n_slots, int max_tokens) { size_t total_size = 0; for (int slot = 0; slot < n_slots; ++slot) { - const size_t ctx_mem = ggml_tensor_overhead() * ((size_t) n_layers + 2); - struct ggml_init_params ctx_params = { ctx_mem, nullptr, true }; - struct ggml_context * hidden_ctx = ggml_init(ctx_params); - if (!hidden_ctx) { - LLAMA_LOG_WARN("%s: failed to create GPU hidden context for slot %d; using callback hidden fallback\n", - __func__, slot); - dflash_capture->hidden_gpu.clear(); - return; - } - auto hidden = std::make_unique(); hidden->layers.resize(n_layers); hidden->layer_ids = dflash_capture->layer_ids; hidden->n_embd = n_embd; hidden->max_tokens = max_tokens; - hidden->ctx = hidden_ctx; for (int i = 0; i < n_layers; ++i) { - hidden->layers[i] = ggml_new_tensor_2d(hidden_ctx, GGML_TYPE_F32, n_embd, (int64_t) max_tokens); - } + const int il = dflash_capture->layer_ids[i]; + ggml_backend_dev_t layer_dev = model.dev_layer(il); + ggml_backend_t layer_backend = backend_for_dev(layer_dev); + if (!layer_backend) { + LLAMA_LOG_WARN("%s: no GPU backend for hidden layer %d device %s; using callback hidden fallback\n", + __func__, il, layer_dev ? ggml_backend_dev_name(layer_dev) : ""); + dflash_capture->hidden_gpu.clear(); + return; + } - hidden->buf = ggml_backend_alloc_ctx_tensors(hidden_ctx, gpu_backend); - if (!hidden->buf) { - LLAMA_LOG_WARN("%s: failed to allocate GPU hidden buffer for slot %d; using callback hidden fallback\n", - __func__, slot); - dflash_capture->hidden_gpu.clear(); - return; + const size_t ctx_mem = ggml_tensor_overhead() * 2; + struct ggml_init_params ctx_params = { ctx_mem, nullptr, true }; + struct ggml_context * hidden_ctx = ggml_init(ctx_params); + if (!hidden_ctx) { + LLAMA_LOG_WARN("%s: failed to create GPU hidden context for slot %d layer %d; using callback hidden fallback\n", + __func__, slot, il); + dflash_capture->hidden_gpu.clear(); + return; + } + hidden->ctxs.push_back(hidden_ctx); + + ggml_tensor * t = ggml_new_tensor_2d(hidden_ctx, GGML_TYPE_F32, n_embd, (int64_t) max_tokens); + hidden->layers[i] = t; + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(hidden_ctx, layer_backend); + if (!buf) { + LLAMA_LOG_WARN("%s: failed to allocate GPU hidden buffer for slot %d layer %d; using callback hidden fallback\n", + __func__, slot, il); + dflash_capture->hidden_gpu.clear(); + return; + } + hidden->bufs.push_back(buf); + total_size += ggml_backend_buffer_get_size(buf); } - total_size += ggml_backend_buffer_get_size(hidden->buf); dflash_capture->hidden_gpu.push_back(std::move(hidden)); } @@ -2124,10 +2139,6 @@ bool llama_context::allocate_prefill_gpu(int n_slots, int max_tokens) { if (!dflash_capture->gpu_capture_enabled) { return false; } - // Prefill GPU allocation is not yet multi-GPU aware - if (model.n_devices() > 1) { - return false; - } if (!llama_dflash_gpu_hidden_supported_arch(model.arch)) { return false; } @@ -2164,6 +2175,16 @@ bool llama_context::allocate_prefill_gpu(int n_slots, int max_tokens) { dflash_capture->fn_sync_backend_to_stream ? gpu_backend : nullptr; } + auto backend_for_dev = [&](ggml_backend_dev_t want_dev) -> ggml_backend_t { + for (auto & backend : backends) { + auto * dev = ggml_backend_get_device(backend.get()); + if (dev == want_dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + return backend.get(); + } + } + return nullptr; + }; + const int n_layers = (int) dflash_capture->layer_ids.size(); const int64_t n_embd = model.hparams.n_embd; @@ -2172,39 +2193,52 @@ bool llama_context::allocate_prefill_gpu(int n_slots, int max_tokens) { size_t total_size = 0; for (int slot = 0; slot < n_slots; ++slot) { - const size_t ctx_mem = ggml_tensor_overhead() * ((size_t) n_layers + 2); - struct ggml_init_params ctx_params = { ctx_mem, nullptr, true }; - struct ggml_context * hidden_ctx = ggml_init(ctx_params); - if (!hidden_ctx) { - LLAMA_LOG_WARN("%s: failed to create prefill GPU context for slot %d; using callback fallback\n", - __func__, slot); - dflash_capture->prefill_gpu.clear(); - dflash_capture->prefill_gpu_max_tokens = 0; - return false; - } - auto hidden = std::make_unique(); hidden->layers.resize(n_layers); hidden->layer_ids = dflash_capture->layer_ids; hidden->n_embd = n_embd; hidden->max_tokens = max_tokens; hidden->n_tokens = 0; - hidden->ctx = hidden_ctx; for (int i = 0; i < n_layers; ++i) { - hidden->layers[i] = ggml_new_tensor_2d(hidden_ctx, GGML_TYPE_F32, n_embd, (int64_t) max_tokens); - } + const int il = dflash_capture->layer_ids[i]; + ggml_backend_dev_t layer_dev = model.dev_layer(il); + ggml_backend_t layer_backend = backend_for_dev(layer_dev); + if (!layer_backend) { + LLAMA_LOG_WARN("%s: no GPU backend for prefill layer %d device %s; using callback fallback\n", + __func__, il, layer_dev ? ggml_backend_dev_name(layer_dev) : ""); + dflash_capture->prefill_gpu.clear(); + dflash_capture->prefill_gpu_max_tokens = 0; + return false; + } - hidden->buf = ggml_backend_alloc_ctx_tensors(hidden_ctx, gpu_backend); - if (!hidden->buf) { - LLAMA_LOG_WARN("%s: failed to allocate prefill GPU buffer for slot %d; using callback fallback\n", - __func__, slot); - dflash_capture->prefill_gpu.clear(); - dflash_capture->prefill_gpu_max_tokens = 0; - return false; + const size_t ctx_mem = ggml_tensor_overhead() * 2; + struct ggml_init_params ctx_params = { ctx_mem, nullptr, true }; + struct ggml_context * hidden_ctx = ggml_init(ctx_params); + if (!hidden_ctx) { + LLAMA_LOG_WARN("%s: failed to create prefill GPU context for slot %d layer %d; using callback fallback\n", + __func__, slot, il); + dflash_capture->prefill_gpu.clear(); + dflash_capture->prefill_gpu_max_tokens = 0; + return false; + } + hidden->ctxs.push_back(hidden_ctx); + + ggml_tensor * t = ggml_new_tensor_2d(hidden_ctx, GGML_TYPE_F32, n_embd, (int64_t) max_tokens); + hidden->layers[i] = t; + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(hidden_ctx, layer_backend); + if (!buf) { + LLAMA_LOG_WARN("%s: failed to allocate prefill GPU buffer for slot %d layer %d; using callback fallback\n", + __func__, slot, il); + dflash_capture->prefill_gpu.clear(); + dflash_capture->prefill_gpu_max_tokens = 0; + return false; + } + hidden->bufs.push_back(buf); + total_size += ggml_backend_buffer_get_size(buf); } - total_size += ggml_backend_buffer_get_size(hidden->buf); dflash_capture->prefill_gpu.push_back(std::move(hidden)); } @@ -3039,17 +3073,22 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, if (!dflash_capture || !mem_recurrent || n_accepted <= 0) { return false; } - if (model.n_devices() > 1) { - return false; - } ggml_backend_reg_t cuda_reg = dflash_gpu_backend_reg(); if (!cuda_reg) { return false; } + using ptr_device_fn_t = bool (*)(const void *, int *); + using set_device_fn_t = bool (*)(int); + using sync_device_fn_t = bool (*)(int); using rebuild_fn_t = bool (*)(void *, const void *, int, int, int); + + auto fn_ptr_device = (ptr_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_ptr_device"); + auto fn_set_device = (set_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_set_device"); + auto fn_sync_device = (sync_device_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_cuda_synchronize_device"); auto fn_rebuild = (rebuild_fn_t) ggml_backend_reg_get_proc_address(cuda_reg, "dflash_rebuild_conv_state"); - if (!fn_rebuild) { + + if (!fn_ptr_device || !fn_set_device || !fn_sync_device || !fn_rebuild) { return false; } @@ -3065,10 +3104,13 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, const void * qkv; int conv_ch; int conv_window; + int device; }; std::vector launches; launches.reserve(rec_ids.size()); + bool touched_devices[32] = {false}; + for (size_t li = 0; li < rec_ids.size(); ++li) { const int il = rec_ids[li]; if (li >= gpu_tape->layers.size()) { @@ -3092,13 +3134,24 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, return false; } + int r_dev = -1; + if (!fn_ptr_device(r_tensor->data, &r_dev)) { + return false; + } + if (r_dev < 0 || r_dev >= 32) { + return false; + } + const size_t r_offset = (size_t) cell_idx * n_embd_r * ggml_element_size(r_tensor); launches.push_back({ (char *) r_tensor->data + r_offset, qkv_tensor->data, (int) conv_ch_i64, (int) conv_window_i64, + r_dev, }); + + touched_devices[r_dev] = true; } if (launches.empty()) { @@ -3107,10 +3160,22 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, const int64_t t_gpu_start_us = dflash_capture->profile ? ggml_time_us() : 0; for (const auto & launch : launches) { + if (!fn_set_device(launch.device)) { + return false; + } if (!fn_rebuild(launch.r_state, launch.qkv, n_accepted, launch.conv_ch, launch.conv_window)) { return false; } } + + for (int dev = 0; dev < 32; ++dev) { + if (touched_devices[dev]) { + if (!fn_sync_device(dev)) { + return false; + } + } + } + if (dflash_capture->profile) { const uint64_t elapsed = ggml_time_us() - t_gpu_start_us; dflash_capture->profile_replay_conv_enqueue_us += elapsed; @@ -3302,7 +3367,7 @@ void llama_context::tape_replay_conv(llama_memory_recurrent * mem_recurrent, int auto & tape_layers = dflash_capture->tape_layers; const uint32_t n_embd_r = hparams.n_embd_r(); - if (model.n_devices() <= 1 && tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)) { + if (tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)) { return; } if (model.n_devices() > 1 && tape_replay_conv_gpu_from_cpu_tape(mem_recurrent, cell_idx, n_accepted, seq_id)) { @@ -5799,7 +5864,7 @@ int llama_context::decode(const llama_batch & batch_inp) { } dflash_capture->ubatch = &ubatch; } else { - dflash_gpu_capture_ready = model.n_devices() <= 1 && dflash_capture->gpu_capture_enabled; + dflash_gpu_capture_ready = (model.n_devices() <= 1 || dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled; const bool dflash_gpu_tape_ready_allowed = dflash_gpu_capture_ready || (model.n_devices() > 1 && dflash_capture->gpu_capture_enabled && diff --git a/src/llama-context.h b/src/llama-context.h index 2a167e490f3..30381633c0e 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -125,9 +125,18 @@ struct dflash_tape_gpu { } }; +struct dflash_hidden_gpu_layer { + ggml_tensor * hidden = nullptr; + ggml_backend_buffer_t buf = nullptr; + ggml_context * ctx = nullptr; + ggml_backend_dev_t dev = nullptr; +}; + struct dflash_hidden_gpu { std::vector layers; // one [n_embd, max_tokens] tensor per captured layer std::vector layer_ids; + std::vector bufs; + std::vector ctxs; ggml_backend_buffer_t buf = nullptr; ggml_context * ctx = nullptr; int64_t n_embd = 0; @@ -135,6 +144,12 @@ struct dflash_hidden_gpu { int n_tokens = 0; ~dflash_hidden_gpu() { + for (auto b : bufs) { + if (b) ggml_backend_buffer_free(b); + } + for (auto c : ctxs) { + if (c) ggml_free(c); + } if (buf) ggml_backend_buffer_free(buf); if (ctx) ggml_free(ctx); } diff --git a/tests/test-dflash-plumbing.cpp b/tests/test-dflash-plumbing.cpp index fdcd9cabe29..d2828f1c960 100644 --- a/tests/test-dflash-plumbing.cpp +++ b/tests/test-dflash-plumbing.cpp @@ -359,11 +359,11 @@ int main(int argc, char ** argv) { ok &= expect(llama_h.find("llama_set_dflash_gpu_capture") != std::string::npos, "public DFlash API must expose GPU capture gating"); ok &= expect(context_cpp.find("allocate_hidden_gpu(n_slots, max_tokens)") != std::string::npos, "GPU tape allocation must allocate hidden capture buffers too"); ok &= expect(context_cpp.find("dflash_skip_eval_callback ? nullptr : dflash_eval_callback") != std::string::npos, "eligible DFlash verifier graph must disable eval callback, including suppressed no-intersection ubatches"); - ok &= expect(context_cpp.find("const bool dflash_graph_tape_ready") != std::string::npos, "DFlash decode must gate GPU tape copies separately from hidden capture"); + ok &= expect(context_cpp.find("bool dflash_graph_tape_ready") != std::string::npos, "DFlash decode must gate GPU tape copies separately from hidden capture"); ok &= expect(context_cpp.find("dflash_graph_hidden_ready =\n !dflash_capture->hidden_gpu.empty()") != std::string::npos, "GPU hidden graph capture must not depend on active tape recording"); ok &= expect(context_cpp.find("dflash_tape_gpu * graph_tp = dflash_graph_tape_ready ? tp : nullptr") != std::string::npos, "GPU tape graph pointers must be disabled when tape recording is inactive"); ok &= expect(context_cpp.find("multi-GPU target detected") != std::string::npos, "multi-GPU target must fall back from graph-embedded DFlash GPU capture"); - ok &= expect(context_cpp.find("const bool dflash_gpu_capture_ready = model.n_devices() <= 1 && dflash_capture->gpu_capture_enabled") != std::string::npos, "DFlash graph capture must be gated to single-GPU target placement and explicit GPU capture policy"); + ok &= expect(context_cpp.find("dflash_gpu_capture_ready = (model.n_devices() <= 1 || dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;") != std::string::npos, "DFlash graph capture must be gated to single-GPU or multi-GPU tape mode and explicit GPU capture policy"); ok &= expect(context_cpp.find("const bool multi_gpu_target = model.n_devices() > 1;") != std::string::npos, "DFlash replay must detect multi-GPU target placement before choosing GPU replay"); ok &= expect(context_cpp.find("exact CUDA DFlash replay unavailable, using CPU recurrent replay fallback") != std::string::npos, "DFlash replay must log only the multi-GPU CPU replay fallback"); ok &= expect(context_cpp.find("tape_replay_cpu(mem_recurrent, cell_idx, n_accepted);\n tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id);") != std::string::npos, @@ -413,7 +413,7 @@ int main(int argc, char ** argv) { ok &= expect(cuda_cpp.find("source buffer is not visible to CUDA backend device") != std::string::npos, "CUDA backend assert diagnostics must report the non-local source tensor"); ok &= expect(context_h.find("tape_replay_conv_gpu") != std::string::npos, "DFlash must declare GPU conv rebuild fast path"); - ok &= expect(context_cpp.find("model.n_devices() <= 1 && tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)") != std::string::npos, "conv replay must try GPU rebuild only for single-GPU target placement"); + ok &= expect(context_cpp.find("tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)") != std::string::npos, "conv replay must try GPU rebuild"); ok &= expect(context_cpp.find("const bool use_async_backend = gpu_backend && model.n_devices() <= 1;") != std::string::npos, "conv replay CPU fallback must use async backend copies only for single-GPU target placement"); ok &= expect(context_cpp.find("ggml_backend_tensor_get_async(gpu_backend, r_tensor") == std::string::npos, "conv replay must not read split recurrent state through the first GPU backend"); ok &= expect(context_cpp.find("ggml_backend_tensor_set_async(gpu_backend, d.r_tensor") == std::string::npos, "conv replay must not write split recurrent state through the first GPU backend"); From 7fd860667e9645d24bc71d93106ec03a763e4c66 Mon Sep 17 00:00:00 2001 From: nycdubliner Date: Sat, 23 May 2026 16:08:12 +0100 Subject: [PATCH 4/4] spec-dec: address Phase 2 review feedback for ROCm Multi-GPU DFlash - Auto-enable peer access dynamically in dflash_cross_ring_gpu_write_d2d to avoid manual setup. - Clean up unused struct dflash_hidden_gpu_layer and dead buf/ctx fields in dflash_hidden_gpu. - Resolve speculative.cpp duplication by introducing a public llama_dflash_allow_multi_gpu_tape() API. - Replace magic array sizes of 32 with GGML_CUDA_MAX_DEVICES for touched devices tracking. - Document unconditional call to tape_replay_conv_gpu as internally gated. - Update test-dflash-plumbing.cpp regex assertions for renamed gating function. --- common/speculative.cpp | 11 +------- ggml/src/ggml-cuda/cross-ring-interleave.cu | 31 +++++++++++++++++++++ include/llama.h | 6 ++-- src/llama-context.cpp | 27 +++++++++++------- src/llama-context.h | 11 -------- tests/test-dflash-plumbing.cpp | 2 +- 6 files changed, 53 insertions(+), 35 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index d1d6228f694..40df713cc00 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -101,15 +101,6 @@ static bool common_dflash_log_contract_verbose() { }(); return v; } - -static bool common_dflash_allow_multi_gpu_tape() { - static const bool enabled = [] { - const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE"); - return env && env[0] != '\0' && std::strcmp(env, "0") != 0; - }(); - return enabled; -} - static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_context * ctx_dft) { if (!common_dflash_gpu_ring_env_enabled()) { LOG_INF("dflash: GPU cross ring disabled by GGML_DFLASH_GPU_RING=0; using CPU hidden capture\n"); @@ -119,7 +110,7 @@ static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_contex const int32_t n_tgt_devices = ctx_tgt ? llama_model_n_devices(llama_get_model(ctx_tgt)) : 1; const int32_t n_dft_devices = ctx_dft ? llama_model_n_devices(llama_get_model(ctx_dft)) : 1; if (n_tgt_devices > 1 || n_dft_devices > 1) { - if (common_dflash_allow_multi_gpu_tape()) { + if (llama_dflash_allow_multi_gpu_tape()) { LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); enabling experimental GPU cross ring and graph hidden capture (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n", n_tgt_devices, n_dft_devices); } else { diff --git a/ggml/src/ggml-cuda/cross-ring-interleave.cu b/ggml/src/ggml-cuda/cross-ring-interleave.cu index 50bb13f50ef..87fe15517af 100644 --- a/ggml/src/ggml-cuda/cross-ring-interleave.cu +++ b/ggml/src/ggml-cuda/cross-ring-interleave.cu @@ -7,6 +7,11 @@ #include #include +#ifndef GGML_CUDA_MAX_DEVICES +#define GGML_CUDA_MAX_DEVICES 16 +#endif + + // GPU cross-attention ring buffer for DFlash speculative decoding. static bool dflash_cuda_debug_enabled() { @@ -221,6 +226,32 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d( int first = ring->ring_size - pos; const bool is_peer = (attr.device != ring->device); if (is_peer) { + static bool peer_enabled[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_DEVICES] = { { false } }; + if (attr.device >= 0 && attr.device < GGML_CUDA_MAX_DEVICES && + ring->device >= 0 && ring->device < GGML_CUDA_MAX_DEVICES) { + if (!peer_enabled[attr.device][ring->device]) { + int can_access = 0; + cudaDeviceCanAccessPeer(&can_access, attr.device, ring->device); + if (can_access) { + (void)cudaSetDevice(attr.device); + cudaDeviceEnablePeerAccess(ring->device, 0); + cudaGetLastError(); + } + peer_enabled[attr.device][ring->device] = true; + } + if (!peer_enabled[ring->device][attr.device]) { + int can_access = 0; + cudaDeviceCanAccessPeer(&can_access, ring->device, attr.device); + if (can_access) { + (void)cudaSetDevice(ring->device); + cudaDeviceEnablePeerAccess(attr.device, 0); + cudaGetLastError(); + } + peer_enabled[ring->device][attr.device] = true; + } + (void)cudaSetDevice(ring->device); + } + if (first >= n_tokens) { cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device, (size_t)n_tokens * stride, cudaStreamPerThread); diff --git a/include/llama.h b/include/llama.h index c854b8a484f..3e479aaa2e6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1095,11 +1095,11 @@ extern "C" { // if layer_ids are configured. GPU buffers and tape state are preserved. LLAMA_API void llama_set_dflash_capture_active(struct llama_context * ctx, bool active); - // DFlash: enable graph-embedded GPU hidden/tape capture for target decode. - // Disable this before decode when the drafter cannot consume GPU cross-ring - // tensors directly, so the eval callback keeps CPU hidden buffers populated. LLAMA_API void llama_set_dflash_gpu_capture(struct llama_context * ctx, bool enabled); + // DFlash: check whether multi-GPU recurrent tape allocation is allowed + LLAMA_API bool llama_dflash_allow_multi_gpu_tape(); + // DFlash: set drafter sampling temperature (Gumbel-max trick) // temp=0: greedy argmax (default), temp>0: sample from softmax(logits/temp) LLAMA_API void llama_set_dflash_sample_temp(struct llama_context * ctx, float temp); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index adad1ef8023..1139174f958 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -15,6 +15,11 @@ #include "dflash-profile.h" #include "llama.h" +#ifndef GGML_CUDA_MAX_DEVICES +#define GGML_CUDA_MAX_DEVICES 16 +#endif + + #include "ggml-alloc.h" #include @@ -1138,7 +1143,7 @@ static bool dflash_profile_sync_split_enabled() { } // Controls the multi-GPU speculative decoding recurrent tape path. Independent of GGML_DFLASH_GPU_RING. -static bool dflash_allow_multi_gpu_tape() { +bool llama_dflash_allow_multi_gpu_tape() { static const bool enabled = [] { const char * env = std::getenv("GGML_DFLASH_ALLOW_MULTI_GPU_TAPE"); return env && env[0] != '\0' && std::strcmp(env, "0") != 0; @@ -1146,6 +1151,7 @@ static bool dflash_allow_multi_gpu_tape() { return enabled; } + static void dflash_log_decode_seq_state( const char * where, const llama_ubatch & ubatch, @@ -1693,9 +1699,9 @@ void llama_context::set_dflash_gpu_capture(bool enabled) { if (!dflash_capture) { return; } - dflash_capture->gpu_capture_enabled = - enabled || (model.n_devices() > 1 && dflash_allow_multi_gpu_tape()); + enabled || (model.n_devices() > 1 && llama_dflash_allow_multi_gpu_tape()); + if (!enabled && dflash_capture->gpu_capture_enabled) { LLAMA_LOG_INFO("%s: forcing GPU capture enabled for multi-GPU tape support (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n", __func__); } @@ -1874,8 +1880,7 @@ void llama_context::allocate_tape_gpu(int n_slots, int max_tokens) { dflash_capture->tapes.clear(); return; } - - if (model.n_devices() > 1 && !dflash_allow_multi_gpu_tape()) { + if (model.n_devices() > 1 && !llama_dflash_allow_multi_gpu_tape()) { dflash_capture->hidden_gpu.clear(); dflash_capture->tapes.clear(); if (!dflash_capture->multi_gpu_capture_fallback_logged) { @@ -3109,7 +3114,7 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, std::vector launches; launches.reserve(rec_ids.size()); - bool touched_devices[32] = {false}; + bool touched_devices[GGML_CUDA_MAX_DEVICES] = {false}; for (size_t li = 0; li < rec_ids.size(); ++li) { const int il = rec_ids[li]; @@ -3138,7 +3143,7 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, if (!fn_ptr_device(r_tensor->data, &r_dev)) { return false; } - if (r_dev < 0 || r_dev >= 32) { + if (r_dev < 0 || r_dev >= GGML_CUDA_MAX_DEVICES) { return false; } @@ -3168,7 +3173,7 @@ bool llama_context::tape_replay_conv_gpu(llama_memory_recurrent * mem_recurrent, } } - for (int dev = 0; dev < 32; ++dev) { + for (int dev = 0; dev < GGML_CUDA_MAX_DEVICES; ++dev) { if (touched_devices[dev]) { if (!fn_sync_device(dev)) { return false; @@ -3367,6 +3372,8 @@ void llama_context::tape_replay_conv(llama_memory_recurrent * mem_recurrent, int auto & tape_layers = dflash_capture->tape_layers; const uint32_t n_embd_r = hparams.n_embd_r(); + // Attempt GPU conv replay path unconditionally. It handles its own gating checks internally + // and returns false to fallback if multi-device tape, GPU backend, or pointers are not eligible. if (tape_replay_conv_gpu(mem_recurrent, cell_idx, n_accepted)) { return; } @@ -5864,11 +5871,11 @@ int llama_context::decode(const llama_batch & batch_inp) { } dflash_capture->ubatch = &ubatch; } else { - dflash_gpu_capture_ready = (model.n_devices() <= 1 || dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled; + dflash_gpu_capture_ready = (model.n_devices() <= 1 || llama_dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled; const bool dflash_gpu_tape_ready_allowed = dflash_gpu_capture_ready || (model.n_devices() > 1 && dflash_capture->gpu_capture_enabled && - dflash_allow_multi_gpu_tape()); + llama_dflash_allow_multi_gpu_tape()); dflash_capture->ubatch = &ubatch; cparams.hidden_gpu_n_seqs = 0; dflash_clear_prefill_cparams(cparams); diff --git a/src/llama-context.h b/src/llama-context.h index 30381633c0e..82b7b678b87 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -125,20 +125,11 @@ struct dflash_tape_gpu { } }; -struct dflash_hidden_gpu_layer { - ggml_tensor * hidden = nullptr; - ggml_backend_buffer_t buf = nullptr; - ggml_context * ctx = nullptr; - ggml_backend_dev_t dev = nullptr; -}; - struct dflash_hidden_gpu { std::vector layers; // one [n_embd, max_tokens] tensor per captured layer std::vector layer_ids; std::vector bufs; std::vector ctxs; - ggml_backend_buffer_t buf = nullptr; - ggml_context * ctx = nullptr; int64_t n_embd = 0; int max_tokens = 0; int n_tokens = 0; @@ -150,8 +141,6 @@ struct dflash_hidden_gpu { for (auto c : ctxs) { if (c) ggml_free(c); } - if (buf) ggml_backend_buffer_free(buf); - if (ctx) ggml_free(ctx); } }; diff --git a/tests/test-dflash-plumbing.cpp b/tests/test-dflash-plumbing.cpp index d2828f1c960..f74bbab938a 100644 --- a/tests/test-dflash-plumbing.cpp +++ b/tests/test-dflash-plumbing.cpp @@ -363,7 +363,7 @@ int main(int argc, char ** argv) { ok &= expect(context_cpp.find("dflash_graph_hidden_ready =\n !dflash_capture->hidden_gpu.empty()") != std::string::npos, "GPU hidden graph capture must not depend on active tape recording"); ok &= expect(context_cpp.find("dflash_tape_gpu * graph_tp = dflash_graph_tape_ready ? tp : nullptr") != std::string::npos, "GPU tape graph pointers must be disabled when tape recording is inactive"); ok &= expect(context_cpp.find("multi-GPU target detected") != std::string::npos, "multi-GPU target must fall back from graph-embedded DFlash GPU capture"); - ok &= expect(context_cpp.find("dflash_gpu_capture_ready = (model.n_devices() <= 1 || dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;") != std::string::npos, "DFlash graph capture must be gated to single-GPU or multi-GPU tape mode and explicit GPU capture policy"); + ok &= expect(context_cpp.find("dflash_gpu_capture_ready = (model.n_devices() <= 1 || llama_dflash_allow_multi_gpu_tape()) && dflash_capture->gpu_capture_enabled;") != std::string::npos, "DFlash graph capture must be gated to single-GPU or multi-GPU tape mode and explicit GPU capture policy"); ok &= expect(context_cpp.find("const bool multi_gpu_target = model.n_devices() > 1;") != std::string::npos, "DFlash replay must detect multi-GPU target placement before choosing GPU replay"); ok &= expect(context_cpp.find("exact CUDA DFlash replay unavailable, using CPU recurrent replay fallback") != std::string::npos, "DFlash replay must log only the multi-GPU CPU replay fallback"); ok &= expect(context_cpp.find("tape_replay_cpu(mem_recurrent, cell_idx, n_accepted);\n tape_replay_conv(mem_recurrent, cell_idx, n_accepted, seq_id);") != std::string::npos,