Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions common/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ static bool common_dflash_log_contract_verbose() {
}();
return v;
}

static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_context * ctx_dft) {
if (!common_dflash_gpu_ring_env_enabled()) {
LOG_INF("dflash: GPU cross ring disabled by GGML_DFLASH_GPU_RING=0; using CPU hidden capture\n");
Expand All @@ -111,9 +110,14 @@ static bool common_dflash_gpu_ring_allowed(llama_context * ctx_tgt, llama_contex
const int32_t n_tgt_devices = ctx_tgt ? llama_model_n_devices(llama_get_model(ctx_tgt)) : 1;
const int32_t n_dft_devices = ctx_dft ? llama_model_n_devices(llama_get_model(ctx_dft)) : 1;
if (n_tgt_devices > 1 || n_dft_devices > 1) {
LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); disabling GPU cross ring and graph hidden capture\n",
n_tgt_devices, n_dft_devices);
return false;
if (llama_dflash_allow_multi_gpu_tape()) {
LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); enabling experimental GPU cross ring and graph hidden capture (GGML_DFLASH_ALLOW_MULTI_GPU_TAPE is set)\n",
n_tgt_devices, n_dft_devices);
} else {
LOG_INF("dflash: multi-GPU placement detected (target=%d devices, drafter=%d devices); disabling GPU cross ring and graph hidden capture\n",
n_tgt_devices, n_dft_devices);
return false;
}
}

return true;
Expand Down
14 changes: 9 additions & 5 deletions ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "ggml-impl.h"

#include <assert.h>
#include <inttypes.h>
#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
Expand Down Expand Up @@ -1661,7 +1662,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
const bool async_ok = split_backend->iface.cpy_tensor_async && split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy);
if (!async_ok) {
ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
Expand Down Expand Up @@ -1702,11 +1704,13 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
return ec;
}

// TODO: pass backend to the callback, then the user can decide if they want to synchronize
ggml_backend_synchronize(split_backend);
if (need) {
// TODO: pass backend to the callback, then the user can decide if they want to synchronize
ggml_backend_synchronize(split_backend);

if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
break;
if (!sched->callback_eval(t, false, sched->callback_eval_user_data)) {
break;
}
}

j0 = j1;
Expand Down
63 changes: 54 additions & 9 deletions ggml/src/ggml-cuda/cross-ring-interleave.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
#include <cstdio>
#include <cstring>

#ifndef GGML_CUDA_MAX_DEVICES
#define GGML_CUDA_MAX_DEVICES 16
#endif


// GPU cross-attention ring buffer for DFlash speculative decoding.

static bool dflash_cuda_debug_enabled() {
Expand Down Expand Up @@ -194,11 +199,11 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d(
return false;
}
#if CUDART_VERSION >= 10000 || defined(GGML_USE_HIP)
if (attr.type != cudaMemoryTypeDevice || attr.device != ring->device) {
if (attr.type != cudaMemoryTypeDevice) {
return false;
}
#else
if (attr.memoryType != cudaMemoryTypeDevice || attr.device != ring->device) {
if (attr.memoryType != cudaMemoryTypeDevice) {
return false;
}
#endif
Expand All @@ -219,14 +224,54 @@ extern "C" bool dflash_cross_ring_gpu_write_d2d(
}

int first = ring->ring_size - pos;
if (first >= n_tokens) {
cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
(size_t)n_tokens * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
const bool is_peer = (attr.device != ring->device);
if (is_peer) {
static bool peer_enabled[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_DEVICES] = { { false } };
if (attr.device >= 0 && attr.device < GGML_CUDA_MAX_DEVICES &&
ring->device >= 0 && ring->device < GGML_CUDA_MAX_DEVICES) {
if (!peer_enabled[attr.device][ring->device]) {
int can_access = 0;
cudaDeviceCanAccessPeer(&can_access, attr.device, ring->device);
if (can_access) {
(void)cudaSetDevice(attr.device);
cudaDeviceEnablePeerAccess(ring->device, 0);
cudaGetLastError();
}
peer_enabled[attr.device][ring->device] = true;
}
if (!peer_enabled[ring->device][attr.device]) {
int can_access = 0;
cudaDeviceCanAccessPeer(&can_access, ring->device, attr.device);
if (can_access) {
(void)cudaSetDevice(ring->device);
cudaDeviceEnablePeerAccess(attr.device, 0);
cudaGetLastError();
}
peer_enabled[ring->device][attr.device] = true;
}
(void)cudaSetDevice(ring->device);
}

if (first >= n_tokens) {
cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device,
(size_t)n_tokens * stride, cudaStreamPerThread);
} else {
cudaMemcpyPeerAsync(dst + (size_t)pos * n_embd, ring->device, src, attr.device,
(size_t)first * stride, cudaStreamPerThread);
cudaMemcpyPeerAsync(dst, ring->device, src + (size_t)first * stride, attr.device,
(size_t)(n_tokens - first) * stride, cudaStreamPerThread);
}
} else {
cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
(size_t)first * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
cudaMemcpyAsync(dst, src + (size_t)first * stride,
(size_t)(n_tokens - first) * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
(void)cudaSetDevice(ring->device);
if (first >= n_tokens) {
cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
(size_t)n_tokens * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
} else {
cudaMemcpyAsync(dst + (size_t)pos * n_embd, src,
(size_t)first * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
cudaMemcpyAsync(dst, src + (size_t)first * stride,
(size_t)(n_tokens - first) * stride, cudaMemcpyDeviceToDevice, cudaStreamPerThread);
}
}

return cudaGetLastError() == cudaSuccess;
Expand Down
6 changes: 3 additions & 3 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -1095,11 +1095,11 @@ extern "C" {
// if layer_ids are configured. GPU buffers and tape state are preserved.
LLAMA_API void llama_set_dflash_capture_active(struct llama_context * ctx, bool active);

// DFlash: enable graph-embedded GPU hidden/tape capture for target decode.
// Disable this before decode when the drafter cannot consume GPU cross-ring
// tensors directly, so the eval callback keeps CPU hidden buffers populated.
LLAMA_API void llama_set_dflash_gpu_capture(struct llama_context * ctx, bool enabled);

// DFlash: check whether multi-GPU recurrent tape allocation is allowed
LLAMA_API bool llama_dflash_allow_multi_gpu_tape();

// DFlash: set drafter sampling temperature (Gumbel-max trick)
// temp=0: greedy argmax (default), temp>0: sample from softmax(logits/temp)
LLAMA_API void llama_set_dflash_sample_temp(struct llama_context * ctx, float temp);
Expand Down
Loading