
Commit 70d9802

tests toCPU, adds offset, adds gpuflow doc, default cmakelists builds test/test_gpu.cpp
1 parent 95e587d

File tree

4 files changed: +357 -42 lines changed

CMakeLists.txt

+18

```diff
@@ -23,6 +23,24 @@ endif()
 include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake")
 include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake")
 
+target_link_libraries(gpu PRIVATE webgpu_dawn)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test)
+
+add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp)
+target_link_libraries(test_gpu PRIVATE gpu)
+
+# Platform-specific post-build actions (e.g. copying DLLs for MSVC)
+if(MSVC)
+  add_custom_command(
+    TARGET test_gpu POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+            ${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll
+            $<TARGET_FILE_DIR:test_gpu>
+    COMMENT "Copying webgpu_dawn.dll to the build directory"
+  )
+endif()
+
 add_library(gpud SHARED gpu.hpp)
 set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX)
 target_link_libraries(gpud PRIVATE gpu)
```
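With these additions, a default configure and build (for example `cmake -B build` followed by `cmake --build build`) also compiles `test/test_gpu.cpp` into a `test_gpu` executable linked against `gpu`; on MSVC, the post-build step copies `webgpu_dawn.dll` next to the binary so the test can run from the build tree.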

docs/gpuflow.md

+78 (new file, shown in full)

# GPU.cpp Lifecycle

```mermaid
flowchart TD
    %% Data Preparation & Upload
    subgraph "Data Preparation & Upload"
        A["CPU Data"]
        B["Define Data Properties<br>(shape, type, size)"]
        C["Create GPU Buffer<br>(allocate raw buffer)"]
        D["Create Tensor<br>(allocates Array with one<br>or more buffers<br>and associates Shape)"]

        E["Upload Data via toGPU<br>(raw buffer)<br>toGPU(ctx, data, buffer, size)"]
        F["Upload Data via toGPU<br>(Tensor overload)<br>toGPU(ctx, data, tensor)"]
        G["Optional: Upload Kernel Parameters<br>toGPU(ctx, params, Kernel)"]
    end

    %% Buffer Setup & Bindings
    subgraph "Buffer & Binding Setup"
        H["Define Bindings<br>(Bindings, TensorView)"]
        I["Map GPU buffers<br>to shader bindings<br>(Collection from Tensor<br>or single buffers)"]
    end

    %% Kernel Setup & Execution
    subgraph "Kernel Setup & Execution"
        J["Define KernelCode<br>(WGSL template, workgroup size, precision)"]
        K["Create Kernel"]
        L["Dispatch Kernel"]
    end

    %% GPU Execution & Result Readback
    subgraph "GPU Execution & Result Readback"
        M["Kernel Execution<br>(GPU shader runs)"]
        N["Readback Data<br>(toCPU variants)"]
    end

    %% Context & Resources
    O["Context<br>(Device, Queue,<br>TensorPool, KernelPool)"]

    %% Flow Connections
    A --> B
    B --> C
    B --> D
    C --> E
    D --> F
    F --> H
    E --> H
    H --> I
    I --> K
    J --> K
    G --- K
    K --> L
    L --> M
    M --> N

    %% Context shared by all stages
    O --- D
    O --- E
    O --- F
    O --- K
    O --- L
    O --- N
```

- Rank 0: Scalar
- Rank 1: Vector
- Rank 2: Matrix
- Rank 3: 3D Tensor (or Cube)
- Rank 4: 4D Tensor
- Rank n: Higher-dimensional tensors
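In code, these ranks correspond to `gpu::Shape` values. A minimal sketch, assuming the `Shape`, `createTensor`, and `kf32` names used elsewhere in the library (treat the exact signatures as illustrative):

```cpp
#include "gpu.hpp"
using namespace gpu;

void ranksDemo(Context &ctx) {
  Shape vec{256};         // rank 1: a 256-element vector
  Shape mat{64, 32};      // rank 2: a 64 x 32 matrix
  Shape cube{16, 16, 16}; // rank 3: a 16 x 16 x 16 volume
  // The rank lives in the Shape; the Tensor pairs it with a GPU-side Array.
  Tensor t = createTensor(ctx, mat, kf32);
}
```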
- The `gpu::Array` (which wraps a GPU buffer with its usage flags and size) and the `gpu::Shape` (which defines dimensions and rank) are combined during creation to produce a `gpu::Tensor`.
- A `gpu::TensorView` provides a non-owning view into a slice of a `gpu::Tensor`, e.g. `TensorView view = {tensor, 0, 256};`.
- `gpu::Bindings` collect multiple Tensors (or TensorViews), along with view offset/size information, for use in a kernel.
- The `gpu::TensorPool` (managed by the Context) is responsible for tensor lifetimes and GPU resource cleanup.
- `gpu::KernelCode` contains the WGSL shader template plus the metadata (workgroup size, precision, label, and entry point) that drives kernel configuration.
- The `gpu::createKernelAsync`/`gpu::createKernel` functions (within the execution flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.).
- `gpu::KernelCode`'s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` ultimately draws on the underlying `gpu::Array` (WGPUBuffer, WGPUBufferUsage, size_t) and `gpu::Shape` data from the created Tensor.
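Read top to bottom, the chart is straight-line host code. Below is a minimal end-to-end sketch assuming the entry points named in the chart (`createContext`, `createTensor`, `toGPU`, `createKernel`, `dispatchKernel`, `wait`, `toCPU`) and the `kf32` precision tag; signatures are illustrative and the WGSL shader is an example, not part of the library:

```cpp
#include "gpu.hpp"
#include <array>
#include <future>

using namespace gpu;

int main() {
  // Context: device, queue, TensorPool, KernelPool.
  Context ctx = createContext();

  // Data preparation & upload: CPU data + Shape -> Tensor, then toGPU.
  std::array<float, 4> input{1.0f, 2.0f, 3.0f, 4.0f};
  Tensor x = createTensor(ctx, Shape{4}, kf32);
  Tensor y = createTensor(ctx, Shape{4}, kf32);
  toGPU(ctx, input.data(), x); // Tensor overload of toGPU

  // Kernel setup: WGSL template + workgroup size + precision = KernelCode.
  static const char *kDouble = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<f32>;
@group(0) @binding(1) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  if (gid.x < arrayLength(&inp)) {
    out[gid.x] = 2.0 * inp[gid.x];
  }
})";
  Kernel k = createKernel(ctx, {kDouble, /*workgroupSize=*/64, kf32},
                          Bindings{x, y},          // map buffers to bindings
                          /*totalWorkgroups=*/{1, 1, 1});

  // Dispatch (promise/future form, as used elsewhere in gpu.cpp), then wait.
  std::promise<void> done;
  std::future<void> signal = done.get_future();
  dispatchKernel(ctx, k, done);
  wait(ctx, signal);

  // Readback: blocking toCPU variant from this commit.
  std::array<float, 4> output;
  toCPU(ctx, y, output.data(), sizeof(output));
  return 0;
}
```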

gpu.hpp

+68 -42
```diff
@@ -16,7 +16,6 @@
 #include <utility> // std::pair
 #include <vector>
 
-
 #ifdef __EMSCRIPTEN__
 #include "emscripten/emscripten.h"
 #endif
@@ -1106,7 +1105,7 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {},
  * @param ctx The Context containing the WebGPU instance handle.
  * @return std::vector<dawn::native::Adapter> A vector of available GPU
  * adapters.
- * 
+ *
  * @code
  * std::vector<dawn::native::Adapter> adapters = getAdapters(ctx);
  * @endcode
@@ -1118,21 +1117,25 @@ inline std::vector<dawn::native::Adapter> getAdapters(Context &ctx) {
 }
 
 /**
- * @brief Formats the given vector of Dawn adapters into a single concatenated string.
+ * @brief Formats the given vector of Dawn adapters into a single concatenated
+ * string.
  *
- * This function iterates over each Dawn adapter in the provided vector, retrieves its
- * description using the WebGPU API, and converts the description from a WGPUStringView
- * to an std::string using the formatWGPUStringView helper. The resulting descriptions
- * are concatenated into a single string separated by newline characters.
+ * This function iterates over each Dawn adapter in the provided vector,
+ * retrieves its description using the WebGPU API, and converts the description
+ * from a WGPUStringView to an std::string using the formatWGPUStringView
+ * helper. The resulting descriptions are concatenated into a single string
+ * separated by newline characters.
  *
  * @param adapters A vector of Dawn adapters obtained from a WebGPU instance.
- * @return std::string A newline-delimited string listing each adapter's description.
- * 
+ * @return std::string A newline-delimited string listing each adapter's
+ * description.
+ *
  * @code
  * std::string adapterList = formatAdapters(adapters);
  * @endcode
  */
-inline std::string formatAdapters(const std::vector<dawn::native::Adapter> &adapters) {
+inline std::string
+formatAdapters(const std::vector<dawn::native::Adapter> &adapters) {
   std::string adapterList;
   for (size_t i = 0; i < adapters.size(); ++i) {
     auto adapterPtr = adapters[i].Get();
@@ -1157,7 +1160,7 @@ inline std::string formatAdapters(const std::vector<dawn::native::Adapter> &adap
  * @param ctx The Context containing the WebGPU instance handle.
  * @return std::string A newline-delimited string listing each adapter's
  * description.
- * 
+ *
  * @code
  * std::string adapterList = listAdapters(ctx);
  * @endcode
@@ -1181,7 +1184,7 @@ inline std::string listAdapters(Context &ctx) {
  * @param devDescriptor Device descriptor for the WebGPU device (optional)
  * @return std::future<Context> A future that will eventually hold the created
  * Context.
- * 
+ *
  * @code
  * std::future<Context> contextFuture = createContextByGpuIdxAsync(0);
  * Context ctx = waitForContextFuture(contextFuture);
@@ -1270,9 +1273,9 @@ createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {},
  * Context ctx = createContextByGpuIdx(0);
  * @endcode
  */
-inline Context createContextByGpuIdx(int gpuIdx,
-                                     const WGPUInstanceDescriptor &desc = {},
-                                     const WGPUDeviceDescriptor &devDescriptor = {}) {
+inline Context
+createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {},
+                      const WGPUDeviceDescriptor &devDescriptor = {}) {
   std::future<Context> contextFuture =
       createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor);
   return waitForContextFuture<Context>(contextFuture);
@@ -1365,17 +1368,19 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
 /**
  * @brief Copies data from a GPU buffer to CPU memory.
  * @param[in] ctx Context instance to manage the operation
- * @param[in] tensor Tensor instance representing the GPU buffer to copy from
  * @param[out] data Pointer to the CPU memory to copy the data to
  * @param[in] bufferSize Size of the data buffer in bytes
  * @param[in] op StagingBuffer instance to manage the operation
+ * @param[in] sourceOffset Offset in the GPU buffer to start copying from.
  *
  * @code
  * toCPU(ctx, tensor, data, bufferSize);
  * @endcode
  */
-inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
-                                    size_t bufferSize, CopyData &op) {
+
+// NOTE: I think this one is redundant? CopyData not used externally.
+inline std::future<void> toCPUAsync(Context &ctx, void *data, size_t bufferSize,
+                                    CopyData &op, size_t sourceOffset = 0) {
   // Submit the command buffer and release it.
   wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
   wgpuCommandBufferRelease(op.commandBuffer);
@@ -1388,8 +1393,8 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   CallbackData *cbData = new CallbackData{
       op.readbackBuffer, // The GPU buffer to be read back.
       bufferSize,
-      data,   // CPU memory destination.
-      promise // The promise to be signaled.
+      data,    // CPU memory destination.
+      promise, // The promise to be signaled.
   };
 
   // Set up the work-done callback to initiate the buffer mapping.
@@ -1402,6 +1407,11 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   // Begin the asynchronous chain by registering the queue work-done callback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
+  // Release the readback buffer as it is no longer needed.
+  if (op.readbackBuffer) {
+    wgpuBufferRelease(op.readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
@@ -1417,11 +1427,13 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
  *
  * @param[in] ctx Context instance to manage the operation
  * @param[in] tensor Tensor instance representing the GPU buffer to copy from
- * @param[in] bufferSize Size of the data buffer in bytes
+ * @param[in] bufferSize Number of bytes to read into the output data.
  * @param[out] data Pointer to the CPU memory to copy the data to
+ * @param[in] sourceOffset Offset in the GPU buffer to start copying from.
  */
 inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
-                                    size_t bufferSize) {
+                                    size_t bufferSize,
+                                    size_t sourceOffset = 0) {
   // Create a promise that will later be satisfied when the async copy
   // completes.
   auto promise = std::make_shared<std::promise<void>>();
@@ -1430,16 +1442,17 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   WGPUBufferDescriptor readbackBufferDescriptor = {
       .label = {.data = nullptr, .length = 0},
       .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
-      .size = bufferSize,
+      .size = bufferSize, // Size of the readback buffer.
   };
   WGPUBuffer readbackBuffer =
       wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
 
   // Create a command encoder and record a copy from the tensor GPU buffer
   WGPUCommandEncoder commandEncoder =
       wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
-  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
-                                       readbackBuffer, 0, bufferSize);
+  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer,
+                                       sourceOffset, readbackBuffer, 0,
+                                       bufferSize);
   // Finish recording by creating a command buffer and release the encoder.
   WGPUCommandBuffer commandBuffer =
       wgpuCommandEncoderFinish(commandEncoder, nullptr);
@@ -1472,13 +1485,16 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   // queueWorkDoneCallback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
+  if (readbackBuffer) {
+    wgpuBufferRelease(readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
 inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
-                                    size_t size) {
-  // The size (in bytes) for the copy.
-  uint64_t bufferSize = size;
+                                    size_t bufferSize,
+                                    size_t sourceOffset = 0) {
 
   // Create an operation structure (here we reuse CopyData solely for its
   // members that we need to create a readback buffer and command buffer).
@@ -1503,7 +1519,7 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
   {
     WGPUCommandEncoder commandEncoder =
         wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
-    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0,
+    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, sourceOffset,
                                          op.readbackBuffer, 0, bufferSize);
     op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
     wgpuCommandEncoderRelease(commandEncoder);
@@ -1516,10 +1532,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
 
   // Allocate callback data
   CallbackData *cbData = new CallbackData{
-      op.readbackBuffer,               // The readback buffer created above.
-      static_cast<size_t>(bufferSize), // Size of the copy.
-      data,                            // Destination CPU memory.
-      promise                          // Our promise to satisfy when done.
+      op.readbackBuffer, // The readback buffer created above.
+      bufferSize,        // Size of the copy.
+      data,              // Destination CPU memory.
+      promise            // Our promise to satisfy when done.
   };
 
   // Set up the queue work-done callback info.
@@ -1532,6 +1548,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
   // Start the asynchronous chain by registering the work-done callback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
+  if (op.readbackBuffer) {
+    wgpuBufferRelease(op.readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
@@ -1548,9 +1568,11 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
  * @endcode
  */
 template <size_t N>
-inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
-                                    std::array<float, N> &data) {
-  return toCPUAsync(ctx, tensor, data.data(), sizeof(data));
+inline std::future<void>
+toCPUAsync(Context &ctx, Tensor &tensor, std::array<float, N> &data,
+           size_t sourceOffset = 0) {
+  return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset);
 }
 
@@ -1571,8 +1593,10 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
  * toCPU(ctx, tensor, data, bufferSize, instance);
  * @endcode
  */
-inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
-  auto future = toCPUAsync(ctx, tensor, data, bufferSize);
+inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
+                  size_t sourceOffset = 0) {
+  auto future = toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset);
   wait(ctx, future);
 }
 
@@ -1593,8 +1617,9 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
  * toCPU(ctx, buffer, data, size, instance);
  * @endcode
  */
-inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
-  auto future = toCPUAsync(ctx, buffer, data, size);
+inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size,
+                  size_t sourceOffset = 0) {
+  auto future = toCPUAsync(ctx, buffer, data, size, sourceOffset);
   wait(ctx, future);
 }
 
@@ -1616,8 +1641,9 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
 template <size_t N>
-inline void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
-  auto future = toCPUAsync(ctx, tensor, data);
+inline void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data,
+                  size_t sourceOffset = 0) {
+  auto future = toCPUAsync(ctx, tensor, data, sourceOffset);
   wait(ctx, future);
 }
```
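Taken together, the `sourceOffset` parameter threaded through these overloads enables partial readback from a region of a GPU buffer. A usage sketch against the functions defined in this diff (assuming `ctx` and `tensor` are already set up as in the lifecycle doc):

```cpp
#include "gpu.hpp"
#include <array>

using namespace gpu;

void partialReadback(Context &ctx, Tensor &tensor) {
  // Blocking: copy 256 floats starting 1024 bytes into the tensor's buffer.
  std::array<float, 256> out;
  toCPU(ctx, tensor, out.data(), sizeof(out), /*sourceOffset=*/1024);

  // Asynchronous: start the copy, overlap CPU-side work, then wait.
  auto fut = toCPUAsync(ctx, tensor, out.data(), sizeof(out), 1024);
  // ... other CPU-side work ...
  wait(ctx, fut);
}
```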
