Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# and cmake/gpu.cmake for more details
cmake_minimum_required(VERSION 3.28)
project(gpu)

set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with
# LSP
set(CMAKE_CXX_STANDARD 20)
Expand All @@ -23,6 +23,24 @@ endif()
include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake")
include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake")

# Link the Dawn WebGPU implementation into the core gpu library.
target_link_libraries(gpu PRIVATE webgpu_dawn)

add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp)
# Scope the test headers to the test target only; a bare
# include_directories() would leak the path into every target
# declared after this point in the directory.
target_include_directories(test_gpu PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/test)
target_link_libraries(test_gpu PRIVATE gpu)

# Platform-specific post-build actions (e.g. copying DLLs for MSVC)
if(MSVC)
  add_custom_command(
    TARGET test_gpu POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
            ${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll
            $<TARGET_FILE_DIR:test_gpu>
    COMMENT "Copying webgpu_dawn.dll to the build directory"
    VERBATIM # platform-independent command-line escaping
  )
endif()

# Thin shared library wrapping the static gpu library; LINKER_LANGUAGE
# must be forced because the only listed source is a header.
add_library(gpud SHARED gpu.hpp)
set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(gpud PRIVATE gpu)
2 changes: 2 additions & 0 deletions cmake/dawn.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ if(EMSCRIPTEN)
set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "")
set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "")
set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE)
else()
add_compile_definitions(USE_DAWN_API)
endif()

# Enable find for no dawn rebuilds with flutter run
Expand Down
3 changes: 2 additions & 1 deletion cmake/example.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,14 @@ if(EMSCRIPTEN)
# Set Emscripten-specific link flags that enable WASM output and expose certain symbols.
# Needed to use updated version, emdawnwebgpu
set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\
-O3 \
-sUSE_WEBGPU=0 \
-sWASM=1 \
-DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \
-sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \
-sEXPORTED_RUNTIME_METHODS=ccall \
-sUSE_GLFW=3 \
-sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \
-sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \
-sASYNCIFY \
--js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \
--js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \
Expand Down
3 changes: 2 additions & 1 deletion cmake/gpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS})
# The gpu library is header-driven, so the linker language is set explicitly.
set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX)
# Consumers of gpu need the project root on their include path.
target_include_directories(gpu PUBLIC "${PROJECT_ROOT}")
if(NOT EMSCRIPTEN)
  # Native builds: generated Dawn headers plus the Dawn source includes.
  target_include_directories(gpu PUBLIC
    "${DAWN_BUILD_DIR}/gen/include/"
    "${DAWN_BUILD_DIR}/gen/include/dawn/"
    "${DAWN_DIR}/include/"
  )
else()
  # Web builds: headers generated by the emdawnwebgpu port.
  target_include_directories(gpu PUBLIC
    "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/"
    "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/"
  )
endif()

2 changes: 1 addition & 1 deletion cmake/templates/index.html.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
if (typeof Module !== 'undefined') {
Module.onRuntimeInitialized = function() {
// Optionally, pass arguments to main in an array.
Module._main([]);
Module.ccall('main', 'number', [], [], { async: true });
};
} else {
console.error('Module is undefined. Check that your generated JS file is loaded properly.');
Expand Down
78 changes: 78 additions & 0 deletions docs/gpuflow.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# GPU.cpp Lifecycle

```mermaid
flowchart TD
%% Data Preparation & Upload
subgraph "Data Preparation & Upload"
A["CPU Data"]
B["Define Data Properties<br>(shape, type, size)"]
C["Create GPU Buffer<br>(allocate raw buffer)"]
D["Create Tensor<br>(allocates Array with one<br> or more buffers<br>and associates Shape)"]

E["Upload Data via toGPU <br>(raw buffer)<br>toGPU<br>(ctx, data, buffer, size)"]
F["Upload Data via toGPU<br>(Tensor overload)<br>toGPU(ctx, data, tensor)"]
G["Optional: <br> Kernel Parameters<br>toGPU(ctx, params, Kernel)"]
end

%% Buffer Setup & Bindings
subgraph "Buffer & Binding Setup"
H["Define Bindings<br>(Bindings, TensorView)"]
I["Map GPU buffers<br> to shader bindings<br>(Collection from Tensor<br> or single buffers)"]
end

%% Kernel Setup & Execution
subgraph "Kernel Setup & Execution"
J["Define KernelCode<br>(WGSL template, workgroup size, precision)"]
K["Create Kernel"]
L["Dispatch Kernel"]
end

%% GPU Execution & Result Readback
subgraph "GPU Execution & Result Readback"
M["Kernel Execution<br>(GPU shader runs)"]
N["Readback Data<br>(toCPU variants)"]
end

%% Context & Resources
O["Context<br>(Device, Queue,<br>TensorPool, KernelPool)"]

%% Flow Connections
A --> B
B --> C
B --> D
C --> E
D --> F
F --> H
E --> H
H --> I
I --> K
J --> K
G --- K
K --> L
L --> M
M --> N

%% Context shared by all stages
O --- D
O --- E
O --- F
O --- K
O --- L
O --- N
```

• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`.
• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};`
• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel.
• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup.
• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration.
• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.).
• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains `WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor.

`gpu::Tensor` Ranks:
Rank 0: Scalar
Rank 1: Vector
Rank 2: Matrix
Rank 3: 3D Tensor (or Cube)
Rank 4: 4D Tensor
Rank (max 8): Higher Dimensional Tensors
11 changes: 7 additions & 4 deletions examples/hello_world/run.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,24 @@ int main(int argc, char **argv) {
printf("--------------\n\n");

// std::unique_ptr<Context> ctx = createContext();
#ifdef USE_DAWN_API
Context ctx = createContextByGpuIdx(0);
auto adaptersList = listAdapters(ctx);
LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str());
#else
Context ctx = createContext();
#endif
static constexpr size_t N = 10000;
std::array<float, N> inputArr, outputArr;
for (int i = 0; i < N; ++i) {
inputArr[i] = static_cast<float>(i) / 10.0; // dummy input data
}
Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
Tensor output = createTensor(ctx, Shape{N}, kf32);
std::promise<void> promise;
std::future<void> future = promise.get_future();
Kernel op = createKernel(ctx, {kGelu, 256, kf32},
Bindings{input, output},
{cdiv(N, 256), 1, 1});
dispatchKernel(ctx, op, promise);
wait(ctx, future);
dispatchKernel(ctx, op);
toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
for (int i = 0; i < 12; ++i) {
printf(" gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]);
Expand Down
6 changes: 2 additions & 4 deletions examples/render/run.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,8 @@ int main(int argc, char **argv) {
cdiv({NCOLS, NROWS, 1}, wgSize), params);
printf("\033[2J\033[H");
while (true) {
std::promise<void> promise;
std::future<void> future = promise.get_future();
dispatchKernel(ctx, renderKernel, promise);
wait(ctx, future);

dispatchKernel(ctx, renderKernel);
toCPU(ctx, devScreen, screen.data(), sizeof(screen));
params.time = getCurrentTimeInMilliseconds() - zeroTime;

Expand Down
Loading