From 9ac780bef1c6813f43855f7d9d7d33a733876c45 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Wed, 19 Feb 2025 16:30:50 -0600
Subject: [PATCH 01/17] refactors async

---
 cmake/templates/index.html.in |   2 +-
 examples/hello_world/run.cpp  |  10 +-
 gpu.hpp                       | 838 ++++++++++++++++++++++------------
 numeric_types/half.cpp        |  15 +-
 4 files changed, 550 insertions(+), 315 deletions(-)

diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in
index b6f130c..6b5957b 100644
--- a/cmake/templates/index.html.in
+++ b/cmake/templates/index.html.in
@@ -12,7 +12,7 @@
     if (typeof Module !== 'undefined') {
       Module.onRuntimeInitialized = function() {
         // Optionally, pass arguments to main in an array.
-        Module._main([]);
+        Module.ccall('main', 'number', [], [], { async: true });
       };
     } else {
       console.error('Module is undefined. Check that your generated JS file is loaded properly.');
diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp
index 7453869..06970a7 100644
--- a/examples/hello_world/run.cpp
+++ b/examples/hello_world/run.cpp
@@ -38,12 +38,14 @@ int main(int argc, char **argv) {
   Tensor output = createTensor(ctx, Shape{N}, kf32);
   std::promise<void> promise;
   std::future<void> future = promise.get_future();
-  Kernel op = createKernel(ctx, {kGelu, 256, kf32},
+  std::future<Kernel> kernelFuture = createKernel(ctx, {kGelu, 256, kf32},
                            Bindings{input, output},
                            {cdiv(N, 256), 1, 1});
-  dispatchKernel(ctx, op, promise);
-  wait(ctx, future);
-  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+  Kernel op = waitForFuture(ctx.instance, kernelFuture);
+  std::future<void> dispatchFuture = dispatchKernel(ctx, op);
+  waitForFuture(ctx.instance, dispatchFuture);
+  std::future<void> cpuFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+  waitForFuture(ctx.instance, cpuFuture);
   for (int i = 0; i < 12; ++i) {
     printf("  gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]);
   }
diff --git a/gpu.hpp b/gpu.hpp
index edc8b38..052c674 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -1,6 +1,7 @@
 #ifndef GPU_HPP
 #define GPU_HPP
 
+#include "webgpu.h"
 #include <array>
 #include <cassert>
 #include <cstring>
 #include <future>
 #include <initializer_list>
 #include <memory>
 #include <set>
 #include <string>
@@ -15,15 +16,15 @@
 #include <utility> // std::pair
 #include <vector>
 
-#include "webgpu.h"
+#ifndef __EMSCRIPTEN__
 
-#include "numeric_types/half.hpp"
-#include "utils/logging.hpp"
-
-#ifdef __EMSCRIPTEN__
+#else
 #include "emscripten/emscripten.h"
 #endif
 
+#include "numeric_types/half.hpp"
+#include "utils/logging.hpp"
+
 #ifdef USE_DAWN_API
 #include "dawn/native/DawnNative.h"
 #endif
@@ -430,8 +431,8 @@ struct CallbackData {
   WGPUBuffer buffer; // managed by owning Kernel
   size_t bufferSize;
   void *output; // non-owning, only for target memory in toCPU, not used for
-                // kernel invocations
-  std::promise<void> *promise;
+                // kernel invocations
+  std::shared_ptr<std::promise<void>> promise;
   std::future<void> *future;
 };
@@ -530,32 +531,27 @@ struct Context {
   // Default constructor
   Context() = default;
 
-  Context(Context&& other) noexcept
-    : instance(other.instance),
-      adapter(other.adapter),
-      device(other.device),
+  Context(Context &&other) noexcept
+      : instance(other.instance), adapter(other.adapter), device(other.device),
         queue(other.queue),
         // Re‐initialize pools to point to *this*:
-      pool(this),
-      kernelPool(this),
-      adapterStatus(other.adapterStatus),
-      deviceStatus(other.deviceStatus)
-  {
+        pool(this), kernelPool(this), adapterStatus(other.adapterStatus),
+        deviceStatus(other.deviceStatus) {
     LOG(kDefLog, kTrace, "Moving Context ownership");
 
     // Move over the resources in the pools:
-    pool.data = std::move(other.pool.data);
+    pool.data = std::move(other.pool.data);
    kernelPool.data = std::move(other.kernelPool.data);
 
     // Null out handles in the source so its destructor won't release them.
     other.instance = nullptr;
-    other.adapter = nullptr;
-    other.device = nullptr;
-    other.queue = nullptr;
+    other.adapter = nullptr;
+    other.device = nullptr;
+    other.queue = nullptr;
 
     // other.adapterStatus = 0;
     // other.deviceStatus = 0;
   }
 
-  Context& operator=(Context&& other) noexcept {
+  Context &operator=(Context &&other) noexcept {
     if (this != &other) {
       // Free any existing resources. In most cases, this should be a no-op
       // since we typically shouldn't have two active initialized Context
@@ -625,7 +621,7 @@ inline Tensor createTensor(TensorPool &pool, WGPUDevice &device,
   size_t numElements = size(shape);
   size_t size = sizeBytes(dtype) * numElements;
   WGPUBufferDescriptor bufferDesc = {
-      .label = {.data = nullptr, .length = 0},
+      .label = {.data = nullptr, .length = 0},
       .usage = usage,
       .size = size,
   };
@@ -794,6 +790,162 @@ inline void check(bool condition, const char *message,
   }
 }
 
+/**
+ * @brief Pumps events until the provided future is ready.
+ *
+ * This helper template function continuously checks the status of the provided std::future<T>
+ * until it becomes ready. On Emscripten builds, it yields control to the JavaScript event loop
+ * using emscripten_sleep to allow asynchronous callbacks to execute. On other platforms, it
+ * processes events from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future
+ * is ready, its value is returned.
+ *
+ * @tparam T The type of the value contained in the future.
+ * @param instance The WGPUInstance used to process events.
+ * @param f The future to wait on.
+ * @return T The value retrieved from the ready future.
+ *
+ * @code
+ * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter, devDescriptor);
+ * WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * @endcode
+ */
+template <typename T>
+T waitForFuture(WGPUInstance instance, std::future<T> &f) {
+#ifdef __EMSCRIPTEN__
+  // Poll until the future is ready.
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    // Yield control to the JS event loop.
+    emscripten_sleep(1);
+  }
+  return f.get();
+#else
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    wgpuInstanceProcessEvents(instance);
+  }
+  return f.get();
+#endif
+}
+
+// Context Callbacks & Helpers
+
+/**
+ * @brief Adapter callback function invoked upon completion of an asynchronous WebGPU adapter request.
+ *
+ * This callback is triggered when the request for a WebGPU adapter completes. It verifies whether
+ * the adapter was successfully obtained. On failure, it logs an error message (in Emscripten builds)
+ * and sets an exception on the associated promise. On success, it sets the value of the promise with
+ * the obtained adapter. Finally, it frees the allocated memory for the promise pointer.
+ *
+ * @param status The status of the adapter request. Expected to be WGPURequestAdapterStatus_Success on success.
+ * @param adapter The WGPUAdapter obtained on a successful request.
+ * @param message A string view containing additional information about the adapter request.
+ * @param userdata1 A pointer to a heap-allocated std::shared_ptr<std::promise<WGPUAdapter>>.
+ * @param userdata2 Unused.
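+ *
+ * @code
+ * // Minimal wiring sketch; this mirrors requestAdapterAsync() below, which
+ * // owns the heap-allocated shared_ptr handed over as userdata1:
+ * auto promise = std::make_shared<std::promise<WGPUAdapter>>();
+ * auto *promisePtr = new std::shared_ptr<std::promise<WGPUAdapter>>(promise);
+ * WGPURequestAdapterCallbackInfo callbackInfo{
+ *     .mode = WGPUCallbackMode_AllowSpontaneous,
+ *     .callback = adapterCallback,
+ *     .userdata1 = promisePtr,
+ *     .userdata2 = nullptr};
+ * @endcode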
+ */
+inline void adapterCallback(WGPURequestAdapterStatus status,
+                            WGPUAdapter adapter, WGPUStringView message,
+                            void *userdata1, void * /*userdata2*/) {
+  auto *promisePtr =
+      reinterpret_cast<std::shared_ptr<std::promise<WGPUAdapter>> *>(userdata1);
+  if (status != WGPURequestAdapterStatus_Success) {
+#ifdef __EMSCRIPTEN__
+    LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s",
+        static_cast<int>(message.length), message.data);
+#endif
+    (*promisePtr)
+        ->set_exception(std::make_exception_ptr(
+            std::runtime_error("Request WebGPU adapter failed")));
+  } else {
+    (*promisePtr)->set_value(adapter);
+  }
+  delete promisePtr;
+}
+
+/**
+ * @brief Callback function invoked upon completion of an asynchronous WebGPU device request.
+ *
+ * This callback is triggered when the request for a WebGPU device completes. It verifies that
+ * the device was successfully created. On success, the callback sets the value of the associated
+ * promise; otherwise, it sets an exception. After fulfilling the promise, it frees the allocated
+ * memory for the promise pointer.
+ *
+ * @param status The status of the device request. Expected to be WGPURequestDeviceStatus_Success on success.
+ * @param device The WGPUDevice obtained on successful request.
+ * @param message A string view containing additional information about the device request.
+ * @param userdata1 A pointer to a heap-allocated std::shared_ptr<std::promise<WGPUDevice>>.
+ * @param userdata2 Unused.
+ */
+inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device,
+                           WGPUStringView message, void *userdata1,
+                           void * /*userdata2*/) {
+  auto *promisePtr =
+      reinterpret_cast<std::shared_ptr<std::promise<WGPUDevice>> *>(userdata1);
+  if (status != WGPURequestDeviceStatus_Success) {
+    (*promisePtr)
+        ->set_exception(std::make_exception_ptr(
+            std::runtime_error("Request WebGPU device failed")));
+  } else {
+    LOG(kDefLog, kTrace, "Device Request succeeded %p",
+        static_cast<void *>(device));
+    (*promisePtr)->set_value(device);
+  }
+  delete promisePtr;
+}
+
+/**
+ * @brief Asynchronously requests a WebGPU adapter from the given instance.
+ *
+ * This helper function wraps the asynchronous call to request an adapter using the WebGPU API.
+ * It sets up a promise and registers an adapter callback, returning a future that will eventually
+ * hold the requested WGPUAdapter.
+ *
+ * @param instance The WGPUInstance from which to request the adapter.
+ * @param adapterOpts The options for requesting the adapter.
+ * @return std::future<WGPUAdapter> A future that will eventually hold the created WGPUAdapter.
+ */
+inline std::future<WGPUAdapter>
+requestAdapterAsync(WGPUInstance instance,
+                    const WGPURequestAdapterOptions &adapterOpts) {
+  auto promise = std::make_shared<std::promise<WGPUAdapter>>();
+  auto *promisePtr = new std::shared_ptr<std::promise<WGPUAdapter>>(promise);
+
+  WGPURequestAdapterCallbackInfo callbackInfo{
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = adapterCallback,
+      .userdata1 = promisePtr,
+      .userdata2 = nullptr};
+  wgpuInstanceRequestAdapter(instance, &adapterOpts, callbackInfo);
+  return promise->get_future();
+}
+
+/**
+ * @brief Asynchronously requests a WebGPU device from a given adapter.
+ *
+ * This helper function wraps the asynchronous call to request a device using the WebGPU API.
+ * It sets up a promise and registers a device callback, returning a future that will be fulfilled
+ * once the device is available.
+ *
+ * @param adapter The WGPUAdapter to request the device from.
+ * @param devDescriptor The descriptor specifying the characteristics of the requested device.
+ * @return std::future<WGPUDevice> A future that will eventually hold the created WGPUDevice.
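+ *
+ * @code
+ * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter, devDescriptor);
+ * WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * @endcode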
+ */
+inline std::future<WGPUDevice>
+requestDeviceAsync(WGPUAdapter adapter,
+                   const WGPUDeviceDescriptor &devDescriptor) {
+  auto promise = std::make_shared<std::promise<WGPUDevice>>();
+  auto *promisePtr = new std::shared_ptr<std::promise<WGPUDevice>>(promise);
+
+  WGPURequestDeviceCallbackInfo deviceCallbackInfo{
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = deviceCallback,
+      .userdata1 = promisePtr,
+      .userdata2 = nullptr};
+  wgpuAdapterRequestDevice(adapter, &devDescriptor, deviceCallbackInfo);
+  return promise->get_future();
+}
+
 /**
  * @brief Factory function to create a GPU context, which aggregates WebGPU API
  * handles to interact with the GPU including the instance, adapter, device, and
@@ -812,12 +964,10 @@ inline void check(bool condition, const char *message,
  * @return Context instance representing the created GPU context
  *
  */
-inline Context createContext(
-    const WGPUInstanceDescriptor &desc = {},
-    const WGPURequestAdapterOptions &adapterOpts = {},
-    const WGPUDeviceDescriptor &devDescriptor = {})
-{
-  Context ctx; // stack-allocated
+inline Context createContext(const WGPUInstanceDescriptor &desc = {},
+                             const WGPURequestAdapterOptions &adapterOpts = {},
+                             const WGPUDeviceDescriptor &devDescriptor = {}) {
+  Context ctx; // Stack-allocated Context.
 
 #ifdef __EMSCRIPTEN__
   ctx.instance = wgpuCreateInstance(nullptr);
@@ -826,115 +976,50 @@ inline Context createContext(
 #endif
   check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__);
 
+  // Request the adapter asynchronously.
   LOG(kDefLog, kTrace, "Requesting adapter");
-  {
-    struct AdapterData {
-      WGPUAdapter adapter = nullptr;
-      bool requestEnded = false;
-      WGPURequestAdapterStatus status;
-    };
-    AdapterData adapterData;
-
-    auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status,
-                                    WGPUAdapter adapter,
-                                    WGPUStringView message,
-                                    void *pUserData, void *) {
-      auto &ad = *reinterpret_cast<AdapterData *>(pUserData);
-      ad.status = status;
-#ifdef __EMSCRIPTEN__
-      if (status != WGPURequestAdapterStatus_Success) {
-        LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s",
-            static_cast<int>(message.length), message.data);
-      }
-#endif
-      check(status == WGPURequestAdapterStatus_Success,
-            "Request WebGPU adapter", __FILE__, __LINE__);
-      ad.adapter = adapter;
-      ad.requestEnded = true;
-    };
-
-    WGPURequestAdapterCallbackInfo callbackInfo {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback = onAdapterRequestEnded,
-      .userdata1 = &adapterData,
-      .userdata2 = nullptr
-    };
-    wgpuInstanceRequestAdapter(ctx.instance, &adapterOpts, callbackInfo);
-
-    while (!adapterData.requestEnded) {
-      processEvents(ctx.instance);
-    }
-    ctx.adapter = adapterData.adapter;
-    ctx.adapterStatus = adapterData.status;
+  try {
+    auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts);
+    // Pump events until the adapter future is ready.
+    ctx.adapter = waitForFuture(ctx.instance, adapterFuture);
+    ctx.adapterStatus = WGPURequestAdapterStatus_Success;
+  } catch (const std::exception &ex) {
+    check(false, ex.what(), __FILE__, __LINE__);
   }
 
+  // Request the device asynchronously.
LOG(kDefLog, kTrace, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - WGPURequestDeviceStatus status; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, - WGPUStringView message, - void *pUserData, void *) { - auto &dd = *reinterpret_cast(pUserData); - dd.status = status; - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %p", - static_cast(device)); - dd.device = device; - dd.requestEnded= true; - }; - - WGPURequestDeviceCallbackInfo deviceCallbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1= &devData, - .userdata2= nullptr - }; - wgpuAdapterRequestDevice(ctx.adapter, &devDescriptor, deviceCallbackInfo); - - LOG(kDefLog, kTrace, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(ctx.instance); - } + try { + auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + // Pump events until the device future is ready. + ctx.device = waitForFuture(ctx.instance, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; LOG(kDefLog, kTrace, "Device request ended"); - ctx.device = devData.device; - ctx.deviceStatus = devData.status; - - // If the device was created, set up logging and fetch the queue - if (devData.status == WGPURequestDeviceStatus_Success) { - #ifndef __EMSCRIPTEN__ - WGPULoggingCallbackInfo loggingCallbackInfo { + // If the device was created, set up logging and fetch the queue. +#ifndef __EMSCRIPTEN__ + WGPULoggingCallbackInfo loggingCallbackInfo{ .nextInChain = nullptr, .callback = - [](WGPULoggingType type, WGPUStringView message, - void *, void *) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, + [](WGPULoggingType type, WGPUStringView message, void *, void *) { + LOG(kDefLog, kError, "Device logging callback: %.*s", + static_cast(message.length), message.data); + if (type == WGPULoggingType_Error) { + throw std::runtime_error("Device error logged."); + } + }, .userdata1 = nullptr, - .userdata2 = nullptr - }; - wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); - #endif - ctx.queue = wgpuDeviceGetQueue(ctx.device); - } + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); +#endif + ctx.queue = wgpuDeviceGetQueue(ctx.device); + } catch (const std::exception &ex) { + check(false, ex.what(), __FILE__, __LINE__); } return std::move(ctx); } - #ifdef USE_DAWN_API /** * @brief Factory function to create a GPU context, which aggregates WebGPU API @@ -1066,11 +1151,76 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, } #endif -inline void wait(Context &ctx, std::future &future) { - while (future.wait_for(std::chrono::seconds(0)) != - std::future_status::ready) { - processEvents(ctx.instance); - } +/** + * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer is completed. 
+/**
+ * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping.
+ *
+ * This callback is triggered when the GPU buffer mapping for a readback buffer is completed.
+ * It verifies that the mapping operation was successful, retrieves the mapped memory,
+ * copies the data from the GPU buffer to a CPU memory region, unmaps the buffer,
+ * signals the completion by fulfilling the associated promise, and cleans up the allocated callback data.
+ *
+ * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success on success.
+ * @param message A string view containing additional information about the mapping operation.
+ * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the GPU buffer,
+ *        buffer size, destination CPU memory pointer, and a promise for signaling completion.
+ * @param userdata2 Unused.
+ */
+inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message,
+                              void *userdata1, void * /*userdata2*/) {
+  CallbackData *cbData = reinterpret_cast<CallbackData *>(userdata1);
+  // Check that mapping succeeded.
+  check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__,
+        __LINE__);
+
+  // Get the mapped memory.
+  const void *mappedData =
+      wgpuBufferGetConstMappedRange(cbData->buffer, 0, cbData->bufferSize);
+  check(mappedData, "Get mapped range", __FILE__, __LINE__);
+
+  // Copy the data from the mapped GPU buffer to the CPU memory.
+  memcpy(cbData->output, mappedData, cbData->bufferSize);
+
+  // Unmap the buffer.
+  wgpuBufferUnmap(cbData->buffer);
+
+  // Signal that the copy has completed.
+  // Ensure you use the arrow operator on the shared_ptr to call set_value().
+  cbData->promise->set_value();
+
+  // Clean up the dynamically allocated callback data.
+  delete cbData;
+}
+
+/**
+ * @brief Callback function invoked when the GPU queue’s submitted work is complete.
+ *
+ * This callback is registered with the GPU queue after submitting work. When invoked,
+ * it verifies that all queued work completed successfully, and then sets up the buffer
+ * mapping callback to initiate the asynchronous mapping of a readback buffer. The readback
+ * buffer is mapped to access the processed data on the CPU.
+ *
+ * @param status The status of the completed work. Expected to be WGPUQueueWorkDoneStatus_Success on success.
+ * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the readback buffer,
+ *        buffer size, destination CPU memory pointer, and a promise to signal completion.
+ * @param userdata2 Unused.
+ */
+inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
+                                  void *userdata1, void * /*userdata2*/) {
+  CallbackData *cbData = reinterpret_cast<CallbackData *>(userdata1);
+  // Ensure the queue work finished successfully.
+  check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__,
+        __LINE__);
+
+  // Set up the buffer mapping callback information.
+  WGPUBufferMapCallbackInfo mapCallbackInfo;
+  mapCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+  mapCallbackInfo.callback = bufferMapCallback;
+  mapCallbackInfo.userdata1 = cbData;
+  mapCallbackInfo.userdata2 = nullptr;
+
+  // Begin the asynchronous mapping of the readback buffer.
+  wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize,
+                     mapCallbackInfo);
 }
 
 /**
@@ -1085,45 +1235,35 @@ inline void wait(Context &ctx, std::future<void> &future) {
  *
  * @code
 * toCPU(ctx, tensor, data, bufferSize);
 * @endcode
 */
-inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
-                  CopyData &op) {
+inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
+                               size_t bufferSize, CopyData &op) {
+  // Submit the command buffer and release it.
   wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
   wgpuCommandBufferRelease(op.commandBuffer);
-  CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise,
-                               &op.future};
 
-  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback =
-          [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) {
-            check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
-                  __FILE__, __LINE__);
-            const auto *data = static_cast<CallbackData *>(userdata1);
-            WGPUBufferMapCallbackInfo mapCallbackInfo = {
-                .mode = WGPUCallbackMode_AllowSpontaneous,
-                .callback =
-                    [](WGPUMapAsyncStatus status, WGPUStringView message,
-                       void *userdata1, void *userdata2) {
-                      const auto *data = static_cast<CallbackData *>(userdata1);
-                      check(status == WGPUMapAsyncStatus_Success,
-                            "Map readbackBuffer", __FILE__, __LINE__);
-                      const void *mappedData = wgpuBufferGetConstMappedRange(
-                          data->buffer, /*offset=*/0, data->bufferSize);
-                      check(mappedData, "Get mapped range", __FILE__, __LINE__);
-                      memcpy(data->output, mappedData, data->bufferSize);
-                      wgpuBufferUnmap(data->buffer);
-                      data->promise->set_value();
-                    },
-                .userdata1 = const_cast<CallbackData *>(data),
-                .userdata2 = nullptr};
-            wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0,
-                               data->bufferSize, mapCallbackInfo);
-          },
-      .userdata1 = &callbackData,
-      .userdata2 = nullptr};
+  // Create a promise and get its future.
+  auto promise = std::make_shared<std::promise<void>>();
+
+  // Allocate callback data so it remains valid until the async
+  // chain finishes.
+  CallbackData *cbData = new CallbackData{
+      op.readbackBuffer, // The GPU buffer to be read back.
+      bufferSize,
+      data,   // CPU memory destination.
+      promise // The promise to be signaled.
+  };
+
+  // Set up the work-done callback to initiate the buffer mapping.
+  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo;
+  workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+  workDoneCallbackInfo.callback = queueWorkDoneCallback;
+  workDoneCallbackInfo.userdata1 = cbData; // Pass the callback data.
+  workDoneCallbackInfo.userdata2 = nullptr;
+
+  // Begin the asynchronous chain by registering the queue work-done callback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
-  wait(ctx, op.future);
+  return promise->get_future();
 }
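+// Usage sketch for the CopyData overload above (the caller prepares `op`
+// with a readback buffer and a recorded command buffer, as the convenience
+// overload below does):
+//   std::future<void> copyFuture = toCPU(ctx, tensor, data, bufferSize, op);
+//   waitForFuture(ctx.instance, copyFuture);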
 
 /**
@@ -1141,31 +1281,59 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
  * @param[in] bufferSize Size of the data buffer in bytes
  * @param[out] data Pointer to the CPU memory to copy the data to
  */
-inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
-  CopyData op;
-  op.future = op.promise.get_future();
-  {
-    WGPUBufferDescriptor readbackBufferDescriptor = {
-        .label = {.data = nullptr, .length = 0},
-        .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
-        .size = bufferSize,
-    };
-    op.readbackBuffer =
-        wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
-  }
-  {
-    WGPUCommandEncoder commandEncoder;
-    commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
-    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
-                                         op.readbackBuffer, 0, bufferSize);
-    op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
-    wgpuCommandEncoderRelease(commandEncoder);
-    check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__);
-  }
-  toCPU(ctx, tensor, data, bufferSize, op);
-  if (op.readbackBuffer) {
-    wgpuBufferRelease(op.readbackBuffer);
-  }
+inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
+                               size_t bufferSize) {
+  // Create a promise that will later be satisfied when the async copy
+  // completes.
+  auto promise = std::make_shared<std::promise<void>>();
+
+  // Create a readback buffer that will be used for copying and mapping.
+  WGPUBufferDescriptor readbackBufferDescriptor = {
+      .label = {.data = nullptr, .length = 0},
+      .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
+      .size = bufferSize,
+  };
+  WGPUBuffer readbackBuffer =
+      wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
+
+  // Create a command encoder and record a copy from the tensor GPU buffer
+  WGPUCommandEncoder commandEncoder =
+      wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
+  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
+                                       readbackBuffer, 0, bufferSize);
+  // Finish recording by creating a command buffer and release the encoder.
+  WGPUCommandBuffer commandBuffer =
+      wgpuCommandEncoderFinish(commandEncoder, nullptr);
+  wgpuCommandEncoderRelease(commandEncoder);
+  check(commandBuffer, "Create command buffer", __FILE__, __LINE__);
+
+  // Submit the work to the queue and release the command buffer immediately.
+  wgpuQueueSubmit(ctx.queue, 1, &commandBuffer);
+  wgpuCommandBufferRelease(commandBuffer);
+
+  // Allocate callback data
+  CallbackData *cbData = new CallbackData{
+      readbackBuffer, // The readback buffer to map.
+      bufferSize,     // The size of the copy.
+      data,           // CPU memory destination.
+      promise         // The promise to signal when done.
+  };
+
+  // Set up the work-done callback. When the queue’s submitted work is
+  // completed, it is routed to queueWorkDoneCallback which then starts the
+  // asynchronous map.
+  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = queueWorkDoneCallback,
+      .userdata1 = cbData,
+      .userdata2 = nullptr,
+  };
+
+  // Register the callback. The async chain continues inside
+  // queueWorkDoneCallback.
+  wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
+
+  return promise->get_future();
 }
 
 /**
@@ -1176,76 +1344,74 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize)
  * @param[out] data Array of floats to copy the data to
  *
  * @code
- * toCPU(ctx, tensor, data);
+ * std::future<void> toCPUFuture = toCPU(ctx, tensor, data);
+ * waitForFuture(ctx.instance, toCPUFuture);
  * @endcode
  */
 template <size_t N>
-void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
-  toCPU(ctx, tensor, data.data(), sizeof(data));
+inline std::future<void> toCPU(Context &ctx, Tensor &tensor,
+                               std::array<float, N> &data) {
+  return toCPU(ctx, tensor, data.data(), sizeof(data));
 }
 
-inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
+inline std::future<void> toCPU(Context &ctx, WGPUBuffer buffer, void *data,
+                               size_t size) {
+  // The size (in bytes) for the copy.
   uint64_t bufferSize = size;
+
+  // Create an operation structure (here we reuse CopyData solely for its
+  // members that we need to create a readback buffer and command buffer).
   CopyData op;
-  op.future = op.promise.get_future();
+
+  // Create the promise that will be fulfilled once the copy is done.
+  auto promise = std::make_shared<std::promise<void>>();
+
+  // Create a readback buffer that we can map for reading.
   {
     WGPUBufferDescriptor readbackBufferDescriptor = {
-        .label = {.data = nullptr, .length = 0},
+        .label = {.data = nullptr, .length = 0},
        .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
         .size = bufferSize,
     };
     op.readbackBuffer =
        wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
   }
+
+  // Create a command encoder which copies from the provided buffer to the
+  // readback buffer.
   {
-    WGPUCommandEncoder commandEncoder;
-    commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
+    WGPUCommandEncoder commandEncoder =
+        wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
     wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0,
                                          op.readbackBuffer, 0, bufferSize);
     op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
     wgpuCommandEncoderRelease(commandEncoder);
     check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__);
   }
+
+  // Submit the command and release the command buffer.
   wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
   wgpuCommandBufferRelease(op.commandBuffer);
 
-  CallbackData callbackData = {op.readbackBuffer, static_cast<size_t>(bufferSize), data, &op.promise,
-                               &op.future};
+  // Allocate callback data
+  CallbackData *cbData = new CallbackData{
+      op.readbackBuffer,               // The readback buffer created above.
+      static_cast<size_t>(bufferSize), // Size of the copy.
+      data,                            // Destination CPU memory.
+      promise                          // Our promise to satisfy when done.
+  };
+
+  // Set up the queue work-done callback info.
   WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
       .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback =
-          [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) {
-            check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
-                  __FILE__, __LINE__);
-            const auto *data = static_cast<CallbackData *>(userdata1);
-            WGPUBufferMapCallbackInfo mapCallbackInfo = {
-                .mode = WGPUCallbackMode_AllowSpontaneous,
-                .callback =
-                    [](WGPUMapAsyncStatus status, WGPUStringView message,
-                       void *userdata1, void *userdata2) {
-                      const auto *data = static_cast<CallbackData *>(userdata1);
-                      check(status == WGPUMapAsyncStatus_Success,
-                            "Map readbackBuffer", __FILE__, __LINE__);
-                      const void *mappedData = wgpuBufferGetConstMappedRange(
-                          data->buffer, /*offset=*/0, data->bufferSize);
-                      check(mappedData, "Get mapped range", __FILE__, __LINE__);
-                      memcpy(data->output, mappedData, data->bufferSize);
-                      wgpuBufferUnmap(data->buffer);
-                      data->promise->set_value();
-                    },
-                .userdata1 = const_cast<CallbackData *>(data),
-                .userdata2 = nullptr};
-            wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0,
-                               data->bufferSize, mapCallbackInfo);
-          },
-      .userdata1 = &callbackData,
+      .callback = queueWorkDoneCallback, // Our free function callback.
+      .userdata1 = cbData,               // Pass the callback data pointer.
       .userdata2 = nullptr};
+
+  // Start the asynchronous chain by registering the work-done callback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
-  wait(ctx, op.future);
 
-  if (op.readbackBuffer) {
-    wgpuBufferRelease(op.readbackBuffer);
-  }
+  return promise->get_future();
 }
 
 /**
@@ -1376,6 +1542,19 @@ inline Shape cdiv(Shape total, Shape group) {
   return result;
 }
 
+/**
+ * @brief Packages the shader compilation information along with a promise for asynchronous signaling.
+ *
+ * This structure holds a pointer to a CompilationInfo instance that collects
+ * details such as status, messages, line numbers, and positions from the shader compilation.
+ * It also contains a shared pointer to a std::promise<void> which is used to signal the completion
+ * of the asynchronous shader compilation process.
+ */
+struct CompData {
+  CompilationInfo *compInfo;
+  std::shared_ptr<std::promise<void>> compPromise;
+};
+
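+// CompData is handed to wgpuShaderModuleGetCompilationInfo as userdata; the
+// compilation callback registered inside createKernel (below) fills compInfo
+// and then signals compPromise.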
 /**
  * @brief A factory function to create a kernel on the GPU. The kernel is
  * created with the given WGSL code, input tensors, output tensor, and
@@ -1399,34 +1578,38 @@ inline Shape cdiv(Shape total, Shape group) {
  * @return Kernel instance representing the created kernel
  *
  * @code
- * Kernel kernel = createKernel(ctx, code, dataBindings, numInputs,
+ * std::future<Kernel> kernelFuture = createKernel(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize);
+ * Kernel kernel = waitForFuture(ctx.instance, kernelFuture);
  * @endcode
- * output, nThreads, params, paramsSize);
+
  */
-inline Kernel createKernel(Context& ctx, const KernelCode &code,
-                           const Tensor *dataBindings, size_t numTensors,
-                           const size_t *viewOffsets,
-                           const Shape &totalWorkgroups,
-                           const void *params = nullptr, size_t paramsSize = 0,
-                           CompilationInfo *compilationInfo = nullptr,
-                           const char *cacheKey = nullptr) {
+inline std::future<Kernel>
+createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings,
+             size_t numTensors, const size_t *viewOffsets,
+             const Shape &totalWorkgroups, const void *params = nullptr,
+             size_t paramsSize = 0, CompilationInfo *compilationInfo = nullptr,
+             const char *cacheKey = nullptr) {
   // Create a cache key by the pointer values of the data bindings and the
   // kernel code
   if (cacheKey != nullptr &&
      ctx.kernelPool.data.find(cacheKey) != ctx.kernelPool.data.end()) {
-    LOG(kDefLog, kInfo, "Kernel cache hit");
-    return ctx.kernelPool.data[cacheKey];
+    std::promise<Kernel> ready;
+    ready.set_value(ctx.kernelPool.data[cacheKey]);
+    return ready.get_future();
   }
 
+  // Create an outer promise for the new kernel.
+  std::promise<Kernel> outerPromise;
+  std::future<Kernel> outerFuture = outerPromise.get_future();
+
   assert(totalWorkgroups.rank == 3);
   WGPUDevice device = ctx.device;
   WGPUQueue queue = ctx.queue;
   Kernel op(new RawKernel());
 
-  // paramIndex is the index into bgLayoutEntries for the parameters buffer If
   // there are no parameters for the kernel, paramsSize == 0 and paramIndex is
   // effectively undefined (== -1)
-  size_t paramIndex = -1;
+  size_t paramIndex = static_cast<size_t>(-1); // Note: paramIndex is undefined unless paramsSize > 0
   size_t numBindings = numTensors;
   if (paramsSize > 0) {
     numBindings++; // parameters buffer is the last binding group element so
                    // add one to the binding count
                    // op.buffers, op.bufferSizes and
                    // bgLayoutEntries
   }
+
   op->buffers = std::make_unique<WGPUBuffer[]>(numBindings);
   op->bufferSizes = std::make_unique<size_t[]>(numBindings);
   op->numBindings = numBindings;
-  std::vector<WGPUBindGroupLayoutEntry> bgLayoutEntries(numBindings);
 
+  // Create layout entries for input buffers
+  std::vector<WGPUBindGroupLayoutEntry> bgLayoutEntries(numBindings);
   for (size_t i = 0; i < numTensors; ++i) {
     bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{
         .binding = static_cast<uint32_t>(i),
         .visibility = WGPUShaderStage_Compute,
        .buffer =
            WGPUBufferBindingLayout{
                .type = WGPUBufferBindingType_Storage,
                .minBindingSize = dataBindings[i].data.size,
            },
     };
   }
   if (paramsSize > 0) {
-    LOG(kDefLog, kInfo, "Create layout entry for the params buffer");
-    // Create layout entry for the params buffer
     bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{
         .binding = static_cast<uint32_t>(paramIndex),
         .visibility = WGPUShaderStage_Compute,
        .buffer =
            WGPUBufferBindingLayout{
                .type = WGPUBufferBindingType_Uniform,
                .minBindingSize = paramsSize,
            },
     };
   }
   WGPUBindGroupLayoutDescriptor bgLayoutDesc = {
       .entryCount = static_cast<size_t>(bgLayoutEntries.size()),
-      .entries = bgLayoutEntries.data(),
-  };
+      .entries = bgLayoutEntries.data()};
   WGPUBindGroupLayout bgLayout =
       wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc);
+
+  // Assign buffers from dataBindings.
   for (size_t i = 0; i < numTensors; ++i) {
     op->buffers[i] = dataBindings[i].data.buffer;
     op->bufferSizes[i] = dataBindings[i].data.size;
   }
   // Create a buffer for the Params struct
   if (paramsSize > 0) {
     WGPUBufferDescriptor paramsBufferDesc = {
-        .label = {.data = nullptr, .length = 0},
+        .label = {.data = nullptr, .length = 0},
         .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
         .size = paramsSize,
         .mappedAtCreation = false,
     };
    op->buffers[paramIndex] = wgpuDeviceCreateBuffer(device, &paramsBufferDesc);
    op->bufferSizes[paramIndex] = paramsSize;
    wgpuQueueWriteBuffer(queue, op->buffers[paramIndex], 0, params, paramsSize);
    LOG(kDefLog, kTrace, "Params buffer written");
   } else {
     LOG(kDefLog, kTrace, "No params buffer needed");
   }
+
+  // Build bind group entries and the bind group.
   std::vector<WGPUBindGroupEntry> bindGroupEntries(numBindings);
   for (size_t i = 0; i < numTensors; ++i) {
     bindGroupEntries[i] = WGPUBindGroupEntry{
         .binding = static_cast<uint32_t>(i),
         .buffer = op->buffers[i],
         .offset = viewOffsets[i],
         .size = op->bufferSizes[i],
     };
   }
   if (paramsSize > 0) {
     bindGroupEntries[paramIndex] = WGPUBindGroupEntry{
         .binding = static_cast<uint32_t>(paramIndex),
         .buffer = op->buffers[paramIndex],
         .offset = 0,
         .size = paramsSize,
     };
   }
   WGPUBindGroupDescriptor bindGroupDesc = {
       .layout = bgLayout,
       .entryCount = static_cast<size_t>(numBindings),
       .entries = bindGroupEntries.data(),
   };
   op->bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc);
 
+  // Create pipeline layout.
   WGPUPipelineLayoutDescriptor pipelineLayoutDesc = {
       .bindGroupLayoutCount = 1,
       .bindGroupLayouts = &bgLayout,
   };
   WGPUPipelineLayout pipelineLayout =
       wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc);
 
+  // Prepare the WGSL source and shader module descriptor.
   WGPUShaderSourceWGSL wgslDesc = {
       .chain = {.sType = WGPUSType_ShaderSourceWGSL},
       .code = {.data = code.data.c_str(), .length = code.data.length()}};
-
   WGPUShaderModuleDescriptor shaderModuleDesc = {};
   shaderModuleDesc.nextInChain = &wgslDesc.chain;
   shaderModuleDesc.label = {code.label.c_str(), code.label.length()};
 
-  WGPUComputePipelineDescriptor computePipelineDesc = {};
-  computePipelineDesc.layout = pipelineLayout;
-  computePipelineDesc.compute.module =
+  // Create the shader module.
+  WGPUShaderModule shaderModule =
       wgpuDeviceCreateShaderModule(device, &shaderModuleDesc);
 
+  // If compilation info is requested, register the callback immediately.
+  if (compilationInfo) {
+    auto compPromise = std::make_shared<std::promise<void>>();
+    std::future<void> compFuture = compPromise->get_future();
+    // Allocate helper data to pass to the callback.
+    auto *compData = new CompData{compilationInfo, compPromise};
+
+    auto compilationCallback = [](WGPUCompilationInfoRequestStatus status,
+                                  WGPUCompilationInfo const *info,
+                                  void *userdata1, void * /*userdata2*/) {
+      CompData *cd = reinterpret_cast<CompData *>(userdata1);
+      if (info && cd->compInfo) {
+        cd->compInfo->status = status;
+        for (uint32_t i = 0; i < info->messageCount; ++i) {
+          cd->compInfo->messages.push_back(
+              std::string(info->messages[i].message.data,
+                          info->messages[i].message.length));
+          cd->compInfo->lineNums.push_back(info->messages[i].lineNum);
+          cd->compInfo->linePos.push_back(info->messages[i].linePos);
+        }
+        cd->compInfo->finished = true;
+      }
+      cd->compPromise->set_value();
+      delete cd;
+    };
+
+    WGPUCompilationInfoCallbackInfo compilationCallbackInfo = {};
+    compilationCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+    compilationCallbackInfo.callback = compilationCallback;
+    compilationCallbackInfo.userdata1 = compData;
+    compilationCallbackInfo.userdata2 = nullptr;
+
+    // Register callback and then wait for the result.
+    wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo);
+    waitForFuture(ctx.instance, compFuture);
+  }
+
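+  // Note: the compilation-info future is awaited synchronously above so that
+  // any WGSL diagnostics are complete before pipeline creation proceeds.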
+  // Now create the compute pipeline using the shader module.
+  WGPUComputePipelineDescriptor computePipelineDesc = {};
+  computePipelineDesc.layout = pipelineLayout;
+  computePipelineDesc.compute.module = shaderModule;
   computePipelineDesc.compute.entryPoint = {code.entryPoint.c_str(),
                                             code.entryPoint.length()};
   computePipelineDesc.label = {code.label.c_str(), code.label.length()};
-
   op->computePipeline =
       wgpuDeviceCreateComputePipeline(device, &computePipelineDesc);
+
   op->totalWorkgroups = {totalWorkgroups[0], totalWorkgroups[1],
                          totalWorkgroups[2]};
+
   resetCommandBuffer(device, op);
   if (cacheKey != nullptr)
     ctx.kernelPool.data[cacheKey] = op;
 
-  auto compilationInfoCallback = [](WGPUCompilationInfoRequestStatus status,
-                                    WGPUCompilationInfo const *compilationInfo,
-                                    void *userdata1, void *userdata2) {
-    CompilationInfo *result = static_cast<CompilationInfo *>(userdata1);
-    if (compilationInfo && result) {
-      result->status = status;
-      for (uint32_t i = 0; i < compilationInfo->messageCount; ++i) {
-        printf("Message %d: %.*s\n", i,
-               static_cast<int>(compilationInfo->messages[i].message.length),
-               compilationInfo->messages[i].message.data);
-        result->messages.push_back(
-            std::string(compilationInfo->messages[i].message.data,
-                        compilationInfo->messages[i].message.length));
-        result->lineNums.push_back(compilationInfo->messages[i].lineNum);
-        result->linePos.push_back(compilationInfo->messages[i].linePos);
-      }
-      result->finished = true;
-    } else {
-      LOG(kDefLog, kTrace, "No compilation info or result");
-    }
-  };
-
-  WGPUCompilationInfoCallbackInfo compilationCallbackInfo = {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback = compilationInfoCallback,
-      .userdata1 = static_cast<void *>(compilationInfo),
-      .userdata2 = nullptr};
 
-  while (compilationInfo && !compilationInfo->finished) {
-    processEvents(ctx.instance);
-  }
-  return op;
-}
+  outerPromise.set_value(op);
+  return outerFuture;
+}
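+
+// Dispatch chain: dispatchKernel() below heap-allocates a std::promise<void>
+// and hands it to this callback, which fulfills the promise and frees the
+// allocation.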
+
+/**
+ * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done.
+ *
+ * This callback is invoked when the GPU queue signals the completion of the submitted
+ * workload for a kernel dispatch. It receives the work-done status and a userdata pointer,
+ * which is expected to be a heap‑allocated pointer to a std::promise<void>.
+ *
+ * On success, the promise is fulfilled by calling set_value(). Otherwise, it is set with an exception.
+ * After setting the promise state, the allocated memory for the promise is freed.
+ *
+ * @param status The status of the work done. Expected to be WGPUQueueWorkDoneStatus_Success on success.
+ * @param userdata1 A heap allocated pointer to std::promise<void> which is set when the work is done.
+ * @param userdata2 Unused.
+ */
+inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status,
+                                   void *userdata1, void * /*userdata2*/) {
+  // Cast the userdata pointer back to our heap‑allocated promise.
+  auto *p = reinterpret_cast<std::promise<void> *>(userdata1);
+  if (status == WGPUQueueWorkDoneStatus_Success) {
+    p->set_value();
+  } else {
+    p->set_exception(std::make_exception_ptr(
+        std::runtime_error("Queue work did not complete successfully.")));
+  }
+  delete p; // free the heap allocation
+}
 
 /**
@@ -1599,17 +1824,17 @@ inline Kernel createKernel(Context &ctx, const KernelCode &code,
  * @return Kernel instance representing the created kernel
  *
  * @code
- * Kernel kernel = createKernel(ctx, code, tensorData, output,
+ * std::future<Kernel> kernelFuture = createKernel(ctx, code, tensorData, output, totalWorkgroups, params);
+ * Kernel kernel = waitForFuture(ctx.instance, kernelFuture);
  * @endcode
- * totalWorkgroups, params);
  */
 template <typename ParamsType = NoParam, size_t numInputs>
-Kernel createKernel(Context &ctx, const KernelCode &code,
-                    const Bindings<numInputs> &dataBindings,
-                    const Shape &totalWorkgroups,
-                    const ParamsType &params = ParamsType{},
-                    CompilationInfo *compilationInfo = nullptr,
-                    const char *cacheKey = nullptr) {
+std::future<Kernel> createKernel(Context &ctx, const KernelCode &code,
+                                 const Bindings<numInputs> &dataBindings,
+                                 const Shape &totalWorkgroups,
+                                 const ParamsType &params = ParamsType{},
+                                 CompilationInfo *compilationInfo = nullptr,
+                                 const char *cacheKey = nullptr) {
   if constexpr (!IsNoParam<ParamsType>) {
     return createKernel(ctx, code, dataBindings.data.data(), numInputs,
                         dataBindings.viewOffsets.data(), totalWorkgroups,
@@ -1637,30 +1862,37 @@ Kernel createKernel(Context &ctx, const KernelCode &code,
  * @param[in] promise Promise to set when the kernel has finished executing
  *
  * @code
- * dispatchKernel(ctx, kernel);
+ * std::future<void> dispatchFuture = dispatchKernel(ctx, kernel);
+ * waitForFuture(ctx.instance, dispatchFuture);
  * @endcode
  */
-inline void dispatchKernel(Context &ctx, Kernel &kernel,
-                           std::promise<void> &promise) {
+inline std::future<void> dispatchKernel(Context &ctx, Kernel &kernel) {
+  // If the kernel was used before, reset the command buffer.
   if (kernel->used) {
     resetCommandBuffer(ctx.device, kernel);
   }
+
+  // Submit the command buffer and release it.
   wgpuQueueSubmit(ctx.queue, 1, &kernel->commandBuffer);
   wgpuCommandBufferRelease(kernel->commandBuffer);
   kernel->used = true;
 
-  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback =
-          [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) {
-            check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
-                  __FILE__, __LINE__);
-            auto *promise = static_cast<std::promise<void> *>(userdata1);
-            promise->set_value();
-          },
-      .userdata1 = &promise,
-      .userdata2 = nullptr};
+  // Allocate a promise on the heap so it remains valid beyond this function’s
+  // scope.
+  std::promise<void> *promise = new std::promise<void>();
+  std::future<void> future = promise->get_future();
+
+  // Set up the callback info.
+  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {};
+  workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+  workDoneCallbackInfo.callback = dispatchKernelCallback;
+  workDoneCallbackInfo.userdata1 = reinterpret_cast<void *>(promise);
+  workDoneCallbackInfo.userdata2 = nullptr;
+
+  // Register the work-done callback with the queue; the async chain completes
+  // in dispatchKernelCallback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
+
+  return future;
 }
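+
+// End-to-end async usage (mirrors examples/hello_world/run.cpp; `wg` here
+// stands for the workgroup Shape):
+//   std::future<Kernel> kf = createKernel(ctx, {kGelu, 256, kf32},
+//                                         Bindings{input, output}, wg);
+//   Kernel k = waitForFuture(ctx.instance, kf);
+//   std::future<void> df = dispatchKernel(ctx, k);
+//   waitForFuture(ctx.instance, df);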
 
 } // namespace gpu
diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp
index fe5aab7..75d9dc4 100644
--- a/numeric_types/half.cpp
+++ b/numeric_types/half.cpp
@@ -189,7 +189,8 @@ void testContainers() {
   std::array<half, 8> h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0};
   Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data());
   std::array<half, 8> h2;
-  toCPU(ctx, devH, h2.data(), sizeof(h2));
+  std::future<void> toCPUFuture = toCPU(ctx, devH, h2.data(), sizeof(h2));
+  waitForFuture(ctx.instance, toCPUFuture);
   for (int i = 0; i < 8; ++i) {
     printResult(h[i].data == h2[i].data, "Container round trip",
                 static_cast<float>(h[i]), static_cast<float>(h2[i]));
@@ -228,13 +229,13 @@ fn main(
   }
   Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data());
   Tensor output = createTensor(ctx, Shape{N}, kf16);
-  std::promise<void> promise;
-  std::future<void> future = promise.get_future();
-  Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
+  std::future<Kernel> kernelFuture = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
                            {cdiv(N, 256), 1, 1});
-  dispatchKernel(ctx, op, promise);
-  wait(ctx, future);
-  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+  Kernel op = waitForFuture(ctx.instance, kernelFuture);
+  std::future<void> dispatchFuture = dispatchKernel(ctx, op);
+  waitForFuture(ctx.instance, dispatchFuture);
+  std::future<void> toCPUFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+  waitForFuture(ctx.instance, toCPUFuture);
   for (int i = 0; i < 12; ++i) {
     printf("  gelu(%.2f) = %.2f\n", static_cast<float>(inputArr[i]),
            static_cast<float>(outputArr[i]));

From 14e7ab59a67329573bc69a7dfce5d431ba8777b3 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Wed, 19 Feb 2025 18:06:26 -0600
Subject: [PATCH 02/17] use async context waitForContext()

---
 cmake/example.cmake          |   4 +-
 examples/hello_world/run.cpp |   4 +-
 gpu.hpp                      | 279 ++++++++++++++++++++++-------------
 numeric_types/half.cpp       |   5 +-
 4 files changed, 180 insertions(+), 112 deletions(-)

diff --git a/cmake/example.cmake b/cmake/example.cmake
index 7cf1f8d..5953876 100644
--- a/cmake/example.cmake
+++ b/cmake/example.cmake
@@ -45,14 +45,16 @@ if(EMSCRIPTEN)
     # Set Emscripten-specific link flags that enable WASM output and expose certain symbols.
     # Needed to use updated version, emdawnwebgpu
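    # ASYNCIFY is required so the wasm can suspend inside waitForFuture's
    # emscripten_sleep() polling loop; ASYNCIFY_DEBUG only adds diagnostic
    # logging and can be dropped for release builds.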
     set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\
+        -O3 \
         -sUSE_WEBGPU=0 \
         -sWASM=1 \
         -DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \
         -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \
         -sEXPORTED_RUNTIME_METHODS=ccall \
         -sUSE_GLFW=3 \
-        -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \
+        -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \
         -sASYNCIFY \
+        -sASYNCIFY_DEBUG \
         --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \
         --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \
         --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \
diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp
index 06970a7..c9f22c7 100644
--- a/examples/hello_world/run.cpp
+++ b/examples/hello_world/run.cpp
@@ -28,7 +28,7 @@ int main(int argc, char **argv) {
   printf("--------------\n\n");
 
   // std::unique_ptr<Context> ctx = createContext();
-  Context ctx = createContext();
+  Context ctx = waitForContext();
   static constexpr size_t N = 10000;
   std::array<float, N> inputArr, outputArr;
   for (int i = 0; i < N; ++i) {
@@ -36,8 +36,6 @@ int main(int argc, char **argv) {
   }
   Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
   Tensor output = createTensor(ctx, Shape{N}, kf32);
-  std::promise<void> promise;
-  std::future<void> future = promise.get_future();
   std::future<Kernel> kernelFuture = createKernel(ctx, {kGelu, 256, kf32},
                            Bindings{input, output},
                            {cdiv(N, 256), 1, 1});
diff --git a/gpu.hpp b/gpu.hpp
index 052c674..0119108 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -793,10 +793,11 @@ inline void check(bool condition, const char *message,
 /**
  * @brief Pumps events until the provided future is ready.
  *
- * This helper template function continuously checks the status of the provided std::future<T>
- * until it becomes ready. On Emscripten builds, it yields control to the JavaScript event loop
- * using emscripten_sleep to allow asynchronous callbacks to execute. On other platforms, it
- * processes events from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future
+ * This helper template function continuously checks the status of the provided
+ * std::future<T> until it becomes ready. On Emscripten builds, it yields
+ * control to the JavaScript event loop using emscripten_sleep to allow
+ * asynchronous callbacks to execute. On other platforms, it processes events
+ * from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future
  * is ready, its value is returned.
  *
  * @tparam T The type of the value contained in the future.
@@ -805,8 +806,8 @@ inline void check(bool condition, const char *message,
  * @return T The value retrieved from the ready future.
  *
  * @code
- * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter, devDescriptor);
- * WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter,
+ * devDescriptor); WGPUDevice device = waitForFuture(instance, deviceFuture);
  * @endcode
  */
 template <typename T>
@@ -831,17 +832,56 @@ T waitForFuture(WGPUInstance instance, std::future<T> &f) {
 
 // Context Callbacks & Helpers
 
 /**
- * @brief Adapter callback function invoked upon completion of an asynchronous WebGPU adapter request.
+ * @brief Waits for the provided std::future to become ready by polling its status.
  *
- * This callback is triggered when the request for a WebGPU adapter completes. It verifies whether
- * the adapter was successfully obtained. On failure, it logs an error message (in Emscripten builds)
- * and sets an exception on the associated promise. On success, it sets the value of the promise with
- * the obtained adapter. Finally, it frees the allocated memory for the promise pointer.
+ * This helper template function continuously checks the status of the provided std::future until it is ready.
+ * On Emscripten builds, it yields control to the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous behavior.
+ * On non-Emscripten platforms, it sleeps for a short duration (10 milliseconds) between checks.
+ * Once the future is ready, its value is returned.
+ *
+ * @tparam T The type of the value contained in the future.
+ * @param f The future to wait on.
+ * @return T The value retrieved from the ready future.
+ *
+ * @code
+ * std::future<Context> contextFuture = createContext();
+ * Context ctx = waitForContextFuture(contextFuture);
+ * @endcode
+ */
+template <typename T> T waitForContextFuture(std::future<T> &f) {
+  #ifdef __EMSCRIPTEN__
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    emscripten_sleep(1); // Yield back to the JS event loop.
+  }
+  return f.get();
+  #else
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  }
+  return f.get();
+  #endif
+  }
+
+/**
+ * @brief Adapter callback function invoked upon completion of an asynchronous
+ * WebGPU adapter request.
+ *
+ * This callback is triggered when the request for a WebGPU adapter completes.
+ * It verifies whether the adapter was successfully obtained. On failure, it
+ * logs an error message (in Emscripten builds) and sets an exception on the
+ * associated promise. On success, it sets the value of the promise with the
+ * obtained adapter. Finally, it frees the allocated memory for the promise
+ * pointer.
+ *
+ * @param status The status of the adapter request. Expected to be
+ * WGPURequestAdapterStatus_Success on success.
+ * @param adapter The WGPUAdapter obtained on a successful request.
+ * @param message A string view containing additional information about the
+ * adapter request.
+ * @param userdata1 A pointer to a heap-allocated
+ * std::shared_ptr<std::promise<WGPUAdapter>>.
  * @param userdata2 Unused.
  */
 inline void adapterCallback(WGPURequestAdapterStatus status,
@@ -864,17 +904,22 @@ inline void adapterCallback(WGPURequestAdapterStatus status,
 }
 
 /**
- * @brief Callback function invoked upon completion of an asynchronous WebGPU device request.
+ * @brief Callback function invoked upon completion of an asynchronous WebGPU
+ * device request.
  *
- * This callback is triggered when the request for a WebGPU device completes. It verifies that
- * the device was successfully created. On success, the callback sets the value of the associated
- * promise; otherwise, it sets an exception. After fulfilling the promise, it frees the allocated
- * memory for the promise pointer.
+ * This callback is triggered when the request for a WebGPU device completes. It
+ * verifies that the device was successfully created. On success, the callback
+ * sets the value of the associated promise; otherwise, it sets an exception.
+ * After fulfilling the promise, it frees the allocated memory for the promise
+ * pointer.
  *
- * @param status The status of the device request. Expected to be WGPURequestDeviceStatus_Success on success.
+ * @param status The status of the device request. Expected to be
+ * WGPURequestDeviceStatus_Success on success.
  * @param device The WGPUDevice obtained on successful request.
- * @param message A string view containing additional information about the device request.
- * @param userdata1 A pointer to a heap-allocated std::shared_ptr<std::promise<WGPUDevice>>.
+ * @param message A string view containing additional information about the
+ * device request.
+ * @param userdata1 A pointer to a heap-allocated
+ * std::shared_ptr<std::promise<WGPUDevice>>.
  * @param userdata2 Unused.
  */
 inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device,
@@ -897,13 +942,14 @@ inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device,
 /**
  * @brief Asynchronously requests a WebGPU adapter from the given instance.
  *
- * This helper function wraps the asynchronous call to request an adapter using the WebGPU API.
- * It sets up a promise and registers an adapter callback, returning a future that will eventually
- * hold the requested WGPUAdapter.
+ * This helper function wraps the asynchronous call to request an adapter using
+ * the WebGPU API. It sets up a promise and registers an adapter callback,
+ * returning a future that will eventually hold the requested WGPUAdapter.
  *
  * @param instance The WGPUInstance from which to request the adapter.
  * @param adapterOpts The options for requesting the adapter.
- * @return std::future<WGPUAdapter> A future that will eventually hold the created WGPUAdapter.
+ * @return std::future<WGPUAdapter> A future that will eventually hold the
+ * created WGPUAdapter.
  */
 inline std::future<WGPUAdapter>
 requestAdapterAsync(WGPUInstance instance,
@@ -923,13 +969,15 @@ requestAdapterAsync(WGPUInstance instance,
 /**
  * @brief Asynchronously requests a WebGPU device from a given adapter.
  *
- * This helper function wraps the asynchronous call to request a device using the WebGPU API.
- * It sets up a promise and registers a device callback, returning a future that will be fulfilled
- * once the device is available.
+ * This helper function wraps the asynchronous call to request a device using
+ * the WebGPU API. It sets up a promise and registers a device callback,
+ * returning a future that will be fulfilled once the device is available.
 *
  * @param adapter The WGPUAdapter to request the device from.
- * @param devDescriptor The descriptor specifying the characteristics of the requested device.
- * @return std::future<WGPUDevice> A future that will eventually hold the created WGPUDevice.
+ * @param devDescriptor The descriptor specifying the characteristics of the
+ * requested device.
+ * @return std::future<WGPUDevice> A future that will eventually hold the
+ * created WGPUDevice.
 */
 inline std::future<WGPUDevice>
 requestDeviceAsync(WGPUAdapter adapter,
@@ -964,60 +1012,62 @@ requestDeviceAsync(WGPUAdapter adapter,
  * @return Context instance representing the created GPU context
  *
  */
-inline Context createContext(const WGPUInstanceDescriptor &desc = {},
-                             const WGPURequestAdapterOptions &adapterOpts = {},
-                             const WGPUDeviceDescriptor &devDescriptor = {}) {
-  Context ctx; // Stack-allocated Context.
+inline std::future<Context>
+createContext(const WGPUInstanceDescriptor &desc = {},
+              const WGPURequestAdapterOptions &adapterOpts = {},
+              const WGPUDeviceDescriptor &devDescriptor = {}) {
 
-#ifdef __EMSCRIPTEN__
-  ctx.instance = wgpuCreateInstance(nullptr);
-#else
-  ctx.instance = wgpuCreateInstance(&desc);
-#endif
-  check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__);
+  auto promise = std::make_shared<std::promise<Context>>();
 
-  // Request the adapter asynchronously.
-  LOG(kDefLog, kTrace, "Requesting adapter");
+  // Create the context inline, pumping events until the async adapter and
+  // device requests resolve.
+
+  Context ctx;
+  ctx.instance = wgpuCreateInstance(&desc);
+  if (!ctx.instance) {
+    promise->set_exception(std::make_exception_ptr(
+        std::runtime_error("Failed to create WebGPU instance.")));
+    return promise->get_future();
+  }
   try {
     auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts);
-    // Pump events until the adapter future is ready.
     ctx.adapter = waitForFuture(ctx.instance, adapterFuture);
     ctx.adapterStatus = WGPURequestAdapterStatus_Success;
   } catch (const std::exception &ex) {
-    check(false, ex.what(), __FILE__, __LINE__);
+    promise->set_exception(std::make_exception_ptr(ex));
+    return promise->get_future();
   }
-
-  // Request the device asynchronously.
-  LOG(kDefLog, kTrace, "Requesting device");
   try {
     auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor);
-    // Pump events until the device future is ready.
     ctx.device = waitForFuture(ctx.instance, deviceFuture);
     ctx.deviceStatus = WGPURequestDeviceStatus_Success;
-    LOG(kDefLog, kTrace, "Device request ended");
-
-    // If the device was created, set up logging and fetch the queue.
-#ifndef __EMSCRIPTEN__
-    WGPULoggingCallbackInfo loggingCallbackInfo{
-        .nextInChain = nullptr,
-        .callback =
-            [](WGPULoggingType type, WGPUStringView message, void *, void *) {
-              LOG(kDefLog, kError, "Device logging callback: %.*s",
-                  static_cast<int>(message.length), message.data);
-              if (type == WGPULoggingType_Error) {
-                throw std::runtime_error("Device error logged.");
-              }
-            },
-        .userdata1 = nullptr,
-        .userdata2 = nullptr};
-    wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo);
-#endif
-    ctx.queue = wgpuDeviceGetQueue(ctx.device);
   } catch (const std::exception &ex) {
-    check(false, ex.what(), __FILE__, __LINE__);
+    promise->set_exception(std::make_exception_ptr(ex));
+    return promise->get_future();
   }
+  ctx.queue = wgpuDeviceGetQueue(ctx.device);
+  promise->set_value(std::move(ctx));
+
+  return promise->get_future();
+}
 
-  return std::move(ctx);
+/**
+ * @brief Synchronously waits for and returns the created GPU context.
+ *
+ * This function invokes the asynchronous createContext() factory function to create a GPU
+ * context, then waits for its completion using waitForContextFuture. The returned Context
+ * holds handles to the WebGPU instance, adapter, device, and queue, and is used for subsequent
+ * GPU operations.
+ *
+ * @return Context The fully initialized GPU context.
+ *
+ * @code
+ * Context ctx = waitForContext();
+ * // Now ctx can be used for GPU operations.
+ * @endcode
+ */
+inline Context waitForContext() {
+  std::future<Context> contextFuture = createContext();
+  return waitForContextFuture(contextFuture);
 }
 
 #ifdef USE_DAWN_API
@@ -1152,17 +1202,22 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {},
 #endif
 
 /**
- * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping.
- *
- * This callback is triggered when the GPU buffer mapping for a readback buffer is completed.
- * It verifies that the mapping operation was successful, retrieves the mapped memory, - * copies the data from the GPU buffer to a CPU memory region, unmaps the buffer, - * signals the completion by fulfilling the associated promise, and cleans up the allocated callback data. - * - * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success on success. - * @param message A string view containing additional information about the mapping operation. - * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the GPU buffer, - * buffer size, destination CPU memory pointer, and a promise for signaling completion. + * @brief Callback function invoked upon completion of an asynchronous GPU + * buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer + * is completed. It verifies that the mapping operation was successful, + * retrieves the mapped memory, copies the data from the GPU buffer to a CPU + * memory region, unmaps the buffer, signals the completion by fulfilling the + * associated promise, and cleans up the allocated callback data. + * + * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success + * on success. + * @param message A string view containing additional information about the + * mapping operation. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the GPU buffer, buffer size, destination CPU memory pointer, and a + * promise for signaling completion. * @param userdata2 Unused. */ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, @@ -1192,16 +1247,20 @@ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, } /** - * @brief Callback function invoked when the GPU queue’s submitted work is complete. - * - * This callback is registered with the GPU queue after submitting work. When invoked, - * it verifies that all queued work completed successfully, and then sets up the buffer - * mapping callback to initiate the asynchronous mapping of a readback buffer. The readback - * buffer is mapped to access the processed data on the CPU. - * - * @param status The status of the completed work. Expected to be WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the readback buffer, - * buffer size, destination CPU memory pointer, and a promise to signal completion. + * @brief Callback function invoked when the GPU queue’s submitted work is + * complete. + * + * This callback is registered with the GPU queue after submitting work. When + * invoked, it verifies that all queued work completed successfully, and then + * sets up the buffer mapping callback to initiate the asynchronous mapping of a + * readback buffer. The readback buffer is mapped to access the processed data + * on the CPU. + * + * @param status The status of the completed work. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the readback buffer, buffer size, destination CPU memory pointer, + * and a promise to signal completion. * @param userdata2 Unused. */ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, @@ -1543,12 +1602,14 @@ inline Shape cdiv(Shape total, Shape group) { } /** - * @brief Packages the shader compilation information along with a promise for asynchronous signaling. 
+ * @brief Packages the shader compilation information along with a promise for + * asynchronous signaling. * * This structure holds a pointer to a CompilationInfo instance that collects - * details such as status, messages, line numbers, and positions from the shader compilation. - * It also contains a shared pointer to a std::promise which is used to signal the completion - * of the asynchronous shader compilation process. + * details such as status, messages, line numbers, and positions from the shader + * compilation. It also contains a shared pointer to a std::promise which + * is used to signal the completion of the asynchronous shader compilation + * process. */ struct CompData { CompilationInfo *compInfo; @@ -1578,10 +1639,11 @@ struct CompData { * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize); + * std::future kernelFuture = createKernel(ctx, code, dataBindings, + numInputs, output, nThreads, params, paramsSize); * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); * @endcode - + */ inline std::future createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, @@ -1783,15 +1845,19 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, /** * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. * - * This callback is invoked when the GPU queue signals the completion of the submitted - * workload for a kernel dispatch. It receives the work-done status and a userdata pointer, - * which is expected to be a heap‑allocated pointer to a std::promise. + * This callback is invoked when the GPU queue signals the completion of the + * submitted workload for a kernel dispatch. It receives the work-done status + * and a userdata pointer, which is expected to be a heap‑allocated pointer to a + * std::promise. * - * On success, the promise is fulfilled by calling set_value(). Otherwise, it is set with an exception. - * After setting the promise state, the allocated memory for the promise is freed. + * On success, the promise is fulfilled by calling set_value(). Otherwise, it is + * set with an exception. After setting the promise state, the allocated memory + * for the promise is freed. * - * @param status The status of the work done. Expected to be WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A heap allocated pointer to std::promise which is set when the work is done. + * @param status The status of the work done. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set + * when the work is done. * @param userdata2 Unused. 
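 *
 * A registration sketch in the style used elsewhere in this file (the real
 * call site is the dispatch path below; the callback-info struct name is
 * assumed from the webgpu.h pattern used by deviceCallback above):
 * @code
 * auto *promise = new std::promise<void>();
 * WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
 *     .mode = WGPUCallbackMode_AllowSpontaneous,
 *     .callback = dispatchKernelCallback,
 *     .userdata1 = promise,
 *     .userdata2 = nullptr};
 * wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 * @endcode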
*/ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, @@ -1824,8 +1890,9 @@ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, tensorData, output,totalWorkgroups, params); - * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = WaitForFuture(ctx.instance, + * kernelFuture); * @endcode */ template diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index 75d9dc4..21a0005 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -185,7 +185,7 @@ void testContainers() { testRoundTrip(h[3]); } { - Context ctx = createContext(); + Context ctx = waitForContext(); std::array h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0}; Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data()); std::array h2; @@ -215,13 +215,14 @@ fn main( } } )"; - Context ctx = createContext( + std::future futureContext = createContext( {}, {}, /*device descriptor, enabling f16 in WGSL*/ { .requiredFeatureCount = 1, .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(), }); + Context ctx = waitForContextFuture(futureContext); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { From 9a08f8a875d74fda1644adbb367edfdf2f70838a Mon Sep 17 00:00:00 2001 From: MichealReed Date: Thu, 20 Feb 2025 13:54:02 -0600 Subject: [PATCH 03/17] adds sync wrappers --- cmake/example.cmake | 1 - examples/hello_world/run.cpp | 11 +- examples/render/run.cpp | 6 +- gpu.hpp | 372 +++++++++++++++++++++++++---------- numeric_types/half.cpp | 19 +- 5 files changed, 283 insertions(+), 126 deletions(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 5953876..cf697b5 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -54,7 +54,6 @@ if(EMSCRIPTEN) -sUSE_GLFW=3 \ -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \ -sASYNCIFY \ - -sASYNCIFY_DEBUG \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index c9f22c7..77549cf 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,7 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); - Context ctx = waitForContext(); + Context ctx = createContext(); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -36,14 +36,11 @@ int main(int argc, char **argv) { } Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf32); - std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf32}, + Kernel op = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - Kernel op = waitForFuture(ctx.instance, kernelFuture); - std::future dispatchFuture = dispatchKernel(ctx, op); - waitForFuture(ctx.instance, dispatchFuture); - std::future cpuFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); - waitForFuture(ctx.instance, cpuFuture); + dispatchKernel(ctx, op); + toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) 
{
     printf("  gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]);
   }
diff --git a/examples/render/run.cpp b/examples/render/run.cpp
index f9a90f9..64122cd 100644
--- a/examples/render/run.cpp
+++ b/examples/render/run.cpp
@@ -124,10 +124,8 @@ int main(int argc, char **argv) {
                  cdiv({NCOLS, NROWS, 1}, wgSize), params);
   printf("\033[2J\033[H");
   while (true) {
-    std::promise<void> promise;
-    std::future<void> future = promise.get_future();
-    dispatchKernel(ctx, renderKernel, promise);
-    wait(ctx, future);
+
+    dispatchKernel(ctx, renderKernel);
     toCPU(ctx, devScreen, screen.data(), sizeof(screen));
     params.time = getCurrentTimeInMilliseconds() - zeroTime;
 
diff --git a/gpu.hpp b/gpu.hpp
index 0119108..e050c87 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -807,11 +807,10 @@ inline void check(bool condition, const char *message,
  *
  * @code
  * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter,
- * devDescriptor); WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * devDescriptor); WGPUDevice device = wait(ctx, deviceFuture);
  * @endcode
  */
-template <typename T>
-T waitForFuture(WGPUInstance instance, std::future<T> &f) {
+template <typename T> T wait(Context &ctx, std::future<T> &f) {
 #ifdef __EMSCRIPTEN__
   // Poll until the future is ready.
   while (f.wait_for(std::chrono::milliseconds(0)) !=
@@ -823,7 +822,7 @@ T waitForFuture(WGPUInstance instance, std::future<T> &f) {
 #else
   while (f.wait_for(std::chrono::milliseconds(0)) !=
          std::future_status::ready) {
-    wgpuInstanceProcessEvents(instance);
+    wgpuInstanceProcessEvents(ctx.instance);
   }
   return f.get();
 #endif
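A minimal sketch of the renamed helper in isolation, assuming only the request helpers declared above (the default-constructed descriptors are illustrative):

```cpp
// Drain the low-level request futures through wait(ctx, ...), which pumps
// instance events (or yields to the JS loop under Emscripten) until ready.
Context ctx;
ctx.instance = wgpuCreateInstance(nullptr);
std::future<WGPUAdapter> adapterFuture = requestAdapterAsync(ctx.instance, {});
ctx.adapter = wait(ctx, adapterFuture);
std::future<WGPUDevice> deviceFuture = requestDeviceAsync(ctx.adapter, {});
ctx.device = wait(ctx, deviceFuture);
```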
@@ -832,12 +831,15 @@ T waitForFuture(WGPUInstance instance, std::future<T> &f) {
 // Context Callbacks & Helpers
 
 /**
- * @brief Waits for the provided std::future to become ready by polling its status.
+ * @brief Waits for the provided std::future to become ready by polling its
+ * status.
  *
- * This helper template function continuously checks the status of the provided std::future until it is ready.
- * On Emscripten builds, it yields control to the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous behavior.
- * On non-Emscripten platforms, it sleeps for a short duration (10 milliseconds) between checks.
- * Once the future is ready, its value is returned.
+ * This helper template function continuously checks the status of the provided
+ * std::future until it is ready. On Emscripten builds, it yields control to
+ * the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous
+ * behavior. On non-Emscripten platforms, it sleeps for a short duration (10
+ * milliseconds) between checks. Once the future is ready, its value is
+ * returned.
  *
  * @tparam T The type of the value contained in the future.
  * @param f The future to wait on.
@@ -849,20 +851,20 @@ T waitForFuture(WGPUInstance instance, std::future<T> &f) {
  * @endcode
  */
 template <typename T> T waitForContextFuture(std::future<T> &f) {
-  #ifdef __EMSCRIPTEN__
-  while (f.wait_for(std::chrono::milliseconds(0)) !=
-         std::future_status::ready) {
-    emscripten_sleep(1); // Yield back to the JS event loop.
-  }
-  return f.get();
-  #else
-  while (f.wait_for(std::chrono::milliseconds(0)) !=
-         std::future_status::ready) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(10));
-  }
-  return f.get();
-  #endif
+#ifdef __EMSCRIPTEN__
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    emscripten_sleep(1); // Yield back to the JS event loop.
+  }
+  return f.get();
+#else
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
   }
+  return f.get();
+#endif
+}
 
 /**
  * @brief Adapter callback function invoked upon completion of an asynchronous
@@ -1013,9 +1015,9 @@ requestDeviceAsync(WGPUAdapter adapter,
  *
  */
 inline std::future<Context>
-createContext(const WGPUInstanceDescriptor &desc = {},
-              const WGPURequestAdapterOptions &adapterOpts = {},
-              const WGPUDeviceDescriptor &devDescriptor = {}) {
+createContextAsync(const WGPUInstanceDescriptor &desc = {},
+                   const WGPURequestAdapterOptions &adapterOpts = {},
+                   const WGPUDeviceDescriptor &devDescriptor = {}) {
 
   auto promise = std::make_shared<std::promise<Context>>();
 
@@ -1030,7 +1032,7 @@ createContext(const WGPUInstanceDescriptor &desc = {},
   }
   try {
     auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts);
-    ctx.adapter = waitForFuture(ctx.instance, adapterFuture);
+    ctx.adapter = wait(ctx, adapterFuture);
     ctx.adapterStatus = WGPURequestAdapterStatus_Success;
   } catch (const std::exception &ex) {
     promise->set_exception(std::make_exception_ptr(ex));
@@ -1038,7 +1040,7 @@ createContext(const WGPUInstanceDescriptor &desc = {},
   }
   try {
     auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor);
-    ctx.device = waitForFuture(ctx.instance, deviceFuture);
+    ctx.device = wait(ctx, deviceFuture);
     ctx.deviceStatus = WGPURequestDeviceStatus_Success;
   } catch (const std::exception &ex) {
     promise->set_exception(std::make_exception_ptr(ex));
@@ -1053,10 +1055,11 @@ createContext(const WGPUInstanceDescriptor &desc = {},
 /**
  * @brief Synchronously waits for and returns the created GPU context.
  *
- * This function invokes the asynchronous createContext() factory function to create a GPU
- * context, then waits for its completion using waitForContextFuture. The returned Context
- * holds handles to the WebGPU instance, adapter, device, and queue, and is used for subsequent
- * GPU operations.
+ * This function invokes the asynchronous createContextAsync() factory function
+ * to create a GPU context, then waits for its completion using
+ * waitForContextFuture. The returned Context holds handles to the WebGPU
+ * instance, adapter, device, and queue, and is used for subsequent GPU
+ * operations.
  *
  * @return Context The fully initialized GPU context.
  *
@@ -1065,8 +1068,11 @@ createContext(const WGPUInstanceDescriptor &desc = {},
  * // Now ctx can be used for GPU operations.
  * @endcode
  */
-inline Context waitForContext() {
-  std::future<Context> contextFuture = createContext();
+inline Context createContext(const WGPUInstanceDescriptor &desc = {},
+                             const WGPURequestAdapterOptions &adapterOpts = {},
+                             const WGPUDeviceDescriptor &devDescriptor = {}) {
+  std::future<Context> contextFuture =
+      createContextAsync(desc, adapterOpts, devDescriptor);
   return waitForContextFuture(contextFuture);
 }
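Both entry points converge on the same code path; a short usage sketch (the empty-descriptor calls are illustrative defaults):

```cpp
// One-liner: blocks until adapter and device are ready.
Context ctx = createContext();

// Equivalent two-step form when the caller wants to overlap other setup:
std::future<Context> pending = createContextAsync();
// ... unrelated initialization work can happen here ...
Context ctx2 = waitForContextFuture(pending);
```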
@@ -1294,8 +1300,8 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
  * toCPU(ctx, tensor, data, bufferSize);
  * @endcode
  */
-inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
-                               size_t bufferSize, CopyData &op) {
+inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
+                                    size_t bufferSize, CopyData &op) {
   // Submit the command buffer and release it.
   wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
   wgpuCommandBufferRelease(op.commandBuffer);
@@ -1340,8 +1346,8 @@ inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
  * @param[in] bufferSize Size of the data buffer in bytes
  * @param[out] data Pointer to the CPU memory to copy the data to
  */
-inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
-                               size_t bufferSize) {
+inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
+                                    size_t bufferSize) {
   // Create a promise that will later be satisfied when the async copy
   // completes.
   auto promise = std::make_shared<std::promise<void>>();
@@ -1395,26 +1401,8 @@ inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
   return promise->get_future();
 }
 
-/**
- * @brief Overload of the toCPU function to copy data from a GPU buffer to CPU
- * memory for an array of floats instead of a pointer to a float buffer.
- * @param[in] ctx Context instance to manage the operation
- * @param[in] tensor Tensor instance representing the GPU buffer to copy from
- * @param[out] data Array of floats to copy the data to
- *
- * @code
- * std::future<void> toCPUFuture = toCPU(ctx, tensor, data);
- * WaitForFuture(ctx.instance, toCPUFuture);
- * @endcode
- */
-template <size_t N>
-inline std::future<void> toCPU(Context &ctx, Tensor &tensor,
-                               std::array<float, N> &data) {
-  return toCPU(ctx, tensor, data.data(), sizeof(data));
-}
-
-inline std::future<void> toCPU(Context &ctx, WGPUBuffer buffer, void *data,
-                               size_t size) {
+inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
+                                    size_t size) {
   // The size (in bytes) for the copy.
   uint64_t bufferSize = size;
 
@@ -1473,6 +1461,92 @@ inline std::future<void> toCPU(Context &ctx, WGPUBuffer buffer, void *data,
   return promise->get_future();
 }
 
+/**
+ * @brief Overload of the toCPU function to copy data from a GPU buffer to CPU
+ * memory for an array of floats instead of a pointer to a float buffer.
+ * @param[in] ctx Context instance to manage the operation
+ * @param[in] tensor Tensor instance representing the GPU buffer to copy from
+ * @param[out] data Array of floats to copy the data to
+ *
+ * @code
+ * std::future<void> toCPUFuture = toCPUAsync(ctx, tensor, data);
+ * wait(ctx, toCPUFuture);
+ * @endcode
+ */
+template <size_t N>
+inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
+                                    std::array<float, N> &data) {
+  return toCPUAsync(ctx, tensor, data.data(), sizeof(data));
+}
+
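The async form exists so a readback can overlap CPU-side work; a sketch, where `output`, `N`, and `prepareNextBatch` are illustrative names rather than library symbols:

```cpp
std::array<float, N> outputArr;
// Kick off the GPU -> CPU copy without blocking.
std::future<void> copy =
    toCPUAsync(ctx, output, outputArr.data(), sizeof(outputArr));
prepareNextBatch(); // hypothetical CPU work that overlaps the copy
wait(ctx, copy);    // block only once the data is actually needed
```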
+/**
+ * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU
+ * memory.
+ *
+ * This function synchronously waits for the asynchronous copy operation to
+ * complete, ensuring that the data is fully transferred from the GPU buffer to
+ * the CPU memory before returning.
+ *
+ * @param ctx Context instance to manage the operation
+ * @param tensor Tensor instance representing the GPU buffer to copy from
+ * @param data Pointer to the CPU memory to copy the data to
+ * @param bufferSize Size of the data buffer in bytes
+ *
+ * @code
+ * toCPU(ctx, tensor, data, bufferSize);
+ * @endcode
+ */
+inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
+  auto future = toCPUAsync(ctx, tensor, data, bufferSize);
+  wait(ctx, future);
+}
+
+/**
+ * @brief Synchronous wrapper for copying from a GPU buffer to CPU memory.
+ *
+ * This function synchronously waits for the asynchronous copy operation to
+ * complete, ensuring that the data is fully transferred from the GPU buffer to
+ * the CPU memory before returning.
+ *
+ * @param ctx Context instance to manage the operation
+ * @param buffer WGPUBuffer instance representing the GPU buffer to copy from
+ * @param data Pointer to the CPU memory to copy the data to
+ * @param size Size of the data buffer in bytes
+ *
+ * @code
+ * toCPU(ctx, buffer, data, size);
+ * @endcode
+ */
+inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
+  auto future = toCPUAsync(ctx, buffer, data, size);
+  wait(ctx, future);
+}
+
+/**
+ * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU
+ * memory for an array of floats instead of a pointer to a float buffer.
+ *
+ * This function synchronously waits for the asynchronous copy operation to
+ * complete, ensuring that the data is fully transferred from the GPU buffer to
+ * the CPU memory before returning.
+ *
+ * @param ctx Context instance to manage the operation
+ * @param tensor Tensor instance representing the GPU buffer to copy from
+ * @param data Array of floats to copy the data to
+ *
+ * @code
+ * toCPU(ctx, tensor, data);
+ * @endcode
+ */
+template <size_t N>
+inline void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
+  auto future = toCPUAsync(ctx, tensor, data);
+  wait(ctx, future);
+}
+
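A round trip through the synchronous wrappers, assuming a kf32 tensor of `N` floats as in examples/hello_world:

```cpp
std::array<float, N> host;
host.fill(1.0f);
Tensor dev = createTensor(ctx, Shape{N}, kf32, host.data());
toGPU(ctx, host.data(), dev); // CPU -> GPU upload
std::array<float, N> readback;
toCPU(ctx, dev, readback.data(), sizeof(readback)); // blocks until copied
```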
 /**
  * @brief Copies data from CPU memory to a GPU buffer. The toGPU overloads are
  * effectively a convenience wrapper around the WebGPU API call
@@ -1617,9 +1691,9 @@ struct CompData {
 };
 
 /**
- * @brief A factory function to create a kernel on the GPU. The kernel is
- * created with the given WGSL code, input tensors, output tensor, and
- * optional parameters.
+ * @brief A factory function to create a kernel asynchronously on the GPU.
+ * The kernel is created with the given WGSL code, input tensors,
+ * output tensor, and optional parameters.
  *
  * Note that the values of the input tensors are not used here, only the
  * reference handles to the underlying buffers as well as the size of the
@@ -1639,18 +1713,19 @@ struct CompData {
  * @return Kernel instance representing the created kernel
  *
  * @code
- * std::future<Kernel> kernelFuture = createKernel(ctx, code, dataBindings,
+ * std::future<Kernel> kernelFuture = createKernelAsync(ctx, code, dataBindings,
 numInputs, output, nThreads, params, paramsSize);
- * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture);
+ * Kernel kernel = wait(ctx, kernelFuture);
  * @endcode
  */
 inline std::future<Kernel>
-createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings,
-             size_t numTensors, const size_t *viewOffsets,
-             const Shape &totalWorkgroups, const void *params = nullptr,
-             size_t paramsSize = 0, CompilationInfo *compilationInfo = nullptr,
-             const char *cacheKey = nullptr) {
+createKernelAsync(Context &ctx, const KernelCode &code,
+                  const Tensor *dataBindings, size_t numTensors,
+                  const size_t *viewOffsets, const Shape &totalWorkgroups,
+                  const void *params = nullptr, size_t paramsSize = 0,
+                  CompilationInfo *compilationInfo = nullptr,
+                  const char *cacheKey = nullptr) {
   // Create a cache key by the pointer values of the data bindings and the
   // kernel code
   if (cacheKey != nullptr &&
@@ -1818,7 +1893,7 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings,
   // Register callback and then wait for the result.
   wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo);
-    waitForFuture(ctx.instance, compFuture);
+    wait(ctx, compFuture);
   }
 
   // Now create the compute pipeline using the shader module.
@@ -1842,35 +1917,81 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings,
   return outerFuture;
 }
 
-/**
- * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done.
+/**
+ * @brief Overload which wraps the createKernelAsync factory function to create
+ * a kernel on the GPU. This overload takes a pointer and size for the
+ * input tensors instead of a static collection and a void pointer for params
+ * instead of a static type.
  *
- * This callback is invoked when the GPU queue signals the completion of the
- * submitted workload for a kernel dispatch. It receives the work-done status
- * and a userdata pointer, which is expected to be a heap‑allocated pointer to a
- * std::promise<void>.
+ * @param[in] ctx Context instance to manage the kernel
+ * @param[in] code WGSL code for the kernel
+ * @param[in] dataBindings Pointer to a span of tensors bound to the kernel
+ * @param[in] numTensors Number of tensors in the dataBindings span
+ * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be
+ * a Shape of rank == 3.
+ * @param[in] params Optional parameters for the kernel. If the kernel does
+ * not have any parameters, use NoParam.
+ * @return Kernel instance representing the created kernel
  *
- * On success, the promise is fulfilled by calling set_value(). Otherwise, it is
- * set with an exception. After setting the promise state, the allocated memory
- * for the promise is freed.
+ * @code
+ * Kernel kernel = createKernel(ctx, code, dataBindings, numTensors,
+ *     viewOffsets, totalWorkgroups, params);
+ * @endcode
+ */
+inline Kernel createKernel(Context &ctx, const KernelCode &code,
+                           const Tensor *dataBindings, size_t numTensors,
+                           const size_t *viewOffsets,
+                           const Shape &totalWorkgroups,
+                           const void *params = nullptr, size_t paramsSize = 0,
+                           CompilationInfo *compilationInfo = nullptr,
+                           const char *cacheKey = nullptr) {
+  std::future<Kernel> kernelFuture = createKernelAsync(
+      ctx, code, dataBindings, numTensors, viewOffsets, totalWorkgroups, params,
+      paramsSize, compilationInfo, cacheKey);
+  return wait(ctx, kernelFuture);
+}
+
+/**
+ * @brief Overload which wraps the createKernelAsync factory function to create
+ * a kernel asynchronously on the GPU. This overload takes a static
+ * collection of input tensors instead of a pointer and a statically determined
+ * ParamsType instead of casting params to a void pointer.
 *
- * @param status The status of the work done. Expected to be
- * WGPUQueueWorkDoneStatus_Success on success.
- * @param userdata1 A heap allocated pointer to std::promise<void> which is set
- * when the work is done.
- * @param userdata2 Unused.
+ * @param[in] ctx Context instance to manage the kernel
+ * @param[in] code WGSL code for the kernel
+ * @param[in] dataBindings A Bindings of tensors whose GPU buffers are bound
+ * to the kernel as inputs and outputs.
+ * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be
+ * a Shape of rank == 3.
+ * @param[in] params Optional parameters for the kernel. If the kernel does
+ * not have any parameters, use NoParam.
+ * @return std::future<Kernel> A future that resolves to the created kernel
+ *
+ * @code
+ * std::future<Kernel> kernelFuture = createKernelAsync(ctx, code, tensorData,
+ *     totalWorkgroups, params);
+ * Kernel kernel = wait(ctx, kernelFuture);
+ * @endcode
  */
-inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status,
-                                   void *userdata1, void * /*userdata2*/) {
-  // Cast the userdata pointer back to our heap‑allocated promise.
-  auto *p = reinterpret_cast<std::promise<void> *>(userdata1);
-  if (status == WGPUQueueWorkDoneStatus_Success) {
-    p->set_value();
+template <typename ParamsType = NoParam, size_t numInputs>
+std::future<Kernel>
+createKernelAsync(Context &ctx, const KernelCode &code,
+                  const Bindings<numInputs> &dataBindings,
+                  const Shape &totalWorkgroups,
+                  const ParamsType &params = ParamsType{},
+                  CompilationInfo *compilationInfo = nullptr,
+                  const char *cacheKey = nullptr) {
+  if constexpr (!IsNoParam<ParamsType>) {
+    return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs,
+                             dataBindings.viewOffsets.data(), totalWorkgroups,
+                             reinterpret_cast<const void *>(&params),
+                             sizeof(ParamsType), compilationInfo, cacheKey);
   } else {
-    p->set_exception(std::make_exception_ptr(
-        std::runtime_error("Queue work did not complete successfully.")));
+    return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs,
+                             dataBindings.viewOffsets.data(), totalWorkgroups,
+                             nullptr, 0, compilationInfo, cacheKey);
   }
-  delete p; // free the heap allocation
 }
 
 /**
@@ -1890,18 +2011,17 @@ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status,
  * @return Kernel instance representing the created kernel
  *
  * @code
- * std::future<Kernel> kernelFuture = createKernel(ctx, code, tensorData,
- * output,totalWorkgroups, params); Kernel kernel = WaitForFuture(ctx.instance,
- * kernelFuture);
+ * Kernel kernel = createKernel(ctx, code, tensorData, totalWorkgroups,
+ * params);
  * @endcode
  */
 template <typename ParamsType = NoParam, size_t numInputs>
-std::future<Kernel> createKernel(Context &ctx, const KernelCode &code,
-                                 const Bindings<numInputs> &dataBindings,
-                                 const Shape &totalWorkgroups,
-                                 const ParamsType &params = ParamsType{},
-                                 CompilationInfo *compilationInfo = nullptr,
-                                 const char *cacheKey = nullptr) {
+Kernel createKernel(Context &ctx, const KernelCode &code,
+                    const Bindings<numInputs> &dataBindings,
+                    const Shape &totalWorkgroups,
+                    const ParamsType &params = ParamsType{},
+                    CompilationInfo *compilationInfo = nullptr,
+                    const char *cacheKey = nullptr) {
   if constexpr (!IsNoParam<ParamsType>) {
     return createKernel(ctx, code, dataBindings.data.data(), numInputs,
                         dataBindings.viewOffsets.data(), totalWorkgroups,
@@ -1914,6 +2034,37 @@ std::future<Kernel> createKernel(Context &ctx, const KernelCode &code,
   }
 }
 
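A sketch of the two creation paths through the Bindings overloads; `Params`, `kShaderCode`, `input`, `output`, and `N` are illustrative and must match the WGSL (compare examples/render/run.cpp):

```cpp
struct Params {
  float time;
};
// Synchronous: returns a ready-to-dispatch Kernel.
Kernel k = createKernel(ctx, {kShaderCode, 256, kf32},
                        Bindings{input, output}, {cdiv(N, 256), 1, 1},
                        Params{0.0f});
// Asynchronous: overlap shader compilation with other setup.
std::future<Kernel> kf =
    createKernelAsync(ctx, {kShaderCode, 256, kf32}, Bindings{input, output},
                      {cdiv(N, 256), 1, 1}, Params{0.0f});
Kernel k2 = wait(ctx, kf);
```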
+/**
+ * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done.
+ *
+ * This callback is invoked when the GPU queue signals the completion of the
+ * submitted workload for a kernel dispatch. It receives the work-done status
+ * and a userdata pointer, which is expected to be a heap‑allocated pointer to a
+ * std::promise<void>.
+ *
+ * On success, the promise is fulfilled by calling set_value(). Otherwise, it is
+ * set with an exception. After setting the promise state, the allocated memory
+ * for the promise is freed.
+ *
+ * @param status The status of the work done. Expected to be
+ * WGPUQueueWorkDoneStatus_Success on success.
+ * @param userdata1 A heap allocated pointer to std::promise<void> which is set
+ * when the work is done.
+ * @param userdata2 Unused.
+ */
+inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status,
+                                   void *userdata1, void * /*userdata2*/) {
+  // Cast the userdata pointer back to our heap‑allocated promise.
+  auto *p = reinterpret_cast<std::promise<void> *>(userdata1);
+  if (status == WGPUQueueWorkDoneStatus_Success) {
+    p->set_value();
+  } else {
+    p->set_exception(std::make_exception_ptr(
+        std::runtime_error("Queue work did not complete successfully.")));
+  }
+  delete p; // free the heap allocation
+}
+
 /**
  * @brief Asynchronously submits a kernel to the GPU queue for execution.
  * It also sets up a callback to notify when the kernel has finished executing
@@ -1930,10 +2081,10 @@ std::future<Kernel> createKernel(Context &ctx, const KernelCode &code,
  *
  * @code
  * std::future<void> dispatchFuture = dispatchKernelAsync(ctx, kernel);
- * WaitForFuture(ctx.instance, dispatchFuture);
+ * wait(ctx, dispatchFuture);
  * @endcode
  */
-inline std::future<void> dispatchKernel(Context &ctx, Kernel &kernel) {
+inline std::future<void> dispatchKernelAsync(Context &ctx, Kernel &kernel) {
   // If the kernel was used before, reset the command buffer.
   if (kernel->used) {
     resetCommandBuffer(ctx.device, kernel);
@@ -1962,6 +2113,23 @@ inline std::future<void> dispatchKernel(Context &ctx, Kernel &kernel) {
   return future;
 }
 
+/**
+ * @brief Synchronous wrapper for dispatchKernelAsync. This function submits
+ * the kernel to the GPU queue and waits for it to finish executing.
+ *
+ * @param[in] ctx Context instance to manage the kernel, from which the queue
+ * for the GPU is obtained
+ * @param[in] kernel Kernel instance to dispatch
+ *
+ * @code
+ * dispatchKernel(ctx, kernel);
+ * @endcode
+ */
+inline void dispatchKernel(Context &ctx, Kernel &kernel) {
+  auto future = dispatchKernelAsync(ctx, kernel);
+  wait(ctx, future);
+}
+
 } // namespace gpu
 
 #endif // GPU_H
diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp
index 21a0005..c183754 100644
--- a/numeric_types/half.cpp
+++ b/numeric_types/half.cpp
@@ -185,12 +185,11 @@ void testContainers() {
     testRoundTrip(h[3]);
   }
   {
-    Context ctx = waitForContext();
+    Context ctx = createContext();
     std::array<half, 8> h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0};
     Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data());
     std::array<half, 8> h2;
-    std::future<void> toCPUFuture = toCPU(ctx, devH, h2.data(), sizeof(h2));
-    waitForFuture(ctx.instance, toCPUFuture);
+    toCPU(ctx, devH, h2.data(), sizeof(h2));
     for (int i = 0; i < 8; ++i) {
       printResult(h[i].data == h2[i].data, "Container round trip",
                   static_cast<float>(h[i]), static_cast<float>(h2[i]));
@@ -215,14 +214,13 @@ fn main(
   }
 }
 )";
-  std::future<Context> futureContext = createContext(
+  Context ctx = createContext(
       {}, {},
      /*device descriptor, enabling f16 in WGSL*/
       {
           .requiredFeatureCount = 1,
          .requiredFeatures =
               std::array<WGPUFeatureName, 1>{WGPUFeatureName_ShaderF16}.data(),
       });
-  Context ctx = waitForContextFuture(futureContext);
   static constexpr size_t N = 10000;
   std::array<half, N> inputArr, outputArr;
   for (int i = 0; i < N; ++i) {
@@ -230,20 +228,17 @@ fn main(
   }
   Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data());
   Tensor output = createTensor(ctx, Shape{N}, kf16);
-  std::future<Kernel> kernelFuture = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
-                           {cdiv(N, 256), 1, 1});
-  Kernel op = waitForFuture(ctx.instance, kernelFuture);
-  std::future<void> dispatchFuture = dispatchKernel(ctx, op);
-  waitForFuture(ctx.instance, dispatchFuture);
-  std::future<void> toCPUFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
-  waitForFuture(ctx.instance, toCPUFuture);
+  Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
+                           {cdiv(N, 256), 1, 1});
+  dispatchKernel(ctx, op);
+  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   for (int i = 0; i < 12; ++i) {
     printf("  gelu(%.2f) = %.2f\n", static_cast(inputArr[i]),
static_cast(outputArr[i])); } } -int testMain() { +int testHalfMain() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); From 95e587d71d25ab74207648ca91500a7594bff870 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Thu, 20 Feb 2025 16:34:35 -0600 Subject: [PATCH 04/17] refactors the byIdx context function and sets USE_DAWN_API compile def on native --- cmake/dawn.cmake | 2 + cmake/gpu.cmake | 2 + examples/hello_world/run.cpp | 6 + gpu.hpp | 308 ++++++++++++++++++++++------------- 4 files changed, 201 insertions(+), 117 deletions(-) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index 2ead9ae..c6fed94 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -7,6 +7,8 @@ if(EMSCRIPTEN) set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) +else() + add_compile_definitions(USE_DAWN_API) endif() # Enable find for no dawn rebuilds with flutter run diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 6cce9e6..f936991 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -32,7 +32,9 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX) target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") if(NOT EMSCRIPTEN) + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/") + target_include_directories(gpu PUBLIC "${DAWN_DIR}/include/") else() target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 77549cf..b44934b 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,13 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); + #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + auto adaptersList = listAdapters(ctx); + LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str()); + #else Context ctx = createContext(); + #endif static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { diff --git a/gpu.hpp b/gpu.hpp index e050c87..906371c 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -16,9 +16,8 @@ #include // std::pair #include -#ifndef __EMSCRIPTEN__ -#else +#ifdef __EMSCRIPTEN__ #include "emscripten/emscripten.h" #endif @@ -255,6 +254,26 @@ inline std::string toString(const Shape &shape) { */ inline std::string toString(size_t value) { return std::to_string(value); } +/** + * @brief Converts a WGPUStringView to an std::string. + * + * If the view's data is null, an empty string is returned. If the view's + * length equals WGPU_STRLEN, it is assumed to be null‑terminated; otherwise, + * the explicit length is used. + * + * @param strView The WGPUStringView to convert. + * @return std::string The resulting standard string. + */ +inline std::string formatWGPUStringView(WGPUStringView strView) { + if (!strView.data) { + return ""; + } + if (strView.length == WGPU_STRLEN) { + return std::string(strView.data); + } + return std::string(strView.data, strView.length); +} + /** * @brief simple in-place string replacement helper function for substituting * placeholders in a WGSL string template. 
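The new helper covers the three states a WGPUStringView can be in; a small sketch as assertions, with the string literal purely illustrative:

```cpp
#include <cassert>

const char *msg = "NVIDIA GeForce RTX"; // illustrative adapter description
WGPUStringView nullView{.data = nullptr, .length = 0};
WGPUStringView terminated{.data = msg, .length = WGPU_STRLEN}; // null-terminated
WGPUStringView sized{.data = msg, .length = 6}; // explicit length

assert(formatWGPUStringView(nullView).empty());
assert(formatWGPUStringView(terminated) == "NVIDIA GeForce RTX");
assert(formatWGPUStringView(sized) == "NVIDIA");
```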
@@ -1076,136 +1095,191 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {}, return waitForContextFuture(contextFuture); } -#ifdef USE_DAWN_API +#ifndef __EMSCRIPTEN__ +#if USE_DAWN_API /** - * @brief Factory function to create a GPU context, which aggregates WebGPU API - * handles to interact with the GPU including the instance, adapter, device, and - * queue. + * @brief Retrieves the list of available GPU adapters from the Dawn instance. * - * The function takes gpu index to support for multi GPUs. - * To activate this function, it needs not only webgpu's headers but also DAWN's - * headers. + * This function creates a Dawn instance using the provided context's instance + * handle, then enumerates and returns the available GPU adapters as a vector. * - * If dawn is used, it also sets up an error callback for device loss. + * @param ctx The Context containing the WebGPU instance handle. + * @return std::vector A vector of available GPU + * adapters. + * + * @code + * std::vector adapters = getAdapters(ctx); + * @endcode + */ +inline std::vector getAdapters(Context &ctx) { + dawn::native::Instance dawnInstance( + reinterpret_cast(ctx.instance)); + return dawnInstance.EnumerateAdapters(); +} + +/** + * @brief Formats the given vector of Dawn adapters into a single concatenated string. * - * @param[in] gpuIdx GPU index - * @param[in] desc Instance descriptor for the WebGPU instance (optional) - * @param[in] devDescriptor Device descriptor for the WebGPU device (optional) - * @return Context instance representing the created GPU context + * This function iterates over each Dawn adapter in the provided vector, retrieves its + * description using the WebGPU API, and converts the description from a WGPUStringView + * to an std::string using the formatWGPUStringView helper. The resulting descriptions + * are concatenated into a single string separated by newline characters. * + * @param adapters A vector of Dawn adapters obtained from a WebGPU instance. + * @return std::string A newline-delimited string listing each adapter's description. + * * @code - * Context ctx = createContextByGpuIdx(1); + * std::string adapterList = formatAdapters(adapters); * @endcode */ -inline Context -createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { - Context context; - { -#ifdef __EMSCRIPTEN__ - // Emscripten does not support the instance descriptor - // and throws an assertion error if it is not nullptr. - context.instance = wgpuCreateInstance(nullptr); -#else - context.instance = wgpuCreateInstance(&desc); -#endif - // check status - check(context.instance, "Initialize WebGPU", __FILE__, __LINE__); +inline std::string formatAdapters(const std::vector &adapters) { + std::string adapterList; + for (size_t i = 0; i < adapters.size(); ++i) { + auto adapterPtr = adapters[i].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + std::string desc = formatWGPUStringView(info.description); + adapterList += "GPU Adapter [" + std::to_string(i) + "]: " + desc + "\n"; + wgpuAdapterInfoFreeMembers(info); + } } + return adapterList; +} - LOG(kDefLog, kInfo, "Requesting adapter"); - { - std::vector adapters = - dawn::native::Instance( - reinterpret_cast(context.instance)) - .EnumerateAdapters(); - LOG(kDefLog, kInfo, "The number of GPUs=%d\n", adapters.size()); - // Note: Second gpu is not available on Macos, but the number of GPUs is 2 - // on Macos. 
- // Calling wgpuAdapterGetInfo function for the second gpu becomes - // segfault. When you check all GPUs on linux, uncomment out following - // codes. - // - // for (size_t i = 0; i < adapters.size(); i++) { - // WGPUAdapterInfo info {}; - // auto ptr = adapters[i].Get(); - // if (ptr && adapters[i]) { - // wgpuAdapterGetInfo(ptr, &info); - // LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", i, info.description); - // wgpuAdapterInfoFreeMembers(info); - // } - // } - - { - LOG(kDefLog, kInfo, "Use GPU(Adapter)[%d]\n", gpuIdx); - auto ptr = adapters[gpuIdx].Get(); - if (ptr) { - WGPUAdapterInfo info{}; - wgpuAdapterGetInfo(ptr, &info); - LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", gpuIdx, - info.description); - wgpuAdapterInfoFreeMembers(info); - } - context.adapter = adapters[gpuIdx].Get(); - dawn::native::GetProcs().adapterAddRef(context.adapter); - } +/** + * @brief Lists the available GPU adapters in the current WebGPU instance. + * + * This function retrieves the list of available GPU adapters using the + * getAdapters helper function, then formats and returns the adapter + * descriptions as a single string using the formatAdapters helper function. + * + * @param ctx The Context containing the WebGPU instance handle. + * @return std::string A newline-delimited string listing each adapter's + * description. + * + * @code + * std::string adapterList = listAdapters(ctx); + * @endcode + */ +inline std::string listAdapters(Context &ctx) { + auto adapters = getAdapters(ctx); + return formatAdapters(adapters); +} + +/** + * @brief Asynchronously creates a GPU context using the specified GPU index. + * + * This function creates a WebGPU instance, retrieves the available GPU + * adapters, and selects the adapter at the specified index. It then requests a + * device from the selected adapter and sets up a logging callback for device + * errors. The function returns a future that will be fulfilled with the + * created Context once all operations are complete. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return std::future A future that will eventually hold the created + * Context. 
+ * + * @code + * std::future contextFuture = createContextByGpuIdxAsync(0); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +inline std::future +createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + auto promise = std::make_shared>(); + Context ctx; + + ctx.instance = wgpuCreateInstance(&desc); + + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); } + check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); - LOG(kDefLog, kInfo, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, WGPUStringView message, - void *pUserData, void *) { - DeviceData &devData = *reinterpret_cast(pUserData); - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %x", - static_cast(device)); - devData.device = device; - devData.requestEnded = true; - }; + // Use helper functions to obtain and format the adapters. + auto adapters = getAdapters(ctx); - WGPURequestDeviceCallbackInfo deviceCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1 = &devData, - .userdata2 = nullptr}; - wgpuAdapterRequestDevice(context.adapter, &devDescriptor, - deviceCallbackInfo); - - LOG(kDefLog, kInfo, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(context.instance); - } - LOG(kDefLog, kInfo, "Device request ended"); - assert(devData.requestEnded); - context.device = devData.device; - - WGPULoggingCallbackInfo loggingCallbackInfo = { - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, void *userdata1, - void *userdata2) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, - .userdata1 = nullptr, - .userdata2 = nullptr}; - wgpuDeviceSetLoggingCallback(context.device, loggingCallbackInfo); + if (gpuIdx >= adapters.size()) { + promise->set_exception( + std::make_exception_ptr(std::runtime_error("Invalid GPU index."))); + return promise->get_future(); + } + LOG(kDefLog, kInfo, "Using GPU Adapter[%d]", gpuIdx); + auto adapterPtr = adapters[gpuIdx].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s", gpuIdx, + formatWGPUStringView(info.description).c_str()); + wgpuAdapterInfoFreeMembers(info); + } + ctx.adapter = reinterpret_cast(adapterPtr); + dawn::native::GetProcs().adapterAddRef(ctx.adapter); + + LOG(kDefLog, kInfo, "Requesting device"); + // Request the device asynchronously (using our requestDeviceAsync helper). 
+ auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + try { + ctx.device = wait(ctx, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); } - context.queue = wgpuDeviceGetQueue(context.device); - return context; + + WGPULoggingCallbackInfo loggingCallbackInfo{ + .nextInChain = nullptr, + .callback = + [](WGPULoggingType type, WGPUStringView message, void *userdata1, + void *userdata2) { + LOG(kDefLog, kError, "Device logging callback: %.*s", + static_cast(message.length), message.data); + if (type == WGPULoggingType_Error) { + throw std::runtime_error("Device error logged."); + } + }, + .userdata1 = nullptr, + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); + return promise->get_future(); } -#endif + +/** + * @brief Synchronously creates a GPU context using the specified GPU index. + * + * This function calls the asynchronous createContextByGpuIdxAsync function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return Context The fully initialized GPU context. + * + * @code + * Context ctx = createContextByGpuIdx(0); + * @endcode + */ +inline Context createContextByGpuIdx(int gpuIdx, + const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + std::future contextFuture = + createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); + return waitForContextFuture(contextFuture); +} + +#endif // USE_DAWN_API +#endif // __EMSCRIPTEN__ /** * @brief Callback function invoked upon completion of an asynchronous GPU From 70d980287f9a7cca8889e166d67b802bc4b69319 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:14:41 -0600 Subject: [PATCH 05/17] tests toCPU, adds offset, adds gpuflow doc, default cmakelists builds test/test_gpu.cpp --- CMakeLists.txt | 18 +++++ docs/gpuflow.md | 78 +++++++++++++++++++ gpu.hpp | 110 ++++++++++++++++---------- test/test_gpu.cpp | 193 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 357 insertions(+), 42 deletions(-) create mode 100644 docs/gpuflow.md create mode 100644 test/test_gpu.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 816cdf3..a17602e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,24 @@ endif() include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") +target_link_libraries(gpu PRIVATE webgpu_dawn) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test) + +add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp) +target_link_libraries(test_gpu PRIVATE gpu) + +# Platform-specific post-build actions (e.g. 
copying DLLs for MSVC)
+if(MSVC)
+    add_custom_command(
+        TARGET test_gpu POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy
+            ${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll
+            $<TARGET_FILE_DIR:test_gpu>
+        COMMENT "Copying webgpu_dawn.dll to the build directory"
+    )
+endif()
+
 add_library(gpud SHARED gpu.hpp)
 set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX)
 target_link_libraries(gpud PRIVATE gpu)
diff --git a/docs/gpuflow.md b/docs/gpuflow.md
new file mode 100644
index 0000000..d4eb37a
--- /dev/null
+++ b/docs/gpuflow.md
@@ -0,0 +1,78 @@
+# GPU.cpp Lifecycle
+
+```mermaid
+flowchart TD
+    %% Data Preparation & Upload
+    subgraph "Data Preparation & Upload"
+        A["CPU Data"]
+        B["Define Data Properties<br> (shape, type, size)"]
+        C["Create GPU Buffer<br> (allocate raw buffer)"]
+        D["Create Tensor<br> (allocates Array with one<br> or more buffers<br> and associates Shape)"]
+
+        E["Upload Data via toGPU<br> (raw buffer)<br> toGPU(ctx, data, buffer, size)"]
+        F["Upload Data via toGPU<br> (Tensor overload)<br> toGPU(ctx, data, tensor)"]
+        G["Optional: Upload Kernel Parameters<br> toGPU(ctx, params, Kernel)"]
+    end
+
+    %% Buffer Setup & Bindings
+    subgraph "Buffer & Binding Setup"
+        H["Define Bindings<br> (Bindings, TensorView)"]
+        I["Map GPU buffers<br> to shader bindings<br> (Collection from Tensor<br> or single buffers)"]
+    end
+
+    %% Kernel Setup & Execution
+    subgraph "Kernel Setup & Execution"
+        J["Define KernelCode<br> (WGSL template, workgroup size, precision)"]
+        K["Create Kernel"]
+        L["Dispatch Kernel"]
+    end
+
+    %% GPU Execution & Result Readback
+    subgraph "GPU Execution & Result Readback"
+        M["Kernel Execution<br> (GPU shader runs)"]
+        N["Readback Data<br> (toCPU variants)"]
+    end
+
+    %% Context & Resources
+    O["Context<br> (Device, Queue,<br> TensorPool, KernelPool)"]
+
+    %% Flow Connections
+    A --> B
+    B --> C
+    B --> D
+    C --> E
+    D --> F
+    F --> H
+    E --> H
+    H --> I
+    I --> K
+    J --> K
+    G --- K
+    K --> L
+    L --> M
+    M --> N
+
+    %% Context shared by all stages
+    O --- D
+    O --- E
+    O --- F
+    O --- K
+    O --- L
+    O --- N
+```
+
+Rank 0: Scalar
+Rank 1: Vector
+Rank 2: Matrix
+Rank 3: 3D Tensor (or Cube)
+Rank 4: 4D Tensor
+Rank ..: Higher Dimensional Tensors
+
+
+• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`.
+• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};`
+• gpu::Bindings collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel.
+• The gpu::TensorPool (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup.
+• gpu::KernelCode contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration.
+• The gpu::createKernelAsync/gpu::createKernel functions (within the Execution Flow) use the gpu::Context, gpu::Bindings, and gpu::KernelCode to configure and construct a gpu::Kernel that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.).
+• gpu::KernelCode’s workgroup size (a gpu::Shape) defines the dispatch configuration, and the gpu::Kernel eventually uses the underlying gpu::Array (contains WGPUBuffer, WGPUBufferUsage, size_t) and gpu::Shape data from the created Tensor.
diff --git a/gpu.hpp b/gpu.hpp
index 906371c..931d646 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -16,7 +16,6 @@
 #include <utility> // std::pair
 #include 
-
 #ifdef __EMSCRIPTEN__
 #include "emscripten/emscripten.h"
 #endif
@@ -1106,7 +1105,7 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {},
  * @param ctx The Context containing the WebGPU instance handle.
  * @return std::vector<dawn::native::Adapter> A vector of available GPU
  * adapters.
- *
+ *
  * @code
  * std::vector<dawn::native::Adapter> adapters = getAdapters(ctx);
  * @endcode
@@ -1118,21 +1117,25 @@ inline std::vector<dawn::native::Adapter> getAdapters(Context &ctx) {
 }
 
 /**
- * @brief Formats the given vector of Dawn adapters into a single concatenated string.
+ * @brief Formats the given vector of Dawn adapters into a single concatenated
+ * string.
 *
- * This function iterates over each Dawn adapter in the provided vector, retrieves its
- * description using the WebGPU API, and converts the description from a WGPUStringView
- * to an std::string using the formatWGPUStringView helper. The resulting descriptions
- * are concatenated into a single string separated by newline characters.
+ * This function iterates over each Dawn adapter in the provided vector,
+ * retrieves its description using the WebGPU API, and converts the description
+ * from a WGPUStringView to an std::string using the formatWGPUStringView
+ * helper. The resulting descriptions are concatenated into a single string
+ * separated by newline characters.
 *
 * @param adapters A vector of Dawn adapters obtained from a WebGPU instance.
- * @return std::string A newline-delimited string listing each adapter's description.
+ * @return std::string A newline-delimited string listing each adapter's
+ * description.
+ * * @code * std::string adapterList = formatAdapters(adapters); * @endcode */ -inline std::string formatAdapters(const std::vector &adapters) { +inline std::string +formatAdapters(const std::vector &adapters) { std::string adapterList; for (size_t i = 0; i < adapters.size(); ++i) { auto adapterPtr = adapters[i].Get(); @@ -1157,7 +1160,7 @@ inline std::string formatAdapters(const std::vector &adap * @param ctx The Context containing the WebGPU instance handle. * @return std::string A newline-delimited string listing each adapter's * description. - * + * * @code * std::string adapterList = listAdapters(ctx); * @endcode @@ -1181,7 +1184,7 @@ inline std::string listAdapters(Context &ctx) { * @param devDescriptor Device descriptor for the WebGPU device (optional) * @return std::future A future that will eventually hold the created * Context. - * + * * @code * std::future contextFuture = createContextByGpuIdxAsync(0); * Context ctx = waitForContextFuture(contextFuture); @@ -1270,9 +1273,9 @@ createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, * Context ctx = createContextByGpuIdx(0); * @endcode */ -inline Context createContextByGpuIdx(int gpuIdx, - const WGPUInstanceDescriptor &desc = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { +inline Context +createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { std::future contextFuture = createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); return waitForContextFuture(contextFuture); @@ -1365,17 +1368,19 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, /** * @brief Copies data from a GPU buffer to CPU memory. * @param[in] ctx Context instance to manage the operation - * @param[in] tensor Tensor instance representing the GPU buffer to copy from * @param[out] data Pointer to the CPU memory to copy the data to * @param[in] bufferSize Size of the data buffer in bytes * @param[in] op StagingBuffer instance to manage the operation + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. * * @code * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize, CopyData &op) { + +// NOTE: I think this one is redundant? CopyData not used externally. +inline std::future toCPUAsync(Context &ctx, void *data, size_t bufferSize, + CopyData &op, size_t sourceOffset = 0) { // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); @@ -1388,8 +1393,8 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, CallbackData *cbData = new CallbackData{ op.readbackBuffer, // The GPU buffer to be read back. bufferSize, - data, // CPU memory destination. - promise // The promise to be signaled. + data, // CPU memory destination. + promise, // The promise to be signaled. }; // Set up the work-done callback to initiate the buffer mapping. @@ -1402,6 +1407,11 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + // Release the readback buffer as it is no longer needed. 
+  if (op.readbackBuffer) {
+    wgpuBufferRelease(op.readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
@@ -1417,11 +1427,13 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
  *
  * @param[in] ctx Context instance to manage the operation
  * @param[in] tensor Tensor instance representing the GPU buffer to copy from
- * @param[in] bufferSize Size of the data buffer in bytes
+ * @param[in] bufferSize Number of bytes to read back into `data`.
  * @param[out] data Pointer to the CPU memory to copy the data to
+ * @param[in] sourceOffset Offset in the GPU buffer to start copying from.
  */
 inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
-                       size_t bufferSize) {
+                       size_t bufferSize,
+                       size_t sourceOffset = 0) {
   // Create a promise that will later be satisfied when the async copy
   // completes.
   auto promise = std::make_shared<std::promise<void>>();
 
@@ -1430,7 +1442,7 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   WGPUBufferDescriptor readbackBufferDescriptor = {
       .label = {.data = nullptr, .length = 0},
       .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
-      .size = bufferSize,
+      .size = bufferSize, // Size of the readback buffer.
   };
   WGPUBuffer readbackBuffer =
       wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
 
@@ -1438,8 +1450,9 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   // Create a command encoder and record a copy from the tensor GPU buffer
   WGPUCommandEncoder commandEncoder =
       wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
-  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
-                                       readbackBuffer, 0, bufferSize);
+  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer,
+                                       sourceOffset, readbackBuffer, 0,
+                                       bufferSize);
   // Finish recording by creating a command buffer and release the encoder.
   WGPUCommandBuffer commandBuffer =
       wgpuCommandEncoderFinish(commandEncoder, nullptr);
@@ -1472,13 +1485,16 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   // queueWorkDoneCallback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
+  if (readbackBuffer) {
+    wgpuBufferRelease(readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
 inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
-                       size_t size) {
-  // The size (in bytes) for the copy.
-  uint64_t bufferSize = size;
+                       size_t bufferSize,
+                       size_t sourceOffset = 0) {
 
   // Create an operation structure (here we reuse CopyData solely for its
   // members that we need to create a readback buffer and command buffer).
@@ -1503,7 +1519,7 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
   {
     WGPUCommandEncoder commandEncoder =
         wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
-    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0,
+    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, sourceOffset,
                                          op.readbackBuffer, 0, bufferSize);
     op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
     wgpuCommandEncoderRelease(commandEncoder);
@@ -1516,10 +1532,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
 
   // Allocate callback data
   CallbackData *cbData = new CallbackData{
-      op.readbackBuffer,               // The readback buffer created above.
-      static_cast<size_t>(bufferSize), // Size of the copy.
-      data,                            // Destination CPU memory.
-      promise                          // Our promise to satisfy when done.
+      op.readbackBuffer, // The readback buffer created above.
+      bufferSize,        // Size of the copy.
+      data,              // Destination CPU memory.
+      promise            // Our promise to satisfy when done.
   };
 
   // Set up the queue work-done callback info.
@@ -1532,6 +1548,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
   // Start the asynchronous chain by registering the work-done callback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
+  if (op.readbackBuffer) {
+    wgpuBufferRelease(op.readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
@@ -1548,9 +1568,11 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
  * @endcode
  */
 template <size_t N>
-inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
-                                    std::array<float, N> &data) {
-  return toCPUAsync(ctx, tensor, data.data(), sizeof(data));
+inline std::future<void>
+toCPUAsync(Context &ctx, Tensor &tensor, std::array<float, N> &data,
+           size_t sourceOffset = 0) {
+  return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset
+  );
 }
 
 /**
@@ -1571,8 +1593,10 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
  * toCPU(ctx, tensor, data, bufferSize, instance);
  * @endcode
  */
-inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
-  auto future = toCPUAsync(ctx, tensor, data, bufferSize);
+inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
+                  size_t sourceOffset = 0) {
+  auto future =
+      toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset);
   wait(ctx, future);
 }
 
@@ -1593,8 +1617,9 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
  * toCPU(ctx, buffer, data, size, instance);
  * @endcode
  */
-inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
-  auto future = toCPUAsync(ctx, buffer, data, size);
+inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size,
+                  size_t sourceOffset = 0) {
+  auto future = toCPUAsync(ctx, buffer, data, size, sourceOffset);
   wait(ctx, future);
 }
 
@@ -1616,8 +1641,9 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
  * @endcode
  */
 template <size_t N>
-inline void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
-  auto future = toCPUAsync(ctx, tensor, data);
+inline void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data,
+                  size_t sourceOffset = 0) {
+  auto future = toCPUAsync(ctx, tensor, data, sourceOffset);
   wait(ctx, future);
 }
 

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
new file mode 100644
index 0000000..2cc4290
--- /dev/null
+++ b/test/test_gpu.cpp
@@ -0,0 +1,193 @@
+//// filepath: /d:/Code/git/forks/gpu.cpp/test/test_gpu_integration.cpp
+#include "gpu.hpp"
+#include <array>
+#include <cstring>
+#include <cassert>
+#include <future>
+#include <memory>
+#include <vector>
+
+using namespace gpu;
+
+// A simple WGSL copy kernel that copies input to output.
+static const char *kCopyKernel = R"(
+@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+@compute @workgroup_size({{workgroupSize}})
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+  let i: u32 = gid.x;
+  if (i < arrayLength(&inp)) {
+    out[i] = inp[i];
+  }
+}
+)";
+
+// Test using the overload that takes a Tensor.
+void testToCPUWithTensor() {
+    LOG(kDefLog, kInfo, "Running testToCPUWithTensor...");
+
+    // Create a real GPU context.
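+    // Two context-creation paths: USE_DAWN_API builds select a physical
+    // adapter by index via createContextByGpuIdx(0); other builds use the
+    // default adapter chosen by createContext().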
+    #ifdef USE_DAWN_API
+    Context ctx = createContextByGpuIdx(0);
+    #else
+    Context ctx = createContext();
+    #endif
+
+    constexpr size_t N = 1024;
+    std::array<float, N> inputData, outputData;
+    for (size_t i = 0; i < N; ++i) {
+        inputData[i] = static_cast<float>(i);
+        outputData[i] = 0.0f;
+    }
+
+    // Create input and output tensors.
+    Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data());
+    Tensor outputTensor = createTensor(ctx, Shape{N}, kf32);
+
+    // Create and dispatch the copy kernel.
+    Kernel copyKernel = createKernel(ctx, {kCopyKernel, 256, kf32},
+                                     Bindings{inputTensor, outputTensor},
+                                     {cdiv(N, 256), 1, 1});
+    dispatchKernel(ctx, copyKernel);
+
+    // Synchronously copy GPU output to CPU using the tensor overload.
+    toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData));
+
+    // Verify the output matches the input.
+    for (size_t i = 0; i < N; ++i) {
+        LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
+        LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+        assert(outputData[i] == inputData[i]);
+    }
+    LOG(kDefLog, kInfo, "testToCPUWithTensor passed.");
+}
+
+// Test using the overload that takes a raw GPU buffer.
+// We reuse the Tensor's underlying buffer for this test.
+void testToCPUWithBuffer() {
+    LOG(kDefLog, kInfo, "Running testToCPUWithBuffer...");
+
+    #ifdef USE_DAWN_API
+    Context ctx = createContextByGpuIdx(0);
+    #else
+    Context ctx = createContext();
+    #endif
+
+    constexpr size_t N = 1024;
+    std::array<float, N> data, outputData;
+    for (size_t i = 0; i < N; ++i) {
+        data[i] = static_cast<float>(i * 2);
+        outputData[i] = 0.0f;
+    }
+
+    // Create a tensor to allocate a GPU buffer and initialize it.
+    Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data());
+
+    // Now extract the raw GPU buffer from the tensor.
+    WGPUBuffer gpuBuffer = tensor.data.buffer;
+
+    // Use the WGPUBuffer overload. This call returns a future.
+    auto future = toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0);
+    wait(ctx, future);
+
+    // Verify that the CPU output matches the original data.
+    for (size_t i = 0; i < N; ++i) {
+        LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+        assert(outputData[i] == data[i]);
+    }
+    LOG(kDefLog, kInfo, "testToCPUWithBuffer passed.");
+}
+
+void testToCPUWithTensorSourceOffset() {
+    LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset...");
+#ifdef USE_DAWN_API
+    Context ctx = createContextByGpuIdx(0);
+#else
+    Context ctx = createContext();
+#endif
+
+    constexpr size_t numElements = 25;
+    constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements
+    constexpr size_t copyCount = 10;           // Number of floats to copy
+    size_t copySize = copyCount * sizeof(float);
+
+    // Create an input array with known data.
+    std::array<float, numElements> inputData{};
+    for (size_t i = 0; i < numElements; ++i) {
+        inputData[i] = static_cast<float>(i + 50); // Arbitrary values
+    }
+    // Create a tensor from the full data.
+    Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data());
+
+    // Allocate a destination CPU buffer exactly as large as the data we want to copy.
+    std::vector<float> cpuOutput(copyCount, -1.0f);
+
+    // Set sourceOffset to skip the first few float elements
+    size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float);
+    // Call the tensor overload with sourceOffset and destOffset = 0.
+    auto future = toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes);
+    wait(ctx, future);
+
+    // Verify the copied data matches the expected subset.
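+    // Example of the offset arithmetic: sourceOffsetElements = 5, so the byte
+    // offset is 5 * sizeof(float) = 20 and cpuOutput[0] should equal
+    // inputData[5], cpuOutput[1] should equal inputData[6], and so on.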
+    for (size_t i = 0; i < copyCount; ++i) {
+        float expected = inputData[sourceOffsetElements + i];
+        float actual = cpuOutput[i];
+        LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+        LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+        assert(expected == actual);
+    }
+    LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed.");
+}
+
+void testToCPUWithBufferSourceOffset() {
+    LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset...");
+#ifdef USE_DAWN_API
+    Context ctx = createContextByGpuIdx(0);
+#else
+    Context ctx = createContext();
+#endif
+
+    constexpr size_t numElements = 30;
+    constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements
+    constexpr size_t copyCount = 12;           // Number of floats to copy
+    size_t copySize = copyCount * sizeof(float);
+
+    // Create an input array with arbitrary data.
+    std::array<float, numElements> inputData{};
+    for (size_t i = 0; i < numElements; ++i) {
+        inputData[i] = static_cast<float>(i + 100);
+    }
+    // Create a tensor to initialize a GPU buffer.
+    Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data());
+    // Extract the raw GPU buffer from the tensor.
+    WGPUBuffer buffer = tensor.data.buffer;
+
+    // Allocate a destination CPU buffer exactly as large as needed.
+    std::vector<float> cpuOutput(copyCount, -2.0f);
+    size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float);
+
+    // Call the buffer overload with sourceOffset and destOffset = 0.
+    auto future = toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes);
+    wait(ctx, future);
+
+    // Verify that the copied data matches the expected subset.
+    for (size_t i = 0; i < copyCount; ++i) {
+        float expected = inputData[sourceOffsetElements + i];
+        float actual = cpuOutput[i];
+        LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+        LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+        assert(expected == actual);
+    }
+    LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed.");
+}
+
+int main() {
+    LOG(kDefLog, kInfo, "Running GPU integration tests...");
+    testToCPUWithTensor();
+    testToCPUWithBuffer();
+    testToCPUWithTensorSourceOffset();
+    testToCPUWithBufferSourceOffset();
+    LOG(kDefLog, kInfo, "All tests passed.");
+    return 0;
+}
\ No newline at end of file

From 16feb9e9f32e8cc2bbc12019a448c856a061d19f Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Fri, 21 Feb 2025 22:16:44 -0600
Subject: [PATCH 06/17] remove path

---
 test/test_gpu.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index 2cc4290..0954e44 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -1,4 +1,3 @@
-//// filepath: /d:/Code/git/forks/gpu.cpp/test/test_gpu_integration.cpp
 #include "gpu.hpp"
 #include <array>
 #include <cstring>

From e61e80917a73406e8fb8af5a94c743982231f51b Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Fri, 21 Feb 2025 22:18:53 -0600
Subject: [PATCH 07/17] format

---
 test/test_gpu.cpp | 294 +++++++++++++++++++++++-----------------------
 1 file changed, 149 insertions(+), 145 deletions(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index 0954e44..48aa1bc 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -1,7 +1,7 @@
 #include "gpu.hpp"
 #include <array>
-#include <cstring>
 #include <cassert>
+#include <cstring>
 #include <future>
 #include <memory>
 #include <vector>
@@ -24,169 +24,173 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
 
 // Test using the overload that takes a Tensor.
 void testToCPUWithTensor() {
-    LOG(kDefLog, kInfo, "Running testToCPUWithTensor...");
-
-    // Create a real GPU context.
- #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); - #else - Context ctx = createContext(); - #endif - - constexpr size_t N = 1024; - std::array inputData, outputData; - for (size_t i = 0; i < N; ++i) { - inputData[i] = static_cast(i); - outputData[i] = 0.0f; - } - - // Create input and output tensors. - Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); - Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); - - // Create and dispatch the copy kernel. - Kernel copyKernel = createKernel(ctx, {kCopyKernel, 256, kf32}, - Bindings{inputTensor, outputTensor}, - {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, copyKernel); - - // Synchronously copy GPU output to CPU using the tensor overload. - toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); - - // Verify the output matches the input. - for (size_t i = 0; i < N; ++i) { - LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); - LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); - assert(outputData[i] == inputData[i]); - } - LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); + LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); + +// Create a real GPU context. +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + outputData[i] = 0.0f; + } + + // Create input and output tensors. + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); + + // Create and dispatch the copy kernel. + Kernel copyKernel = + createKernel(ctx, {kCopyKernel, 256, kf32}, + Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Synchronously copy GPU output to CPU using the tensor overload. + toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); + + // Verify the output matches the input. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); } // Test using the overload that takes a raw GPU buffer. // We reuse the Tensor's underlying buffer for this test. void testToCPUWithBuffer() { - LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); - - #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); - #else - Context ctx = createContext(); - #endif - - constexpr size_t N = 1024; - std::array data, outputData; - for (size_t i = 0; i < N; ++i) { - data[i] = static_cast(i * 2); - outputData[i] = 0.0f; - } - - // Create a tensor to allocate a GPU buffer and initialize it. - Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); - - // Now extract the raw GPU buffer from the tensor. - WGPUBuffer gpuBuffer = tensor.data.buffer; - - // Use the WGPUBuffer overload. This call returns a future. - auto future = toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); - wait(ctx, future); - - // Verify that the CPU output matches the original data. 
- for (size_t i = 0; i < N; ++i) { - LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); - assert(outputData[i] == data[i]); - } - LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); + LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array data, outputData; + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast(i * 2); + outputData[i] = 0.0f; + } + + // Create a tensor to allocate a GPU buffer and initialize it. + Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); + + // Now extract the raw GPU buffer from the tensor. + WGPUBuffer gpuBuffer = tensor.data.buffer; + + // Use the WGPUBuffer overload. This call returns a future. + auto future = + toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); + wait(ctx, future); + + // Verify that the CPU output matches the original data. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == data[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); } void testToCPUWithTensorSourceOffset() { - LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); + LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); + Context ctx = createContextByGpuIdx(0); #else - Context ctx = createContext(); + Context ctx = createContext(); #endif - constexpr size_t numElements = 25; - constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements - constexpr size_t copyCount = 10; // Number of floats to copy - size_t copySize = copyCount * sizeof(float); - - // Create an input array with known data. - std::array inputData{}; - for (size_t i = 0; i < numElements; ++i) { - inputData[i] = static_cast(i + 50); // Arbitrary values - } - // Create a tensor from the full data. - Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); - - // Allocate a destination CPU buffer exactly as large as the data we want to copy. - std::vector cpuOutput(copyCount, -1.0f); - - // Set sourceOffset to skip the first few float elements - size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); - // Call the tensor overload with sourceOffset and destOffset = 0. - auto future = toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); - wait(ctx, future); - - // Verify the copied data matches the expected subset. - for (size_t i = 0; i < copyCount; ++i) { - float expected = inputData[sourceOffsetElements + i]; - float actual = cpuOutput[i]; - LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); - LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); - assert(expected == actual); - } - LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); + constexpr size_t numElements = 25; + constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements + constexpr size_t copyCount = 10; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with known data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 50); // Arbitrary values + } + // Create a tensor from the full data. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + + // Allocate a destination CPU buffer exactly as large as the data we want to + // copy. 
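+  // The -1.0f fill value acts as a sentinel: any element still equal to
+  // -1.0f after the readback was never written by the copy.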
+ std::vector cpuOutput(copyCount, -1.0f); + + // Set sourceOffset to skip the first few float elements + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + // Call the tensor overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); } void testToCPUWithBufferSourceOffset() { - LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); + LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); + Context ctx = createContextByGpuIdx(0); #else - Context ctx = createContext(); + Context ctx = createContext(); #endif - constexpr size_t numElements = 30; - constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements - constexpr size_t copyCount = 12; // Number of floats to copy - size_t copySize = copyCount * sizeof(float); - - // Create an input array with arbitrary data. - std::array inputData{}; - for (size_t i = 0; i < numElements; ++i) { - inputData[i] = static_cast(i + 100); - } - // Create a tensor to initialize a GPU buffer. - Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); - // Extract the raw GPU buffer from the tensor. - WGPUBuffer buffer = tensor.data.buffer; - - // Allocate a destination CPU buffer exactly as large as needed. - std::vector cpuOutput(copyCount, -2.0f); - size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); - - // Call the buffer overload with sourceOffset and destOffset = 0. - auto future = toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); - wait(ctx, future); - - // Verify that the copied data matches the expected subset. - for (size_t i = 0; i < copyCount; ++i) { - float expected = inputData[sourceOffsetElements + i]; - float actual = cpuOutput[i]; - LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); - LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); - assert(expected == actual); - } - LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); + constexpr size_t numElements = 30; + constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements + constexpr size_t copyCount = 12; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with arbitrary data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 100); + } + // Create a tensor to initialize a GPU buffer. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + // Extract the raw GPU buffer from the tensor. + WGPUBuffer buffer = tensor.data.buffer; + + // Allocate a destination CPU buffer exactly as large as needed. + std::vector cpuOutput(copyCount, -2.0f); + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + + // Call the buffer overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify that the copied data matches the expected subset. 
+ for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); } int main() { - LOG(kDefLog, kInfo, "Running GPU integration tests..."); - testToCPUWithTensor(); - testToCPUWithBuffer(); - testToCPUWithTensorSourceOffset(); - testToCPUWithBufferSourceOffset(); - LOG(kDefLog, kInfo, "All tests passed."); - return 0; -} \ No newline at end of file + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + LOG(kDefLog, kInfo, "All tests passed."); + return 0; +} From ad8698dc1cb10ac89f020e6920d680328a6200ae Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:22:25 -0600 Subject: [PATCH 08/17] doc formatting --- docs/gpuflow.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/gpuflow.md b/docs/gpuflow.md index d4eb37a..420397d 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -61,18 +61,18 @@ flowchart TD O --- N ``` +• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. +• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` +• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. +• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. +• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. +• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). +• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains` WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor. + +`gpu::Tensor` Ranks: Rank 0: Scalar Rank 1: Vector Rank 2: Matrix Rank 3: 3D Tensor (or Cube) Rank 4: 4D Tensor -Rank ..: Higher Dimensional Tensors - - -• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. -• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` -• gpu::Bindings collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. -• The gpu::TensorPool (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. -• gpu::KernelCode contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. 
-• The gpu::createKernelAsync/gpu::createKernel functions (within the Execution Flow) use the gpu::Context, gpu::Bindings, and gpu::KernelCode to configure and construct a gpu::Kernel that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.).
-• gpu::KernelCode’s workgroup size (a gpu::Shape) defines the dispatch configuration, and the gpu::Kernel eventually uses the underlying gpu::Array (contains WGPUBuffer, WGPUBufferUsage, size_t) and gpu::Shape data from the created Tensor.
+Rank (max 8): Higher Dimensional Tensors
\ No newline at end of file

From 025af2a8f4621ba9612354a6e524044da2188ac3 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Fri, 21 Feb 2025 22:29:25 -0600
Subject: [PATCH 09/17] doc nits

---
 docs/gpuflow.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/gpuflow.md b/docs/gpuflow.md
index 420397d..fee9d4c 100644
--- a/docs/gpuflow.md
+++ b/docs/gpuflow.md
@@ -11,7 +11,7 @@ flowchart TD
     E["Upload Data via toGPU<br/>(raw buffer)<br/>toGPU(ctx, data, buffer, size)"]
     F["Upload Data via toGPU<br/>(Tensor overload)<br/>toGPU(ctx, data, tensor)"]
-    G["Optional: Upload Kernel Parameters<br/>toGPU(ctx, params, Kernel)"]
+    G["Optional:<br/>Kernel Parameters<br/>
toGPU(ctx, params, Kernel)"] end %% Buffer Setup & Bindings From 3776dcd50152ba4fc18ca9029006bd9e9588dca7 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 11:33:30 -0600 Subject: [PATCH 10/17] set project root on root cmakelists --- CMakeLists.txt | 2 +- cmake/gpu.cmake | 1 - docs/gpuflow.md | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a17602e..85911a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # and cmake/gpu.cmake for more details cmake_minimum_required(VERSION 3.28) project(gpu) - +set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP set(CMAKE_CXX_STANDARD 20) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index f936991..d991a18 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -39,4 +39,3 @@ else() target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") endif() - diff --git a/docs/gpuflow.md b/docs/gpuflow.md index fee9d4c..d13a228 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -75,4 +75,4 @@ Rank 1: Vector Rank 2: Matrix Rank 3: 3D Tensor (or Cube) Rank 4: 4D Tensor -Rank (max 8): Higher Dimensional Tensors \ No newline at end of file +Rank (max 8): Higher Dimensional Tensors From d58e1911b5e015ea073c4e4350d3378a2edf80bd Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 16:15:51 -0600 Subject: [PATCH 11/17] fix linux issue with callback info --- gpu.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 931d646..8c661bc 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -1354,11 +1355,12 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, __LINE__); // Set up the buffer mapping callback information. - WGPUBufferMapCallbackInfo mapCallbackInfo; - mapCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; - mapCallbackInfo.callback = bufferMapCallback; - mapCallbackInfo.userdata1 = cbData; - mapCallbackInfo.userdata2 = nullptr; + WGPUBufferMapCallbackInfo mapCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = bufferMapCallback, + .userdata1 = cbData, // Pass the callback data. + .userdata2 = nullptr // No additional user data. + }; // Begin the asynchronous mapping of the readback buffer. wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize, From 498ba74b73962d8b647b844fc570cf758ebaf467 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 17:54:16 -0600 Subject: [PATCH 12/17] should not release readback buffer --- gpu.hpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 8c661bc..4854338 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1486,11 +1486,7 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Register the callback. The async chain continues inside // queueWorkDoneCallback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - - if (readbackBuffer) { - wgpuBufferRelease(readbackBuffer); - } - + return promise->get_future(); } @@ -1550,10 +1546,6 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, // Start the asynchronous chain by registering the work-done callback. 
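   // The chain: the work-done callback fires once the submitted copy has
   // executed on the queue, then maps the readback buffer; bufferMapCallback
   // memcpys the mapped range into the caller's pointer and fulfills the
   // promise behind the returned future.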
 wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
-  if (op.readbackBuffer) {
-    wgpuBufferRelease(op.readbackBuffer);
-  }
-
   return promise->get_future();
 }
 

From 2db9be10fb3f0298294ba199d71eca894746e3a6 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sat, 22 Feb 2025 18:07:09 -0600
Subject: [PATCH 13/17] clean up callback syntax

---
 gpu.hpp | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/gpu.hpp b/gpu.hpp
index 4854338..b057514 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -1306,7 +1306,7 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {},
  */
 inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message,
                               void *userdata1, void * /*userdata2*/) {
-  CallbackData *cbData = reinterpret_cast<CallbackData *>(userdata1);
+  const CallbackData *cbData = static_cast<const CallbackData *>(userdata1);
   // Check that mapping succeeded.
   check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__,
         __LINE__);
@@ -1349,17 +1349,17 @@ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message,
  */
 inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
                                   void *userdata1, void * /*userdata2*/) {
-  CallbackData *cbData = reinterpret_cast<CallbackData *>(userdata1);
+  const CallbackData *cbData = static_cast<const CallbackData *>(userdata1);
   // Ensure the queue work finished successfully.
   check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__,
         __LINE__);
 
   // Set up the buffer mapping callback information.
   WGPUBufferMapCallbackInfo mapCallbackInfo = {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback = bufferMapCallback,
-      .userdata1 = cbData, // Pass the callback data.
-      .userdata2 = nullptr // No additional user data.
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = bufferMapCallback,
+      .userdata1 = const_cast<CallbackData *>(cbData), // Pass the callback data.
+      .userdata2 = nullptr // No additional user data.
   };
 
   // Begin the asynchronous mapping of the readback buffer.
@@ -1400,11 +1400,11 @@ inline std::future<void> toCPUAsync(Context &ctx, void *data, size_t bufferSize,
   };
 
   // Set up the work-done callback to initiate the buffer mapping.
-  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo;
-  workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
-  workDoneCallbackInfo.callback = queueWorkDoneCallback;
-  workDoneCallbackInfo.userdata1 = cbData; // Pass the callback data.
-  workDoneCallbackInfo.userdata2 = nullptr;
+  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = queueWorkDoneCallback,
+      .userdata1 = const_cast<CallbackData *>(cbData),
+      .userdata2 = nullptr};
 
   // Begin the asynchronous chain by registering the queue work-done callback.
@@ -1486,7 +1486,7 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   // Register the callback. The async chain continues inside
   // queueWorkDoneCallback.
 wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
-  
+
   return promise->get_future();
 }
 
@@ -1562,11 +1562,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
  * @endcode
  */
 template <size_t N>
-inline std::future<void>
-toCPUAsync(Context &ctx, Tensor &tensor, std::array<float, N> &data,
-           size_t sourceOffset = 0) {
-  return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset
-  );
+inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
+                                    std::array<float, N> &data,
+                                    size_t sourceOffset = 0) {
+  return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset);
 }
 
 /**
@@ -1589,8 +1588,7 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
  */
 inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
                   size_t sourceOffset = 0) {
-  auto future =
-      toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset);
+  auto future = toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset);
   wait(ctx, future);
 }
 

From 752a53a3d426fb5bb87a89f31b601817adea25c7 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sat, 22 Feb 2025 18:38:09 -0600
Subject: [PATCH 14/17] add stress test

---
 test/test_gpu.cpp | 78 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 69 insertions(+), 9 deletions(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index 48aa1bc..99a1af6 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -1,12 +1,34 @@
 #include "gpu.hpp"
 #include <array>
 #include <cassert>
+#include <chrono>
 #include <cstring>
 #include <future>
 #include <memory>
 #include <vector>
 
 using namespace gpu;
+using namespace std::chrono;
+
+
+// Forward declarations:
+void testToCPUWithTensor();
+void testToCPUWithBuffer();
+void testToCPUWithTensorSourceOffset();
+void testToCPUWithBufferSourceOffset();
+void stressTestToCPU();
+
+int main() {
+  LOG(kDefLog, kInfo, "Running GPU integration tests...");
+  testToCPUWithTensor();
+  testToCPUWithBuffer();
+  testToCPUWithTensorSourceOffset();
+  testToCPUWithBufferSourceOffset();
+  stressTestToCPU();
+  LOG(kDefLog, kInfo, "All tests passed.");
+  return 0;
+}
+
 
 // A simple WGSL copy kernel that copies input to output.
 static const char *kCopyKernel = R"(
@@ -22,6 +44,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
 }
 )";
 
+
 // Test using the overload that takes a Tensor.
 void testToCPUWithTensor() {
@@ -185,12 +208,49 @@ void testToCPUWithBufferSourceOffset() {
   LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed.");
 }
 
-int main() {
-  LOG(kDefLog, kInfo, "Running GPU integration tests...");
-  testToCPUWithTensor();
-  testToCPUWithBuffer();
-  testToCPUWithTensorSourceOffset();
-  testToCPUWithBufferSourceOffset();
-  LOG(kDefLog, kInfo, "All tests passed.");
-  return 0;
-}
+void stressTestToCPU() {
+  LOG(kDefLog, kInfo, "Running stressTestToCPU for 2 seconds...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  // Create a persistent tensor with some test data.
+  std::vector<float> inputData(N, 0.0f);
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<float>(i);
+  }
+  Tensor tensor = createTensor(ctx, Shape{N}, kf32, inputData.data());
+
+  // Prepare to run for two seconds.
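+  // Each iteration reads back the full tensor: 1024 floats = 4096 bytes per
+  // operation, so the throughput figure below counts completed 4 KiB
+  // readbacks per second.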
+  auto startTime = high_resolution_clock::now();
+  std::vector<std::future<void>> futures;
+  size_t opCount = 0;
+  while (high_resolution_clock::now() - startTime < seconds(2)) {
+    // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes)
+    auto outputData = std::make_shared<std::vector<float>>(N, 0.0f);
+    // Use the tensor overload; we’re copying the entire tensor (sourceOffset = 0)
+    LOG(kDefLog, kInfo, "Copying %zu bytes from GPU to CPU...", N * sizeof(float));
+    // log count
+    LOG(kDefLog, kInfo, "opCount = %zu", opCount);
+    auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0);
+    futures.push_back(std::move(fut));
+    ++opCount;
+  }
+
+  // Wait for all submitted operations to complete.
+  for (auto &f : futures) {
+    wait(ctx, f);
+  }
+
+  auto endTime = high_resolution_clock::now();
+  auto totalMs = duration_cast<milliseconds>(endTime - startTime).count();
+  double throughput = (opCount / (totalMs / 1000.0));
+
+  LOG(kDefLog, kInfo, "Stress test completed:\n"
+      " %zu GPU to CPU operations in %lld ms\n"
+      " Throughput: %.2f ops/sec", opCount, totalMs, throughput);
+}
\ No newline at end of file

From 5f82ff4d9e0fdd1de7f2ccf8e0a0a6d8e981b2fb Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sat, 22 Feb 2025 19:07:20 -0600
Subject: [PATCH 15/17] linux has a segfault if wait for events after.

---
 test/test_gpu.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index 99a1af6..aa42b83 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -227,7 +227,6 @@ void stressTestToCPU() {
 
   // Prepare to run for two seconds.
   auto startTime = high_resolution_clock::now();
-  std::vector<std::future<void>> futures;
   size_t opCount = 0;
   while (high_resolution_clock::now() - startTime < seconds(2)) {
     // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes)
@@ -237,14 +236,9 @@ void stressTestToCPU() {
     // log count
     LOG(kDefLog, kInfo, "opCount = %zu", opCount);
     auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0);
-    futures.push_back(std::move(fut));
+    wait(ctx, fut);
     ++opCount;
   }
-
-  // Wait for all submitted operations to complete.
-  for (auto &f : futures) {
-    wait(ctx, f);
-  }
 
   auto endTime = high_resolution_clock::now();
   auto totalMs = duration_cast<milliseconds>(endTime - startTime).count();

From 28dabf277eebb9fb5541870014287a9d7f533036 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sun, 23 Feb 2025 10:22:27 -0600
Subject: [PATCH 16/17] EOF newline

---
 test/test_gpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index aa42b83..b855712 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -247,4 +247,4 @@ void stressTestToCPU() {
   LOG(kDefLog, kInfo, "Stress test completed:\n"
       " %zu GPU to CPU operations in %lld ms\n"
       " Throughput: %.2f ops/sec", opCount, totalMs, throughput);
-}
\ No newline at end of file
+}

From 39c816ca6b4ba0dff8808b680e0cf8f7b36973d4 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sat, 1 Mar 2025 17:34:42 -0600
Subject: [PATCH 17/17] added sleeptime optional arg

---
 gpu.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu.hpp b/gpu.hpp
index b057514..69ed0e9 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -869,7 +869,7 @@ template <typename T> T wait(Context &ctx, std::future<T> &f) {
  * Context ctx = waitForContextFuture(contextFuture);
  * @endcode
  */
-template <typename T> T waitForContextFuture(std::future<T> &f) {
+template <typename T> T waitForContextFuture(std::future<T> &f, size_t sleepTime = 10) {
 #ifdef __EMSCRIPTEN__
   while (f.wait_for(std::chrono::milliseconds(0)) !=
          std::future_status::ready) {
@@ -879,7 +879,7 @@ template <typename T> T waitForContextFuture(std::future<T> &f) {
 #else
   while (f.wait_for(std::chrono::milliseconds(0)) !=
          std::future_status::ready) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+    std::this_thread::sleep_for(std::chrono::milliseconds(sleepTime));
  }
   return f.get();
 #endif
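
For reference, a minimal end-to-end sketch of the async API this series converges on. This is an illustration under assumptions, not code from the patches: `createContextByGpuIdxAsync` may be gated behind `USE_DAWN_API` depending on the build, and the WGSL copy shader, buffer sizes, and the 1 ms poll interval are all stand-ins.

```cpp
#include "gpu.hpp"
#include <array>
#include <future>

using namespace gpu;

// Same shape as the test kernel; {{precision}} and {{workgroupSize}} are
// substituted by createKernel from the KernelCode metadata.
static const char *kCopy = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  if (gid.x < arrayLength(&inp)) {
    out[gid.x] = inp[gid.x];
  }
}
)";

int main() {
  // Poll the context future every 1 ms instead of the default 10 ms.
  std::future<Context> ctxFuture = createContextByGpuIdxAsync(0);
  Context ctx = waitForContextFuture(ctxFuture, /*sleepTime=*/1);

  constexpr size_t N = 256;
  std::array<float, N> host{};
  Tensor input = createTensor(ctx, Shape{N}, kf32, host.data());
  Tensor output = createTensor(ctx, Shape{N}, kf32);

  Kernel k = createKernel(ctx, {kCopy, 256, kf32}, Bindings{input, output},
                          {cdiv(N, 256), 1, 1});
  dispatchKernel(ctx, k);

  // Read back asynchronously; sourceOffset defaults to 0.
  auto copied = toCPUAsync(ctx, output, host.data(), sizeof(host));
  wait(ctx, copied);
  return 0;
}
```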