From 9ac780bef1c6813f43855f7d9d7d33a733876c45 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Wed, 19 Feb 2025 16:30:50 -0600
Subject: [PATCH 01/17] refactors async

---
 cmake/templates/index.html.in |   2 +-
 examples/hello_world/run.cpp  |  10 +-
 gpu.hpp                       | 838 ++++++++++++++++++++++------------
 numeric_types/half.cpp        |  15 +-
 4 files changed, 550 insertions(+), 315 deletions(-)

diff --git a/cmake/templates/index.html.in b/cmake/templates/index.html.in
index b6f130c..6b5957b 100644
--- a/cmake/templates/index.html.in
+++ b/cmake/templates/index.html.in
@@ -12,7 +12,7 @@
     if (typeof Module !== 'undefined') {
       Module.onRuntimeInitialized = function() {
         // Optionally, pass arguments to main in an array.
-        Module._main([]);
+        Module.ccall('main', 'number', [], [], { async: true });
       };
     } else {
       console.error('Module is undefined. Check that your generated JS file is loaded properly.');
diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp
index 7453869..06970a7 100644
--- a/examples/hello_world/run.cpp
+++ b/examples/hello_world/run.cpp
@@ -38,12 +38,14 @@ int main(int argc, char **argv) {
   Tensor output = createTensor(ctx, Shape{N}, kf32);
   std::promise<void> promise;
   std::future<void> future = promise.get_future();
-  Kernel op = createKernel(ctx, {kGelu, 256, kf32},
+  std::future<Kernel> kernelFuture = createKernel(ctx, {kGelu, 256, kf32},
                            Bindings{input, output},
                            {cdiv(N, 256), 1, 1});
-  dispatchKernel(ctx, op, promise);
-  wait(ctx, future);
-  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+  Kernel op = waitForFuture(ctx.instance, kernelFuture);
+  std::future<void> dispatchFuture = dispatchKernel(ctx, op);
+  waitForFuture(ctx.instance, dispatchFuture);
+  std::future<void> cpuFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+  waitForFuture(ctx.instance, cpuFuture);
   for (int i = 0; i < 12; ++i) {
     printf("  gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]);
   }
diff --git a/gpu.hpp b/gpu.hpp
index edc8b38..052c674 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -1,6 +1,7 @@
 #ifndef GPU_HPP
 #define GPU_HPP
 
+#include "webgpu.h"
 #include <array>
 #include <cassert>
 #include <cstring>
 #include <future>
 #include <initializer_list>
 #include <memory>
 #include <set>
 #include <string>
@@ -15,15 +16,15 @@
 #include <utility> // std::pair
 #include <vector>
 
-#include "webgpu.h"
+#ifndef __EMSCRIPTEN__
 
-#include "numeric_types/half.hpp"
-#include "utils/logging.hpp"
-
-#ifdef __EMSCRIPTEN__
+#else
 #include "emscripten/emscripten.h"
 #endif
 
+#include "numeric_types/half.hpp"
+#include "utils/logging.hpp"
+
 #ifdef USE_DAWN_API
 #include "dawn/native/DawnNative.h"
 #endif
@@ -430,8 +431,8 @@ struct CallbackData {
   WGPUBuffer buffer; // managed by owning Kernel
   size_t bufferSize;
   void *output; // non-owning, only for target memory in toCPU, not used for
-                // kernel invocations
-  std::promise<void> *promise;
+                // kernel invocations
+  std::shared_ptr<std::promise<void>> promise;
   std::future<void> *future;
 };
@@ -530,32 +531,27 @@ struct Context {
   // Default constructor
   Context() = default;
 
-  Context(Context&& other) noexcept
-    : instance(other.instance),
-      adapter(other.adapter),
-      device(other.device),
+  Context(Context &&other) noexcept
+      : instance(other.instance), adapter(other.adapter), device(other.device),
         queue(other.queue),
         // Re‐initialize pools to point to *this*:
-      pool(this),
-      kernelPool(this),
-      adapterStatus(other.adapterStatus),
-      deviceStatus(other.deviceStatus)
-  {
+        pool(this), kernelPool(this), adapterStatus(other.adapterStatus),
+        deviceStatus(other.deviceStatus) {
     LOG(kDefLog, kTrace, "Moving Context ownership");
 
     // Move over the resources in the pools:
-    pool.data = std::move(other.pool.data);
+    pool.data = std::move(other.pool.data);
    kernelPool.data = std::move(other.kernelPool.data);
 
     // Null out handles in the source so its destructor won't release them.
     other.instance = nullptr;
-    other.adapter = nullptr;
-    other.device = nullptr;
-    other.queue = nullptr;
+    other.adapter = nullptr;
+    other.device = nullptr;
+    other.queue = nullptr;
 
     // other.adapterStatus = 0;
     // other.deviceStatus = 0;
   }
 
-  Context& operator=(Context&& other) noexcept {
+  Context &operator=(Context &&other) noexcept {
     if (this != &other) {
       // Free any existing resources. In most cases, this should be a no-op
       // since we typically shouldn't have two active initialized Context
@@ -625,7 +621,7 @@ inline Tensor createTensor(TensorPool &pool, WGPUDevice &device,
   size_t numElements = size(shape);
   size_t size = sizeBytes(dtype) * numElements;
   WGPUBufferDescriptor bufferDesc = {
-      .label = {.data = nullptr, .length = 0},
+      .label = {.data = nullptr, .length = 0},
       .usage = usage,
       .size = size,
   };
@@ -794,6 +790,162 @@ inline void check(bool condition, const char *message,
   }
 }
 
+/**
+ * @brief Pumps events until the provided future is ready.
+ *
+ * This helper template function continuously checks the status of the provided std::future<T>
+ * until it becomes ready. On Emscripten builds, it yields control to the JavaScript event loop
+ * using emscripten_sleep to allow asynchronous callbacks to execute. On other platforms, it
+ * processes events from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future
+ * is ready, its value is returned.
+ *
+ * @tparam T The type of the value contained in the future.
+ * @param instance The WGPUInstance used to process events.
+ * @param f The future to wait on.
+ * @return T The value retrieved from the ready future.
+ *
+ * @code
+ * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter, devDescriptor);
+ * WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * @endcode
+ */
+template <typename T>
+T waitForFuture(WGPUInstance instance, std::future<T> &f) {
+#ifdef __EMSCRIPTEN__
+  // Poll until the future is ready.
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    // Yield control to the JS event loop.
+    emscripten_sleep(1);
+  }
+  return f.get();
+#else
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    wgpuInstanceProcessEvents(instance);
+  }
+  return f.get();
+#endif
+}
+
+// Context Callbacks & Helpers
+
+/**
+ * @brief Adapter callback function invoked upon completion of an asynchronous WebGPU adapter request.
+ *
+ * This callback is triggered when the request for a WebGPU adapter completes. It verifies whether
+ * the adapter was successfully obtained. On failure, it logs an error message (in Emscripten builds)
+ * and sets an exception on the associated promise. On success, it sets the value of the promise with
+ * the obtained adapter. Finally, it frees the allocated memory for the promise pointer.
+ *
+ * @param status The status of the adapter request. Expected to be WGPURequestAdapterStatus_Success on success.
+ * @param adapter The WGPUAdapter obtained on a successful request.
+ * @param message A string view containing additional information about the adapter request.
+ * @param userdata1 A pointer to a heap-allocated std::shared_ptr<std::promise<WGPUAdapter>>.
+ * @param userdata2 Unused.
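+ *
+ * @code
+ * // Minimal wiring sketch; this mirrors requestAdapterAsync() below, which
+ * // owns the heap-allocated shared_ptr handed over as userdata1:
+ * auto promise = std::make_shared<std::promise<WGPUAdapter>>();
+ * auto *promisePtr = new std::shared_ptr<std::promise<WGPUAdapter>>(promise);
+ * WGPURequestAdapterCallbackInfo callbackInfo{
+ *     .mode = WGPUCallbackMode_AllowSpontaneous,
+ *     .callback = adapterCallback,
+ *     .userdata1 = promisePtr,
+ *     .userdata2 = nullptr};
+ * @endcode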
+ */
+inline void adapterCallback(WGPURequestAdapterStatus status,
+                            WGPUAdapter adapter, WGPUStringView message,
+                            void *userdata1, void * /*userdata2*/) {
+  auto *promisePtr =
+      reinterpret_cast<std::shared_ptr<std::promise<WGPUAdapter>> *>(userdata1);
+  if (status != WGPURequestAdapterStatus_Success) {
+#ifdef __EMSCRIPTEN__
+    LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s",
+        static_cast<int>(message.length), message.data);
+#endif
+    (*promisePtr)
+        ->set_exception(std::make_exception_ptr(
+            std::runtime_error("Request WebGPU adapter failed")));
+  } else {
+    (*promisePtr)->set_value(adapter);
+  }
+  delete promisePtr;
+}
+
+/**
+ * @brief Callback function invoked upon completion of an asynchronous WebGPU device request.
+ *
+ * This callback is triggered when the request for a WebGPU device completes. It verifies that
+ * the device was successfully created. On success, the callback sets the value of the associated
+ * promise; otherwise, it sets an exception. After fulfilling the promise, it frees the allocated
+ * memory for the promise pointer.
+ *
+ * @param status The status of the device request. Expected to be WGPURequestDeviceStatus_Success on success.
+ * @param device The WGPUDevice obtained on successful request.
+ * @param message A string view containing additional information about the device request.
+ * @param userdata1 A pointer to a heap-allocated std::shared_ptr<std::promise<WGPUDevice>>.
+ * @param userdata2 Unused.
+ */
+inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device,
+                           WGPUStringView message, void *userdata1,
+                           void * /*userdata2*/) {
+  auto *promisePtr =
+      reinterpret_cast<std::shared_ptr<std::promise<WGPUDevice>> *>(userdata1);
+  if (status != WGPURequestDeviceStatus_Success) {
+    (*promisePtr)
+        ->set_exception(std::make_exception_ptr(
+            std::runtime_error("Request WebGPU device failed")));
+  } else {
+    LOG(kDefLog, kTrace, "Device Request succeeded %p",
+        static_cast<void *>(device));
+    (*promisePtr)->set_value(device);
+  }
+  delete promisePtr;
+}
+
+/**
+ * @brief Asynchronously requests a WebGPU adapter from the given instance.
+ *
+ * This helper function wraps the asynchronous call to request an adapter using the WebGPU API.
+ * It sets up a promise and registers an adapter callback, returning a future that will eventually
+ * hold the requested WGPUAdapter.
+ *
+ * @param instance The WGPUInstance from which to request the adapter.
+ * @param adapterOpts The options for requesting the adapter.
+ * @return std::future<WGPUAdapter> A future that will eventually hold the created WGPUAdapter.
+ */
+inline std::future<WGPUAdapter>
+requestAdapterAsync(WGPUInstance instance,
+                    const WGPURequestAdapterOptions &adapterOpts) {
+  auto promise = std::make_shared<std::promise<WGPUAdapter>>();
+  auto *promisePtr = new std::shared_ptr<std::promise<WGPUAdapter>>(promise);
+
+  WGPURequestAdapterCallbackInfo callbackInfo{
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = adapterCallback,
+      .userdata1 = promisePtr,
+      .userdata2 = nullptr};
+  wgpuInstanceRequestAdapter(instance, &adapterOpts, callbackInfo);
+  return promise->get_future();
+}
+
+/**
+ * @brief Asynchronously requests a WebGPU device from a given adapter.
+ *
+ * This helper function wraps the asynchronous call to request a device using the WebGPU API.
+ * It sets up a promise and registers a device callback, returning a future that will be fulfilled
+ * once the device is available.
+ *
+ * @param adapter The WGPUAdapter to request the device from.
+ * @param devDescriptor The descriptor specifying the characteristics of the requested device.
+ * @return std::future<WGPUDevice> A future that will eventually hold the created WGPUDevice.
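+ *
+ * @code
+ * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter, devDescriptor);
+ * WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * @endcode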
+ */
+inline std::future<WGPUDevice>
+requestDeviceAsync(WGPUAdapter adapter,
+                   const WGPUDeviceDescriptor &devDescriptor) {
+  auto promise = std::make_shared<std::promise<WGPUDevice>>();
+  auto *promisePtr = new std::shared_ptr<std::promise<WGPUDevice>>(promise);
+
+  WGPURequestDeviceCallbackInfo deviceCallbackInfo{
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = deviceCallback,
+      .userdata1 = promisePtr,
+      .userdata2 = nullptr};
+  wgpuAdapterRequestDevice(adapter, &devDescriptor, deviceCallbackInfo);
+  return promise->get_future();
+}
+
 /**
  * @brief Factory function to create a GPU context, which aggregates WebGPU API
  * handles to interact with the GPU including the instance, adapter, device, and
@@ -812,12 +964,10 @@ inline void check(bool condition, const char *message,
  * @return Context instance representing the created GPU context
  *
  */
-inline Context createContext(
-    const WGPUInstanceDescriptor &desc = {},
-    const WGPURequestAdapterOptions &adapterOpts = {},
-    const WGPUDeviceDescriptor &devDescriptor = {})
-{
-  Context ctx; // stack-allocated
+inline Context createContext(const WGPUInstanceDescriptor &desc = {},
+                             const WGPURequestAdapterOptions &adapterOpts = {},
+                             const WGPUDeviceDescriptor &devDescriptor = {}) {
+  Context ctx; // Stack-allocated Context.
 
 #ifdef __EMSCRIPTEN__
   ctx.instance = wgpuCreateInstance(nullptr);
@@ -826,115 +976,50 @@ inline Context createContext(
 #endif
   check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__);
 
+  // Request the adapter asynchronously.
   LOG(kDefLog, kTrace, "Requesting adapter");
-  {
-    struct AdapterData {
-      WGPUAdapter adapter = nullptr;
-      bool requestEnded = false;
-      WGPURequestAdapterStatus status;
-    };
-    AdapterData adapterData;
-
-    auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status,
-                                    WGPUAdapter adapter,
-                                    WGPUStringView message,
-                                    void *pUserData, void *) {
-      auto &ad = *reinterpret_cast<AdapterData *>(pUserData);
-      ad.status = status;
-#ifdef __EMSCRIPTEN__
-      if (status != WGPURequestAdapterStatus_Success) {
-        LOG(kDefLog, kError, "Could not get WebGPU adapter: %.*s",
-            static_cast<int>(message.length), message.data);
-      }
-#endif
-      check(status == WGPURequestAdapterStatus_Success,
-            "Request WebGPU adapter", __FILE__, __LINE__);
-      ad.adapter = adapter;
-      ad.requestEnded = true;
-    };
-
-    WGPURequestAdapterCallbackInfo callbackInfo {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback = onAdapterRequestEnded,
-      .userdata1 = &adapterData,
-      .userdata2 = nullptr
-    };
-    wgpuInstanceRequestAdapter(ctx.instance, &adapterOpts, callbackInfo);
-
-    while (!adapterData.requestEnded) {
-      processEvents(ctx.instance);
-    }
-    ctx.adapter = adapterData.adapter;
-    ctx.adapterStatus = adapterData.status;
+  try {
+    auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts);
+    // Pump events until the adapter future is ready.
+    ctx.adapter = waitForFuture(ctx.instance, adapterFuture);
+    ctx.adapterStatus = WGPURequestAdapterStatus_Success;
+  } catch (const std::exception &ex) {
+    check(false, ex.what(), __FILE__, __LINE__);
   }
 
+  // Request the device asynchronously.
LOG(kDefLog, kTrace, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - WGPURequestDeviceStatus status; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, - WGPUStringView message, - void *pUserData, void *) { - auto &dd = *reinterpret_cast(pUserData); - dd.status = status; - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %p", - static_cast(device)); - dd.device = device; - dd.requestEnded= true; - }; - - WGPURequestDeviceCallbackInfo deviceCallbackInfo { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1= &devData, - .userdata2= nullptr - }; - wgpuAdapterRequestDevice(ctx.adapter, &devDescriptor, deviceCallbackInfo); - - LOG(kDefLog, kTrace, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(ctx.instance); - } + try { + auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + // Pump events until the device future is ready. + ctx.device = waitForFuture(ctx.instance, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; LOG(kDefLog, kTrace, "Device request ended"); - ctx.device = devData.device; - ctx.deviceStatus = devData.status; - - // If the device was created, set up logging and fetch the queue - if (devData.status == WGPURequestDeviceStatus_Success) { - #ifndef __EMSCRIPTEN__ - WGPULoggingCallbackInfo loggingCallbackInfo { + // If the device was created, set up logging and fetch the queue. +#ifndef __EMSCRIPTEN__ + WGPULoggingCallbackInfo loggingCallbackInfo{ .nextInChain = nullptr, .callback = - [](WGPULoggingType type, WGPUStringView message, - void *, void *) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, + [](WGPULoggingType type, WGPUStringView message, void *, void *) { + LOG(kDefLog, kError, "Device logging callback: %.*s", + static_cast(message.length), message.data); + if (type == WGPULoggingType_Error) { + throw std::runtime_error("Device error logged."); + } + }, .userdata1 = nullptr, - .userdata2 = nullptr - }; - wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); - #endif - ctx.queue = wgpuDeviceGetQueue(ctx.device); - } + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); +#endif + ctx.queue = wgpuDeviceGetQueue(ctx.device); + } catch (const std::exception &ex) { + check(false, ex.what(), __FILE__, __LINE__); } return std::move(ctx); } - #ifdef USE_DAWN_API /** * @brief Factory function to create a GPU context, which aggregates WebGPU API @@ -1066,11 +1151,76 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, } #endif -inline void wait(Context &ctx, std::future &future) { - while (future.wait_for(std::chrono::seconds(0)) != - std::future_status::ready) { - processEvents(ctx.instance); - } +/** + * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer is completed. 
+/**
+ * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping.
+ *
+ * This callback is triggered when the GPU buffer mapping for a readback buffer is completed.
+ * It verifies that the mapping operation was successful, retrieves the mapped memory,
+ * copies the data from the GPU buffer to a CPU memory region, unmaps the buffer,
+ * signals the completion by fulfilling the associated promise, and cleans up the allocated callback data.
+ *
+ * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success on success.
+ * @param message A string view containing additional information about the mapping operation.
+ * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the GPU buffer,
+ *        buffer size, destination CPU memory pointer, and a promise for signaling completion.
+ * @param userdata2 Unused.
+ */
+inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message,
+                              void *userdata1, void * /*userdata2*/) {
+  CallbackData *cbData = reinterpret_cast<CallbackData *>(userdata1);
+  // Check that mapping succeeded.
+  check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__,
+        __LINE__);
+
+  // Get the mapped memory.
+  const void *mappedData =
+      wgpuBufferGetConstMappedRange(cbData->buffer, 0, cbData->bufferSize);
+  check(mappedData, "Get mapped range", __FILE__, __LINE__);
+
+  // Copy the data from the mapped GPU buffer to the CPU memory.
+  memcpy(cbData->output, mappedData, cbData->bufferSize);
+
+  // Unmap the buffer.
+  wgpuBufferUnmap(cbData->buffer);
+
+  // Signal that the copy has completed.
+  // Ensure you use the arrow operator on the shared_ptr to call set_value().
+  cbData->promise->set_value();
+
+  // Clean up the dynamically allocated callback data.
+  delete cbData;
+}
+
+/**
+ * @brief Callback function invoked when the GPU queue’s submitted work is complete.
+ *
+ * This callback is registered with the GPU queue after submitting work. When invoked,
+ * it verifies that all queued work completed successfully, and then sets up the buffer
+ * mapping callback to initiate the asynchronous mapping of a readback buffer. The readback
+ * buffer is mapped to access the processed data on the CPU.
+ *
+ * @param status The status of the completed work. Expected to be WGPUQueueWorkDoneStatus_Success on success.
+ * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the readback buffer,
+ *        buffer size, destination CPU memory pointer, and a promise to signal completion.
+ * @param userdata2 Unused.
+ */
+inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
+                                  void *userdata1, void * /*userdata2*/) {
+  CallbackData *cbData = reinterpret_cast<CallbackData *>(userdata1);
+  // Ensure the queue work finished successfully.
+  check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__,
+        __LINE__);
+
+  // Set up the buffer mapping callback information.
+  WGPUBufferMapCallbackInfo mapCallbackInfo;
+  mapCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+  mapCallbackInfo.callback = bufferMapCallback;
+  mapCallbackInfo.userdata1 = cbData;
+  mapCallbackInfo.userdata2 = nullptr;
+
+  // Begin the asynchronous mapping of the readback buffer.
+  wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize,
+                     mapCallbackInfo);
 }
 
 /**
@@ -1085,45 +1235,35 @@ inline void wait(Context &ctx, std::future<void> &future) {
  *
  * @code
 * toCPU(ctx, tensor, data, bufferSize);
 * @endcode
 */
-inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
-                  CopyData &op) {
+inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
+                               size_t bufferSize, CopyData &op) {
+  // Submit the command buffer and release it.
   wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
   wgpuCommandBufferRelease(op.commandBuffer);
-  CallbackData callbackData = {op.readbackBuffer, bufferSize, data, &op.promise,
-                               &op.future};
 
-  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback =
-          [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) {
-            check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
-                  __FILE__, __LINE__);
-            const auto *data = static_cast<CallbackData *>(userdata1);
-            WGPUBufferMapCallbackInfo mapCallbackInfo = {
-                .mode = WGPUCallbackMode_AllowSpontaneous,
-                .callback =
-                    [](WGPUMapAsyncStatus status, WGPUStringView message,
-                       void *userdata1, void *userdata2) {
-                      const auto *data = static_cast<CallbackData *>(userdata1);
-                      check(status == WGPUMapAsyncStatus_Success,
-                            "Map readbackBuffer", __FILE__, __LINE__);
-                      const void *mappedData = wgpuBufferGetConstMappedRange(
-                          data->buffer, /*offset=*/0, data->bufferSize);
-                      check(mappedData, "Get mapped range", __FILE__, __LINE__);
-                      memcpy(data->output, mappedData, data->bufferSize);
-                      wgpuBufferUnmap(data->buffer);
-                      data->promise->set_value();
-                    },
-                .userdata1 = const_cast<CallbackData *>(data),
-                .userdata2 = nullptr};
-            wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0,
-                               data->bufferSize, mapCallbackInfo);
-          },
-      .userdata1 = &callbackData,
-      .userdata2 = nullptr};
+  // Create a promise and get its future.
+  auto promise = std::make_shared<std::promise<void>>();
+
+  // Allocate callback data so it remains valid until the async
+  // chain finishes.
+  CallbackData *cbData = new CallbackData{
+      op.readbackBuffer, // The GPU buffer to be read back.
+      bufferSize,
+      data,   // CPU memory destination.
+      promise // The promise to be signaled.
+  };
+
+  // Set up the work-done callback to initiate the buffer mapping.
+  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo;
+  workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+  workDoneCallbackInfo.callback = queueWorkDoneCallback;
+  workDoneCallbackInfo.userdata1 = cbData; // Pass the callback data.
+  workDoneCallbackInfo.userdata2 = nullptr;
+
+  // Begin the asynchronous chain by registering the queue work-done callback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
-  wait(ctx, op.future);
+  return promise->get_future();
 }
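+// Usage sketch for the CopyData overload above (the caller prepares `op`
+// with a readback buffer and a recorded command buffer, as the convenience
+// overload below does):
+//   std::future<void> copyFuture = toCPU(ctx, tensor, data, bufferSize, op);
+//   waitForFuture(ctx.instance, copyFuture);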
 
 /**
@@ -1141,31 +1281,59 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
  * @param[in] bufferSize Size of the data buffer in bytes
  * @param[out] data Pointer to the CPU memory to copy the data to
  */
-inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
-  CopyData op;
-  op.future = op.promise.get_future();
-  {
-    WGPUBufferDescriptor readbackBufferDescriptor = {
-        .label = {.data = nullptr, .length = 0},
-        .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
-        .size = bufferSize,
-    };
-    op.readbackBuffer =
-        wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
-  }
-  {
-    WGPUCommandEncoder commandEncoder;
-    commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
-    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
-                                         op.readbackBuffer, 0, bufferSize);
-    op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
-    wgpuCommandEncoderRelease(commandEncoder);
-    check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__);
-  }
-  toCPU(ctx, tensor, data, bufferSize, op);
-  if (op.readbackBuffer) {
-    wgpuBufferRelease(op.readbackBuffer);
-  }
+inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
+                               size_t bufferSize) {
+  // Create a promise that will later be satisfied when the async copy
+  // completes.
+  auto promise = std::make_shared<std::promise<void>>();
+
+  // Create a readback buffer that will be used for copying and mapping.
+  WGPUBufferDescriptor readbackBufferDescriptor = {
+      .label = {.data = nullptr, .length = 0},
+      .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
+      .size = bufferSize,
+  };
+  WGPUBuffer readbackBuffer =
+      wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
+
+  // Create a command encoder and record a copy from the tensor GPU buffer
+  WGPUCommandEncoder commandEncoder =
+      wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
+  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
+                                       readbackBuffer, 0, bufferSize);
+  // Finish recording by creating a command buffer and release the encoder.
+  WGPUCommandBuffer commandBuffer =
+      wgpuCommandEncoderFinish(commandEncoder, nullptr);
+  wgpuCommandEncoderRelease(commandEncoder);
+  check(commandBuffer, "Create command buffer", __FILE__, __LINE__);
+
+  // Submit the work to the queue and release the command buffer immediately.
+  wgpuQueueSubmit(ctx.queue, 1, &commandBuffer);
+  wgpuCommandBufferRelease(commandBuffer);
+
+  // Allocate callback data
+  CallbackData *cbData = new CallbackData{
+      readbackBuffer, // The readback buffer to map.
+      bufferSize,     // The size of the copy.
+      data,           // CPU memory destination.
+      promise         // The promise to signal when done.
+  };
+
+  // Set up the work-done callback. When the queue’s submitted work is
+  // completed, it is routed to queueWorkDoneCallback which then starts the
+  // asynchronous map.
+  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = queueWorkDoneCallback,
+      .userdata1 = cbData,
+      .userdata2 = nullptr,
+  };
+
+  // Register the callback. The async chain continues inside
+  // queueWorkDoneCallback.
+  wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
+
+  return promise->get_future();
 }
 
 /**
@@ -1176,76 +1344,74 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize)
  * @param[out] data Array of floats to copy the data to
  *
  * @code
- * toCPU(ctx, tensor, data);
+ * std::future<void> toCPUFuture = toCPU(ctx, tensor, data);
+ * waitForFuture(ctx.instance, toCPUFuture);
  * @endcode
  */
 template <size_t N>
-void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
-  toCPU(ctx, tensor, data.data(), sizeof(data));
+inline std::future<void> toCPU(Context &ctx, Tensor &tensor,
+                               std::array<float, N> &data) {
+  return toCPU(ctx, tensor, data.data(), sizeof(data));
 }
 
-inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
+inline std::future<void> toCPU(Context &ctx, WGPUBuffer buffer, void *data,
+                               size_t size) {
+  // The size (in bytes) for the copy.
   uint64_t bufferSize = size;
+
+  // Create an operation structure (here we reuse CopyData solely for its
+  // members that we need to create a readback buffer and command buffer).
   CopyData op;
-  op.future = op.promise.get_future();
+
+  // Create the promise that will be fulfilled once the copy is done.
+  auto promise = std::make_shared<std::promise<void>>();
+
+  // Create a readback buffer that we can map for reading.
   {
     WGPUBufferDescriptor readbackBufferDescriptor = {
-        .label = {.data = nullptr, .length = 0},
+        .label = {.data = nullptr, .length = 0},
        .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
         .size = bufferSize,
     };
     op.readbackBuffer =
        wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
   }
+
+  // Create a command encoder which copies from the provided buffer to the
+  // readback buffer.
   {
-    WGPUCommandEncoder commandEncoder;
-    commandEncoder = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
+    WGPUCommandEncoder commandEncoder =
+        wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
     wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0,
                                          op.readbackBuffer, 0, bufferSize);
     op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
     wgpuCommandEncoderRelease(commandEncoder);
     check(op.commandBuffer, "Create command buffer", __FILE__, __LINE__);
   }
+
+  // Submit the command and release the command buffer.
   wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
   wgpuCommandBufferRelease(op.commandBuffer);
 
-  CallbackData callbackData = {op.readbackBuffer, static_cast<size_t>(bufferSize), data, &op.promise,
-                               &op.future};
+  // Allocate callback data
+  CallbackData *cbData = new CallbackData{
+      op.readbackBuffer,               // The readback buffer created above.
+      static_cast<size_t>(bufferSize), // Size of the copy.
+      data,                            // Destination CPU memory.
+      promise                          // Our promise to satisfy when done.
+  };
+
+  // Set up the queue work-done callback info.
   WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
       .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback =
-          [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) {
-            check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
-                  __FILE__, __LINE__);
-            const auto *data = static_cast<CallbackData *>(userdata1);
-            WGPUBufferMapCallbackInfo mapCallbackInfo = {
-                .mode = WGPUCallbackMode_AllowSpontaneous,
-                .callback =
-                    [](WGPUMapAsyncStatus status, WGPUStringView message,
-                       void *userdata1, void *userdata2) {
-                      const auto *data = static_cast<CallbackData *>(userdata1);
-                      check(status == WGPUMapAsyncStatus_Success,
-                            "Map readbackBuffer", __FILE__, __LINE__);
-                      const void *mappedData = wgpuBufferGetConstMappedRange(
-                          data->buffer, /*offset=*/0, data->bufferSize);
-                      check(mappedData, "Get mapped range", __FILE__, __LINE__);
-                      memcpy(data->output, mappedData, data->bufferSize);
-                      wgpuBufferUnmap(data->buffer);
-                      data->promise->set_value();
-                    },
-                .userdata1 = const_cast<CallbackData *>(data),
-                .userdata2 = nullptr};
-            wgpuBufferMapAsync(data->buffer, WGPUMapMode_Read, 0,
-                               data->bufferSize, mapCallbackInfo);
-          },
-      .userdata1 = &callbackData,
+      .callback = queueWorkDoneCallback, // Our free function callback.
+      .userdata1 = cbData,               // Pass the callback data pointer.
       .userdata2 = nullptr};
+
+  // Start the asynchronous chain by registering the work-done callback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
-  wait(ctx, op.future);
 
-  if (op.readbackBuffer) {
-    wgpuBufferRelease(op.readbackBuffer);
-  }
+  return promise->get_future();
 }
 
 /**
@@ -1376,6 +1542,19 @@ inline Shape cdiv(Shape total, Shape group) {
   return result;
 }
 
+/**
+ * @brief Packages the shader compilation information along with a promise for asynchronous signaling.
+ *
+ * This structure holds a pointer to a CompilationInfo instance that collects
+ * details such as status, messages, line numbers, and positions from the shader compilation.
+ * It also contains a shared pointer to a std::promise<void> which is used to signal the completion
+ * of the asynchronous shader compilation process.
+ */
+struct CompData {
+  CompilationInfo *compInfo;
+  std::shared_ptr<std::promise<void>> compPromise;
+};
+
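+// CompData is handed to wgpuShaderModuleGetCompilationInfo as userdata; the
+// compilation callback registered inside createKernel (below) fills compInfo
+// and then signals compPromise.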
 /**
  * @brief A factory function to create a kernel on the GPU. The kernel is
  * created with the given WGSL code, input tensors, output tensor, and
@@ -1399,34 +1578,38 @@ inline Shape cdiv(Shape total, Shape group) {
  * @return Kernel instance representing the created kernel
  *
  * @code
- * Kernel kernel = createKernel(ctx, code, dataBindings, numInputs,
+ * std::future<Kernel> kernelFuture = createKernel(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize);
+ * Kernel kernel = waitForFuture(ctx.instance, kernelFuture);
  * @endcode
- * output, nThreads, params, paramsSize);
+
  */
-inline Kernel createKernel(Context& ctx, const KernelCode &code,
-                           const Tensor *dataBindings, size_t numTensors,
-                           const size_t *viewOffsets,
-                           const Shape &totalWorkgroups,
-                           const void *params = nullptr, size_t paramsSize = 0,
-                           CompilationInfo *compilationInfo = nullptr,
-                           const char *cacheKey = nullptr) {
+inline std::future<Kernel>
+createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings,
+             size_t numTensors, const size_t *viewOffsets,
+             const Shape &totalWorkgroups, const void *params = nullptr,
+             size_t paramsSize = 0, CompilationInfo *compilationInfo = nullptr,
+             const char *cacheKey = nullptr) {
   // Create a cache key by the pointer values of the data bindings and the
   // kernel code
   if (cacheKey != nullptr &&
      ctx.kernelPool.data.find(cacheKey) != ctx.kernelPool.data.end()) {
-    LOG(kDefLog, kInfo, "Kernel cache hit");
-    return ctx.kernelPool.data[cacheKey];
+    std::promise<Kernel> ready;
+    ready.set_value(ctx.kernelPool.data[cacheKey]);
+    return ready.get_future();
   }
 
+  // Create an outer promise for the new kernel.
+  std::promise<Kernel> outerPromise;
+  std::future<Kernel> outerFuture = outerPromise.get_future();
+
   assert(totalWorkgroups.rank == 3);
   WGPUDevice device = ctx.device;
   WGPUQueue queue = ctx.queue;
   Kernel op(new RawKernel());
 
-  // paramIndex is the index into bgLayoutEntries for the parameters buffer If
   // there are no parameters for the kernel, paramsSize == 0 and paramIndex is
   // effectively undefined (== -1)
-  size_t paramIndex = -1;
+  size_t paramIndex = static_cast<size_t>(-1); // Note: paramIndex is undefined unless paramsSize > 0
   size_t numBindings = numTensors;
   if (paramsSize > 0) {
     numBindings++; // parameters buffer is the last binding group element so
                    // add one to the binding count
                    // op.buffers, op.bufferSizes and
                    // bgLayoutEntries
   }
+
   op->buffers = std::make_unique<WGPUBuffer[]>(numBindings);
   op->bufferSizes = std::make_unique<size_t[]>(numBindings);
   op->numBindings = numBindings;
-  std::vector<WGPUBindGroupLayoutEntry> bgLayoutEntries(numBindings);
 
+  // Create layout entries for input buffers
+  std::vector<WGPUBindGroupLayoutEntry> bgLayoutEntries(numBindings);
   for (size_t i = 0; i < numTensors; ++i) {
     bgLayoutEntries[i] = WGPUBindGroupLayoutEntry{
         .binding = static_cast<uint32_t>(i),
         .visibility = WGPUShaderStage_Compute,
        .buffer =
            WGPUBufferBindingLayout{
                .type = WGPUBufferBindingType_Storage,
                .minBindingSize = dataBindings[i].data.size,
            },
     };
   }
   if (paramsSize > 0) {
-    LOG(kDefLog, kInfo, "Create layout entry for the params buffer");
-    // Create layout entry for the params buffer
     bgLayoutEntries[paramIndex] = WGPUBindGroupLayoutEntry{
         .binding = static_cast<uint32_t>(paramIndex),
         .visibility = WGPUShaderStage_Compute,
        .buffer =
            WGPUBufferBindingLayout{
                .type = WGPUBufferBindingType_Uniform,
                .minBindingSize = paramsSize,
            },
     };
   }
   WGPUBindGroupLayoutDescriptor bgLayoutDesc = {
       .entryCount = static_cast<size_t>(bgLayoutEntries.size()),
-      .entries = bgLayoutEntries.data(),
-  };
+      .entries = bgLayoutEntries.data()};
   WGPUBindGroupLayout bgLayout =
       wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc);
+
+  // Assign buffers from dataBindings.
   for (size_t i = 0; i < numTensors; ++i) {
     op->buffers[i] = dataBindings[i].data.buffer;
     op->bufferSizes[i] = dataBindings[i].data.size;
   }
   // Create a buffer for the Params struct
   if (paramsSize > 0) {
     WGPUBufferDescriptor paramsBufferDesc = {
-        .label = {.data = nullptr, .length = 0},
+        .label = {.data = nullptr, .length = 0},
         .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
         .size = paramsSize,
         .mappedAtCreation = false,
     };
    op->buffers[paramIndex] = wgpuDeviceCreateBuffer(device, &paramsBufferDesc);
    op->bufferSizes[paramIndex] = paramsSize;
    wgpuQueueWriteBuffer(queue, op->buffers[paramIndex], 0, params, paramsSize);
    LOG(kDefLog, kTrace, "Params buffer written");
   } else {
     LOG(kDefLog, kTrace, "No params buffer needed");
   }
+
+  // Build bind group entries and the bind group.
   std::vector<WGPUBindGroupEntry> bindGroupEntries(numBindings);
   for (size_t i = 0; i < numTensors; ++i) {
     bindGroupEntries[i] = WGPUBindGroupEntry{
         .binding = static_cast<uint32_t>(i),
         .buffer = op->buffers[i],
         .offset = viewOffsets[i],
         .size = op->bufferSizes[i],
     };
   }
   if (paramsSize > 0) {
     bindGroupEntries[paramIndex] = WGPUBindGroupEntry{
         .binding = static_cast<uint32_t>(paramIndex),
         .buffer = op->buffers[paramIndex],
         .offset = 0,
         .size = paramsSize,
     };
   }
   WGPUBindGroupDescriptor bindGroupDesc = {
       .layout = bgLayout,
       .entryCount = static_cast<size_t>(numBindings),
       .entries = bindGroupEntries.data(),
   };
   op->bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc);
 
+  // Create pipeline layout.
   WGPUPipelineLayoutDescriptor pipelineLayoutDesc = {
       .bindGroupLayoutCount = 1,
       .bindGroupLayouts = &bgLayout,
   };
   WGPUPipelineLayout pipelineLayout =
       wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc);
 
+  // Prepare the WGSL source and shader module descriptor.
   WGPUShaderSourceWGSL wgslDesc = {
       .chain = {.sType = WGPUSType_ShaderSourceWGSL},
       .code = {.data = code.data.c_str(), .length = code.data.length()}};
-
   WGPUShaderModuleDescriptor shaderModuleDesc = {};
   shaderModuleDesc.nextInChain = &wgslDesc.chain;
   shaderModuleDesc.label = {code.label.c_str(), code.label.length()};
 
-  WGPUComputePipelineDescriptor computePipelineDesc = {};
-  computePipelineDesc.layout = pipelineLayout;
-  computePipelineDesc.compute.module =
+  // Create the shader module.
+  WGPUShaderModule shaderModule =
       wgpuDeviceCreateShaderModule(device, &shaderModuleDesc);
 
+  // If compilation info is requested, register the callback immediately.
+  if (compilationInfo) {
+    auto compPromise = std::make_shared<std::promise<void>>();
+    std::future<void> compFuture = compPromise->get_future();
+    // Allocate helper data to pass to the callback.
+    auto *compData = new CompData{compilationInfo, compPromise};
+
+    auto compilationCallback = [](WGPUCompilationInfoRequestStatus status,
+                                  WGPUCompilationInfo const *info,
+                                  void *userdata1, void * /*userdata2*/) {
+      CompData *cd = reinterpret_cast<CompData *>(userdata1);
+      if (info && cd->compInfo) {
+        cd->compInfo->status = status;
+        for (uint32_t i = 0; i < info->messageCount; ++i) {
+          cd->compInfo->messages.push_back(
+              std::string(info->messages[i].message.data,
+                          info->messages[i].message.length));
+          cd->compInfo->lineNums.push_back(info->messages[i].lineNum);
+          cd->compInfo->linePos.push_back(info->messages[i].linePos);
+        }
+        cd->compInfo->finished = true;
+      }
+      cd->compPromise->set_value();
+      delete cd;
+    };
+
+    WGPUCompilationInfoCallbackInfo compilationCallbackInfo = {};
+    compilationCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+    compilationCallbackInfo.callback = compilationCallback;
+    compilationCallbackInfo.userdata1 = compData;
+    compilationCallbackInfo.userdata2 = nullptr;
+
+    // Register callback and then wait for the result.
+    wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo);
+    waitForFuture(ctx.instance, compFuture);
+  }
+
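+  // Note: the compilation-info future is awaited synchronously above so that
+  // any WGSL diagnostics are complete before pipeline creation proceeds.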
+  // Now create the compute pipeline using the shader module.
+  WGPUComputePipelineDescriptor computePipelineDesc = {};
+  computePipelineDesc.layout = pipelineLayout;
+  computePipelineDesc.compute.module = shaderModule;
   computePipelineDesc.compute.entryPoint = {code.entryPoint.c_str(),
                                             code.entryPoint.length()};
   computePipelineDesc.label = {code.label.c_str(), code.label.length()};
-
   op->computePipeline =
       wgpuDeviceCreateComputePipeline(device, &computePipelineDesc);
+
   op->totalWorkgroups = {totalWorkgroups[0], totalWorkgroups[1],
                          totalWorkgroups[2]};
+
   resetCommandBuffer(device, op);
   if (cacheKey != nullptr)
     ctx.kernelPool.data[cacheKey] = op;
 
-  auto compilationInfoCallback = [](WGPUCompilationInfoRequestStatus status,
-                                    WGPUCompilationInfo const *compilationInfo,
-                                    void *userdata1, void *userdata2) {
-    CompilationInfo *result = static_cast<CompilationInfo *>(userdata1);
-    if (compilationInfo && result) {
-      result->status = status;
-      for (uint32_t i = 0; i < compilationInfo->messageCount; ++i) {
-        printf("Message %d: %.*s\n", i,
-               static_cast<int>(compilationInfo->messages[i].message.length),
-               compilationInfo->messages[i].message.data);
-        result->messages.push_back(
-            std::string(compilationInfo->messages[i].message.data,
-                        compilationInfo->messages[i].message.length));
-        result->lineNums.push_back(compilationInfo->messages[i].lineNum);
-        result->linePos.push_back(compilationInfo->messages[i].linePos);
-      }
-      result->finished = true;
-    } else {
-      LOG(kDefLog, kTrace, "No compilation info or result");
-    }
-  };
-
-  WGPUCompilationInfoCallbackInfo compilationCallbackInfo = {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback = compilationInfoCallback,
-      .userdata1 = static_cast<void *>(compilationInfo),
-      .userdata2 = nullptr};
 
-  while (compilationInfo && !compilationInfo->finished) {
-    processEvents(ctx.instance);
-  }
-  return op;
-}
+  outerPromise.set_value(op);
+  return outerFuture;
+}
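+
+// Dispatch chain: dispatchKernel() below heap-allocates a std::promise<void>
+// and hands it to this callback, which fulfills the promise and frees the
+// allocation.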
+
+/**
+ * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done.
+ *
+ * This callback is invoked when the GPU queue signals the completion of the submitted
+ * workload for a kernel dispatch. It receives the work-done status and a userdata pointer,
+ * which is expected to be a heap‑allocated pointer to a std::promise<void>.
+ *
+ * On success, the promise is fulfilled by calling set_value(). Otherwise, it is set with an exception.
+ * After setting the promise state, the allocated memory for the promise is freed.
+ *
+ * @param status The status of the work done. Expected to be WGPUQueueWorkDoneStatus_Success on success.
+ * @param userdata1 A heap allocated pointer to std::promise<void> which is set when the work is done.
+ * @param userdata2 Unused.
+ */
+inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status,
+                                   void *userdata1, void * /*userdata2*/) {
+  // Cast the userdata pointer back to our heap‑allocated promise.
+  auto *p = reinterpret_cast<std::promise<void> *>(userdata1);
+  if (status == WGPUQueueWorkDoneStatus_Success) {
+    p->set_value();
+  } else {
+    p->set_exception(std::make_exception_ptr(
+        std::runtime_error("Queue work did not complete successfully.")));
+  }
+  delete p; // free the heap allocation
+}
 
 /**
@@ -1599,17 +1824,17 @@ inline Kernel createKernel(Context &ctx, const KernelCode &code,
  * @return Kernel instance representing the created kernel
  *
  * @code
- * Kernel kernel = createKernel(ctx, code, tensorData, output,
+ * std::future<Kernel> kernelFuture = createKernel(ctx, code, tensorData, output, totalWorkgroups, params);
+ * Kernel kernel = waitForFuture(ctx.instance, kernelFuture);
  * @endcode
- * totalWorkgroups, params);
  */
 template <typename ParamsType = NoParam, size_t numInputs>
-Kernel createKernel(Context &ctx, const KernelCode &code,
-                    const Bindings<numInputs> &dataBindings,
-                    const Shape &totalWorkgroups,
-                    const ParamsType &params = ParamsType{},
-                    CompilationInfo *compilationInfo = nullptr,
-                    const char *cacheKey = nullptr) {
+std::future<Kernel> createKernel(Context &ctx, const KernelCode &code,
+                                 const Bindings<numInputs> &dataBindings,
+                                 const Shape &totalWorkgroups,
+                                 const ParamsType &params = ParamsType{},
+                                 CompilationInfo *compilationInfo = nullptr,
+                                 const char *cacheKey = nullptr) {
   if constexpr (!IsNoParam<ParamsType>) {
     return createKernel(ctx, code, dataBindings.data.data(), numInputs,
                         dataBindings.viewOffsets.data(), totalWorkgroups,
@@ -1637,30 +1862,37 @@ Kernel createKernel(Context &ctx, const KernelCode &code,
  * @param[in] promise Promise to set when the kernel has finished executing
  *
  * @code
- * dispatchKernel(ctx, kernel);
+ * std::future<void> dispatchFuture = dispatchKernel(ctx, kernel);
+ * waitForFuture(ctx.instance, dispatchFuture);
  * @endcode
  */
-inline void dispatchKernel(Context &ctx, Kernel &kernel,
-                           std::promise<void> &promise) {
+inline std::future<void> dispatchKernel(Context &ctx, Kernel &kernel) {
+  // If the kernel was used before, reset the command buffer.
   if (kernel->used) {
     resetCommandBuffer(ctx.device, kernel);
   }
+
+  // Submit the command buffer and release it.
   wgpuQueueSubmit(ctx.queue, 1, &kernel->commandBuffer);
   wgpuCommandBufferRelease(kernel->commandBuffer);
   kernel->used = true;
 
-  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback =
-          [](WGPUQueueWorkDoneStatus status, void *userdata1, void *userdata2) {
-            check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
-                  __FILE__, __LINE__);
-            auto *promise = static_cast<std::promise<void> *>(userdata1);
-            promise->set_value();
-          },
-      .userdata1 = &promise,
-      .userdata2 = nullptr};
+  // Allocate a promise on the heap so it remains valid beyond this function’s
+  // scope.
+  std::promise<void> *promise = new std::promise<void>();
+  std::future<void> future = promise->get_future();
+
+  // Set up the callback info.
+  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {};
+  workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
+  workDoneCallbackInfo.callback = dispatchKernelCallback;
+  workDoneCallbackInfo.userdata1 = reinterpret_cast<void *>(promise);
+  workDoneCallbackInfo.userdata2 = nullptr;
+
+  // Register the work-done callback with the queue; the async chain completes
+  // in dispatchKernelCallback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
+
+  return future;
 }
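+
+// End-to-end async usage (mirrors examples/hello_world/run.cpp; `wg` here
+// stands for the workgroup Shape):
+//   std::future<Kernel> kf = createKernel(ctx, {kGelu, 256, kf32},
+//                                         Bindings{input, output}, wg);
+//   Kernel k = waitForFuture(ctx.instance, kf);
+//   std::future<void> df = dispatchKernel(ctx, k);
+//   waitForFuture(ctx.instance, df);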
 
 } // namespace gpu
diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp
index fe5aab7..75d9dc4 100644
--- a/numeric_types/half.cpp
+++ b/numeric_types/half.cpp
@@ -189,7 +189,8 @@ void testContainers() {
   std::array<half, 8> h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0};
   Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data());
   std::array<half, 8> h2;
-  toCPU(ctx, devH, h2.data(), sizeof(h2));
+  std::future<void> toCPUFuture = toCPU(ctx, devH, h2.data(), sizeof(h2));
+  waitForFuture(ctx.instance, toCPUFuture);
   for (int i = 0; i < 8; ++i) {
     printResult(h[i].data == h2[i].data, "Container round trip",
                 static_cast<float>(h[i]), static_cast<float>(h2[i]));
@@ -228,13 +229,13 @@ fn main(
   }
   Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data());
   Tensor output = createTensor(ctx, Shape{N}, kf16);
-  std::promise<void> promise;
-  std::future<void> future = promise.get_future();
-  Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
+  std::future<Kernel> kernelFuture = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
                            {cdiv(N, 256), 1, 1});
-  dispatchKernel(ctx, op, promise);
-  wait(ctx, future);
-  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+  Kernel op = waitForFuture(ctx.instance, kernelFuture);
+  std::future<void> dispatchFuture = dispatchKernel(ctx, op);
+  waitForFuture(ctx.instance, dispatchFuture);
+  std::future<void> toCPUFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
+  waitForFuture(ctx.instance, toCPUFuture);
   for (int i = 0; i < 12; ++i) {
     printf("  gelu(%.2f) = %.2f\n", static_cast<float>(inputArr[i]),
            static_cast<float>(outputArr[i]));

From 14e7ab59a67329573bc69a7dfce5d431ba8777b3 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Wed, 19 Feb 2025 18:06:26 -0600
Subject: [PATCH 02/17] use async context waitForContext()

---
 cmake/example.cmake          |   4 +-
 examples/hello_world/run.cpp |   4 +-
 gpu.hpp                      | 279 ++++++++++++++++++++++-------------
 numeric_types/half.cpp       |   5 +-
 4 files changed, 180 insertions(+), 112 deletions(-)

diff --git a/cmake/example.cmake b/cmake/example.cmake
index 7cf1f8d..5953876 100644
--- a/cmake/example.cmake
+++ b/cmake/example.cmake
@@ -45,14 +45,16 @@ if(EMSCRIPTEN)
     # Set Emscripten-specific link flags that enable WASM output and expose certain symbols.
     # Needed to use updated version, emdawnwebgpu
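    # ASYNCIFY is required so the wasm can suspend inside waitForFuture's
    # emscripten_sleep() polling loop; ASYNCIFY_DEBUG only adds diagnostic
    # logging and can be dropped for release builds.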
     set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "\
+        -O3 \
         -sUSE_WEBGPU=0 \
         -sWASM=1 \
         -DDAWN_EMSCRIPTEN_TOOLCHAIN=${EMSCRIPTEN_DIR} \
         -sEXPORTED_FUNCTIONS=_main,_malloc,_free,_memcpy \
         -sEXPORTED_RUNTIME_METHODS=ccall \
         -sUSE_GLFW=3 \
-        -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=5MB \
+        -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \
         -sASYNCIFY \
+        -sASYNCIFY_DEBUG \
         --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \
         --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \
         --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \
diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp
index 06970a7..c9f22c7 100644
--- a/examples/hello_world/run.cpp
+++ b/examples/hello_world/run.cpp
@@ -28,7 +28,7 @@ int main(int argc, char **argv) {
   printf("--------------\n\n");
 
   // std::unique_ptr<Context> ctx = createContext();
-  Context ctx = createContext();
+  Context ctx = waitForContext();
   static constexpr size_t N = 10000;
   std::array<float, N> inputArr, outputArr;
   for (int i = 0; i < N; ++i) {
@@ -36,8 +36,6 @@ int main(int argc, char **argv) {
   }
   Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
   Tensor output = createTensor(ctx, Shape{N}, kf32);
-  std::promise<void> promise;
-  std::future<void> future = promise.get_future();
   std::future<Kernel> kernelFuture = createKernel(ctx, {kGelu, 256, kf32},
                            Bindings{input, output},
                            {cdiv(N, 256), 1, 1});
diff --git a/gpu.hpp b/gpu.hpp
index 052c674..0119108 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -793,10 +793,11 @@ inline void check(bool condition, const char *message,
 /**
  * @brief Pumps events until the provided future is ready.
  *
- * This helper template function continuously checks the status of the provided std::future<T>
- * until it becomes ready. On Emscripten builds, it yields control to the JavaScript event loop
- * using emscripten_sleep to allow asynchronous callbacks to execute. On other platforms, it
- * processes events from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future
+ * This helper template function continuously checks the status of the provided
+ * std::future<T> until it becomes ready. On Emscripten builds, it yields
+ * control to the JavaScript event loop using emscripten_sleep to allow
+ * asynchronous callbacks to execute. On other platforms, it processes events
+ * from the given WGPUInstance using wgpuInstanceProcessEvents. Once the future
  * is ready, its value is returned.
  *
  * @tparam T The type of the value contained in the future.
@@ -805,8 +806,8 @@ inline void check(bool condition, const char *message,
  * @return T The value retrieved from the ready future.
  *
  * @code
- * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter, devDescriptor);
- * WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter,
+ * devDescriptor); WGPUDevice device = waitForFuture(instance, deviceFuture);
  * @endcode
  */
 template <typename T>
@@ -831,17 +832,56 @@ T waitForFuture(WGPUInstance instance, std::future<T> &f) {
 
 // Context Callbacks & Helpers
 
 /**
- * @brief Adapter callback function invoked upon completion of an asynchronous WebGPU adapter request.
+ * @brief Waits for the provided std::future to become ready by polling its status.
  *
- * This callback is triggered when the request for a WebGPU adapter completes. It verifies whether
- * the adapter was successfully obtained. On failure, it logs an error message (in Emscripten builds)
- * and sets an exception on the associated promise. On success, it sets the value of the promise with
- * the obtained adapter. Finally, it frees the allocated memory for the promise pointer.
+ * This helper template function continuously checks the status of the provided std::future until it is ready.
+ * On Emscripten builds, it yields control to the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous behavior.
+ * On non-Emscripten platforms, it sleeps for a short duration (10 milliseconds) between checks.
+ * Once the future is ready, its value is returned.
+ *
+ * @tparam T The type of the value contained in the future.
+ * @param f The future to wait on.
+ * @return T The value retrieved from the ready future.
+ *
+ * @code
+ * std::future<Context> contextFuture = createContext();
+ * Context ctx = waitForContextFuture(contextFuture);
+ * @endcode
+ */
+template <typename T> T waitForContextFuture(std::future<T> &f) {
+  #ifdef __EMSCRIPTEN__
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    emscripten_sleep(1); // Yield back to the JS event loop.
+  }
+  return f.get();
+  #else
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  }
+  return f.get();
+  #endif
+  }
+
+/**
+ * @brief Adapter callback function invoked upon completion of an asynchronous
+ * WebGPU adapter request.
+ *
+ * This callback is triggered when the request for a WebGPU adapter completes.
+ * It verifies whether the adapter was successfully obtained. On failure, it
+ * logs an error message (in Emscripten builds) and sets an exception on the
+ * associated promise. On success, it sets the value of the promise with the
+ * obtained adapter. Finally, it frees the allocated memory for the promise
+ * pointer.
+ *
+ * @param status The status of the adapter request. Expected to be
+ * WGPURequestAdapterStatus_Success on success.
+ * @param adapter The WGPUAdapter obtained on a successful request.
+ * @param message A string view containing additional information about the
+ * adapter request.
+ * @param userdata1 A pointer to a heap-allocated
+ * std::shared_ptr<std::promise<WGPUAdapter>>.
  * @param userdata2 Unused.
  */
 inline void adapterCallback(WGPURequestAdapterStatus status,
@@ -864,17 +904,22 @@ inline void adapterCallback(WGPURequestAdapterStatus status,
 }
 
 /**
- * @brief Callback function invoked upon completion of an asynchronous WebGPU device request.
+ * @brief Callback function invoked upon completion of an asynchronous WebGPU
+ * device request.
  *
- * This callback is triggered when the request for a WebGPU device completes. It verifies that
- * the device was successfully created. On success, the callback sets the value of the associated
- * promise; otherwise, it sets an exception. After fulfilling the promise, it frees the allocated
- * memory for the promise pointer.
+ * This callback is triggered when the request for a WebGPU device completes. It
+ * verifies that the device was successfully created. On success, the callback
+ * sets the value of the associated promise; otherwise, it sets an exception.
+ * After fulfilling the promise, it frees the allocated memory for the promise
+ * pointer.
  *
- * @param status The status of the device request. Expected to be WGPURequestDeviceStatus_Success on success.
+ * @param status The status of the device request. Expected to be
+ * WGPURequestDeviceStatus_Success on success.
  * @param device The WGPUDevice obtained on successful request.
- * @param message A string view containing additional information about the device request.
- * @param userdata1 A pointer to a heap-allocated std::shared_ptr<std::promise<WGPUDevice>>.
+ * @param message A string view containing additional information about the
+ * device request.
+ * @param userdata1 A pointer to a heap-allocated
+ * std::shared_ptr<std::promise<WGPUDevice>>.
  * @param userdata2 Unused.
  */
 inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device,
@@ -897,13 +942,14 @@ inline void deviceCallback(WGPURequestDeviceStatus status, WGPUDevice device,
 /**
  * @brief Asynchronously requests a WebGPU adapter from the given instance.
  *
- * This helper function wraps the asynchronous call to request an adapter using the WebGPU API.
- * It sets up a promise and registers an adapter callback, returning a future that will eventually
- * hold the requested WGPUAdapter.
+ * This helper function wraps the asynchronous call to request an adapter using
+ * the WebGPU API. It sets up a promise and registers an adapter callback,
+ * returning a future that will eventually hold the requested WGPUAdapter.
  *
  * @param instance The WGPUInstance from which to request the adapter.
  * @param adapterOpts The options for requesting the adapter.
- * @return std::future<WGPUAdapter> A future that will eventually hold the created WGPUAdapter.
+ * @return std::future<WGPUAdapter> A future that will eventually hold the
+ * created WGPUAdapter.
  */
 inline std::future<WGPUAdapter>
 requestAdapterAsync(WGPUInstance instance,
@@ -923,13 +969,15 @@ requestAdapterAsync(WGPUInstance instance,
 /**
  * @brief Asynchronously requests a WebGPU device from a given adapter.
  *
- * This helper function wraps the asynchronous call to request a device using the WebGPU API.
- * It sets up a promise and registers a device callback, returning a future that will be fulfilled
- * once the device is available.
+ * This helper function wraps the asynchronous call to request a device using
+ * the WebGPU API. It sets up a promise and registers a device callback,
+ * returning a future that will be fulfilled once the device is available.
 *
  * @param adapter The WGPUAdapter to request the device from.
- * @param devDescriptor The descriptor specifying the characteristics of the requested device.
- * @return std::future<WGPUDevice> A future that will eventually hold the created WGPUDevice.
+ * @param devDescriptor The descriptor specifying the characteristics of the
+ * requested device.
+ * @return std::future<WGPUDevice> A future that will eventually hold the
+ * created WGPUDevice.
 */
 inline std::future<WGPUDevice>
 requestDeviceAsync(WGPUAdapter adapter,
@@ -964,60 +1012,62 @@ requestDeviceAsync(WGPUAdapter adapter,
  * @return Context instance representing the created GPU context
  *
  */
-inline Context createContext(const WGPUInstanceDescriptor &desc = {},
-                             const WGPURequestAdapterOptions &adapterOpts = {},
-                             const WGPUDeviceDescriptor &devDescriptor = {}) {
-  Context ctx; // Stack-allocated Context.
+inline std::future<Context>
+createContext(const WGPUInstanceDescriptor &desc = {},
+              const WGPURequestAdapterOptions &adapterOpts = {},
+              const WGPUDeviceDescriptor &devDescriptor = {}) {
 
-#ifdef __EMSCRIPTEN__
-  ctx.instance = wgpuCreateInstance(nullptr);
-#else
-  ctx.instance = wgpuCreateInstance(&desc);
-#endif
-  check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__);
+  auto promise = std::make_shared<std::promise<Context>>();
 
-  // Request the adapter asynchronously.
-  LOG(kDefLog, kTrace, "Requesting adapter");
+  // Create the context inline, pumping events until the async adapter and
+  // device requests resolve.
+
+  Context ctx;
+  ctx.instance = wgpuCreateInstance(&desc);
+  if (!ctx.instance) {
+    promise->set_exception(std::make_exception_ptr(
+        std::runtime_error("Failed to create WebGPU instance.")));
+    return promise->get_future();
+  }
   try {
     auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts);
-    // Pump events until the adapter future is ready.
     ctx.adapter = waitForFuture(ctx.instance, adapterFuture);
     ctx.adapterStatus = WGPURequestAdapterStatus_Success;
   } catch (const std::exception &ex) {
-    check(false, ex.what(), __FILE__, __LINE__);
+    promise->set_exception(std::make_exception_ptr(ex));
+    return promise->get_future();
   }
-
-  // Request the device asynchronously.
-  LOG(kDefLog, kTrace, "Requesting device");
   try {
     auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor);
-    // Pump events until the device future is ready.
     ctx.device = waitForFuture(ctx.instance, deviceFuture);
     ctx.deviceStatus = WGPURequestDeviceStatus_Success;
-    LOG(kDefLog, kTrace, "Device request ended");
-
-    // If the device was created, set up logging and fetch the queue.
-#ifndef __EMSCRIPTEN__
-    WGPULoggingCallbackInfo loggingCallbackInfo{
-        .nextInChain = nullptr,
-        .callback =
-            [](WGPULoggingType type, WGPUStringView message, void *, void *) {
-              LOG(kDefLog, kError, "Device logging callback: %.*s",
-                  static_cast<int>(message.length), message.data);
-              if (type == WGPULoggingType_Error) {
-                throw std::runtime_error("Device error logged.");
-              }
-            },
-        .userdata1 = nullptr,
-        .userdata2 = nullptr};
-    wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo);
-#endif
-    ctx.queue = wgpuDeviceGetQueue(ctx.device);
   } catch (const std::exception &ex) {
-    check(false, ex.what(), __FILE__, __LINE__);
+    promise->set_exception(std::make_exception_ptr(ex));
+    return promise->get_future();
   }
+  ctx.queue = wgpuDeviceGetQueue(ctx.device);
+  promise->set_value(std::move(ctx));
+
+  return promise->get_future();
+}
 
-  return std::move(ctx);
+/**
+ * @brief Synchronously waits for and returns the created GPU context.
+ *
+ * This function invokes the asynchronous createContext() factory function to create a GPU
+ * context, then waits for its completion using waitForContextFuture. The returned Context
+ * holds handles to the WebGPU instance, adapter, device, and queue, and is used for subsequent
+ * GPU operations.
+ *
+ * @return Context The fully initialized GPU context.
+ *
+ * @code
+ * Context ctx = waitForContext();
+ * // Now ctx can be used for GPU operations.
+ * @endcode
+ */
+inline Context waitForContext() {
+  std::future<Context> contextFuture = createContext();
+  return waitForContextFuture(contextFuture);
 }
 
 #ifdef USE_DAWN_API
@@ -1152,17 +1202,22 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {},
 #endif
 
 /**
- * @brief Callback function invoked upon completion of an asynchronous GPU buffer mapping.
- *
- * This callback is triggered when the GPU buffer mapping for a readback buffer is completed.
- * It verifies that the mapping operation was successful, retrieves the mapped memory, - * copies the data from the GPU buffer to a CPU memory region, unmaps the buffer, - * signals the completion by fulfilling the associated promise, and cleans up the allocated callback data. - * - * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success on success. - * @param message A string view containing additional information about the mapping operation. - * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the GPU buffer, - * buffer size, destination CPU memory pointer, and a promise for signaling completion. + * @brief Callback function invoked upon completion of an asynchronous GPU + * buffer mapping. + * + * This callback is triggered when the GPU buffer mapping for a readback buffer + * is completed. It verifies that the mapping operation was successful, + * retrieves the mapped memory, copies the data from the GPU buffer to a CPU + * memory region, unmaps the buffer, signals the completion by fulfilling the + * associated promise, and cleans up the allocated callback data. + * + * @param status The mapping status. Expected to be WGPUMapAsyncStatus_Success + * on success. + * @param message A string view containing additional information about the + * mapping operation. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the GPU buffer, buffer size, destination CPU memory pointer, and a + * promise for signaling completion. * @param userdata2 Unused. */ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, @@ -1192,16 +1247,20 @@ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message, } /** - * @brief Callback function invoked when the GPU queue’s submitted work is complete. - * - * This callback is registered with the GPU queue after submitting work. When invoked, - * it verifies that all queued work completed successfully, and then sets up the buffer - * mapping callback to initiate the asynchronous mapping of a readback buffer. The readback - * buffer is mapped to access the processed data on the CPU. - * - * @param status The status of the completed work. Expected to be WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A pointer to a heap-allocated CallbackData structure containing the readback buffer, - * buffer size, destination CPU memory pointer, and a promise to signal completion. + * @brief Callback function invoked when the GPU queue’s submitted work is + * complete. + * + * This callback is registered with the GPU queue after submitting work. When + * invoked, it verifies that all queued work completed successfully, and then + * sets up the buffer mapping callback to initiate the asynchronous mapping of a + * readback buffer. The readback buffer is mapped to access the processed data + * on the CPU. + * + * @param status The status of the completed work. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A pointer to a heap-allocated CallbackData structure + * containing the readback buffer, buffer size, destination CPU memory pointer, + * and a promise to signal completion. * @param userdata2 Unused. */ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, @@ -1543,12 +1602,14 @@ inline Shape cdiv(Shape total, Shape group) { } /** - * @brief Packages the shader compilation information along with a promise for asynchronous signaling. 
+ * @brief Packages the shader compilation information along with a promise for + * asynchronous signaling. * * This structure holds a pointer to a CompilationInfo instance that collects - * details such as status, messages, line numbers, and positions from the shader compilation. - * It also contains a shared pointer to a std::promise which is used to signal the completion - * of the asynchronous shader compilation process. + * details such as status, messages, line numbers, and positions from the shader + * compilation. It also contains a shared pointer to a std::promise which + * is used to signal the completion of the asynchronous shader compilation + * process. */ struct CompData { CompilationInfo *compInfo; @@ -1578,10 +1639,11 @@ struct CompData { * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, dataBindings, numInputs, output, nThreads, params, paramsSize); + * std::future kernelFuture = createKernel(ctx, code, dataBindings, + numInputs, output, nThreads, params, paramsSize); * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); * @endcode - + */ inline std::future createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, @@ -1783,15 +1845,19 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings, /** * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done. * - * This callback is invoked when the GPU queue signals the completion of the submitted - * workload for a kernel dispatch. It receives the work-done status and a userdata pointer, - * which is expected to be a heap‑allocated pointer to a std::promise. + * This callback is invoked when the GPU queue signals the completion of the + * submitted workload for a kernel dispatch. It receives the work-done status + * and a userdata pointer, which is expected to be a heap‑allocated pointer to a + * std::promise. * - * On success, the promise is fulfilled by calling set_value(). Otherwise, it is set with an exception. - * After setting the promise state, the allocated memory for the promise is freed. + * On success, the promise is fulfilled by calling set_value(). Otherwise, it is + * set with an exception. After setting the promise state, the allocated memory + * for the promise is freed. * - * @param status The status of the work done. Expected to be WGPUQueueWorkDoneStatus_Success on success. - * @param userdata1 A heap allocated pointer to std::promise which is set when the work is done. + * @param status The status of the work done. Expected to be + * WGPUQueueWorkDoneStatus_Success on success. + * @param userdata1 A heap allocated pointer to std::promise which is set + * when the work is done. * @param userdata2 Unused. 
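 *
 * A registration sketch in the style used elsewhere in this file (the real
 * call site is the dispatch path below; the callback-info struct name is
 * assumed from the webgpu.h pattern used by deviceCallback above):
 * @code
 * auto *promise = new std::promise<void>();
 * WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
 *     .mode = WGPUCallbackMode_AllowSpontaneous,
 *     .callback = dispatchKernelCallback,
 *     .userdata1 = promise,
 *     .userdata2 = nullptr};
 * wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 * @endcode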
*/ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, @@ -1824,8 +1890,9 @@ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status, * @return Kernel instance representing the created kernel * * @code - * std::future kernelFuture = createKernel(ctx, code, tensorData, output,totalWorkgroups, params); - * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture); + * std::future kernelFuture = createKernel(ctx, code, tensorData, + * output,totalWorkgroups, params); Kernel kernel = WaitForFuture(ctx.instance, + * kernelFuture); * @endcode */ template diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index 75d9dc4..21a0005 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -185,7 +185,7 @@ void testContainers() { testRoundTrip(h[3]); } { - Context ctx = createContext(); + Context ctx = waitForContext(); std::array h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0}; Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data()); std::array h2; @@ -215,13 +215,14 @@ fn main( } } )"; - Context ctx = createContext( + std::future futureContext = createContext( {}, {}, /*device descriptor, enabling f16 in WGSL*/ { .requiredFeatureCount = 1, .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(), }); + Context ctx = waitForContextFuture(futureContext); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { From 9a08f8a875d74fda1644adbb367edfdf2f70838a Mon Sep 17 00:00:00 2001 From: MichealReed Date: Thu, 20 Feb 2025 13:54:02 -0600 Subject: [PATCH 03/17] adds sync wrappers --- cmake/example.cmake | 1 - examples/hello_world/run.cpp | 11 +- examples/render/run.cpp | 6 +- gpu.hpp | 372 +++++++++++++++++++++++++---------- numeric_types/half.cpp | 19 +- 5 files changed, 283 insertions(+), 126 deletions(-) diff --git a/cmake/example.cmake b/cmake/example.cmake index 5953876..cf697b5 100644 --- a/cmake/example.cmake +++ b/cmake/example.cmake @@ -54,7 +54,6 @@ if(EMSCRIPTEN) -sUSE_GLFW=3 \ -sALLOW_MEMORY_GROWTH=1 -sSTACK_SIZE=15MB \ -sASYNCIFY \ - -sASYNCIFY_DEBUG \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js \ --js-library=${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js \ diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index c9f22c7..77549cf 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,7 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); - Context ctx = waitForContext(); + Context ctx = createContext(); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -36,14 +36,11 @@ int main(int argc, char **argv) { } Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data()); Tensor output = createTensor(ctx, Shape{N}, kf32); - std::future kernelFuture = createKernel(ctx, {kGelu, 256, kf32}, + Kernel op = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output}, {cdiv(N, 256), 1, 1}); - Kernel op = waitForFuture(ctx.instance, kernelFuture); - std::future dispatchFuture = dispatchKernel(ctx, op); - waitForFuture(ctx.instance, dispatchFuture); - std::future cpuFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); - waitForFuture(ctx.instance, cpuFuture); + dispatchKernel(ctx, op); + toCPU(ctx, output, outputArr.data(), sizeof(outputArr)); for (int i = 0; i < 12; ++i) 
{
     printf("  gelu(%.2f) = %.2f\n", inputArr[i], outputArr[i]);
   }
diff --git a/examples/render/run.cpp b/examples/render/run.cpp
index f9a90f9..64122cd 100644
--- a/examples/render/run.cpp
+++ b/examples/render/run.cpp
@@ -124,10 +124,8 @@ int main(int argc, char **argv) {
                  cdiv({NCOLS, NROWS, 1}, wgSize), params);
   printf("\033[2J\033[H");
   while (true) {
-    std::promise<void> promise;
-    std::future<void> future = promise.get_future();
-    dispatchKernel(ctx, renderKernel, promise);
-    wait(ctx, future);
+
+    dispatchKernel(ctx, renderKernel);
     toCPU(ctx, devScreen, screen.data(), sizeof(screen));
     params.time = getCurrentTimeInMilliseconds() - zeroTime;
 
diff --git a/gpu.hpp b/gpu.hpp
index 0119108..e050c87 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -807,11 +807,10 @@ inline void check(bool condition, const char *message,
  *
  * @code
  * std::future<WGPUDevice> deviceFuture = requestDeviceAsync(adapter,
- * devDescriptor); WGPUDevice device = waitForFuture(instance, deviceFuture);
+ * devDescriptor); WGPUDevice device = wait(ctx, deviceFuture);
  * @endcode
  */
-template <typename T>
-T waitForFuture(WGPUInstance instance, std::future<T> &f) {
+template <typename T> T wait(Context &ctx, std::future<T> &f) {
 #ifdef __EMSCRIPTEN__
   // Poll until the future is ready.
   while (f.wait_for(std::chrono::milliseconds(0)) !=
@@ -823,7 +822,7 @@ T waitForFuture(WGPUInstance instance, std::future<T> &f) {
 #else
   while (f.wait_for(std::chrono::milliseconds(0)) !=
          std::future_status::ready) {
-    wgpuInstanceProcessEvents(instance);
+    wgpuInstanceProcessEvents(ctx.instance);
   }
   return f.get();
 #endif
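A minimal sketch of the renamed helper in isolation, assuming only the request helpers declared above (the default-constructed descriptors are illustrative):

```cpp
// Drain the low-level request futures through wait(ctx, ...), which pumps
// instance events (or yields to the JS loop under Emscripten) until ready.
Context ctx;
ctx.instance = wgpuCreateInstance(nullptr);
std::future<WGPUAdapter> adapterFuture = requestAdapterAsync(ctx.instance, {});
ctx.adapter = wait(ctx, adapterFuture);
std::future<WGPUDevice> deviceFuture = requestDeviceAsync(ctx.adapter, {});
ctx.device = wait(ctx, deviceFuture);
```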
@@ -832,12 +831,15 @@ T waitForFuture(WGPUInstance instance, std::future<T> &f) {
 // Context Callbacks & Helpers
 
 /**
- * @brief Waits for the provided std::future to become ready by polling its status.
+ * @brief Waits for the provided std::future to become ready by polling its
+ * status.
  *
- * This helper template function continuously checks the status of the provided std::future until it is ready.
- * On Emscripten builds, it yields control to the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous behavior.
- * On non-Emscripten platforms, it sleeps for a short duration (10 milliseconds) between checks.
- * Once the future is ready, its value is returned.
+ * This helper template function continuously checks the status of the provided
+ * std::future until it is ready. On Emscripten builds, it yields control to
+ * the JavaScript event loop using emscripten_sleep(1) for smooth asynchronous
+ * behavior. On non-Emscripten platforms, it sleeps for a short duration (10
+ * milliseconds) between checks. Once the future is ready, its value is
+ * returned.
  *
  * @tparam T The type of the value contained in the future.
  * @param f The future to wait on.
@@ -849,20 +851,20 @@ T waitForFuture(WGPUInstance instance, std::future<T> &f) {
  * @endcode
  */
 template <typename T> T waitForContextFuture(std::future<T> &f) {
-  #ifdef __EMSCRIPTEN__
-  while (f.wait_for(std::chrono::milliseconds(0)) !=
-         std::future_status::ready) {
-    emscripten_sleep(1); // Yield back to the JS event loop.
-  }
-  return f.get();
-  #else
-  while (f.wait_for(std::chrono::milliseconds(0)) !=
-         std::future_status::ready) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(10));
-  }
-  return f.get();
-  #endif
+#ifdef __EMSCRIPTEN__
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    emscripten_sleep(1); // Yield back to the JS event loop.
+  }
+  return f.get();
+#else
+  while (f.wait_for(std::chrono::milliseconds(0)) !=
+         std::future_status::ready) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
   }
+  return f.get();
+#endif
+}
 
 /**
  * @brief Adapter callback function invoked upon completion of an asynchronous
@@ -1013,9 +1015,9 @@ requestDeviceAsync(WGPUAdapter adapter,
  *
  */
 inline std::future<Context>
-createContext(const WGPUInstanceDescriptor &desc = {},
-              const WGPURequestAdapterOptions &adapterOpts = {},
-              const WGPUDeviceDescriptor &devDescriptor = {}) {
+createContextAsync(const WGPUInstanceDescriptor &desc = {},
+                   const WGPURequestAdapterOptions &adapterOpts = {},
+                   const WGPUDeviceDescriptor &devDescriptor = {}) {
 
   auto promise = std::make_shared<std::promise<Context>>();
 
@@ -1030,7 +1032,7 @@ createContext(const WGPUInstanceDescriptor &desc = {},
   }
   try {
     auto adapterFuture = requestAdapterAsync(ctx.instance, adapterOpts);
-    ctx.adapter = waitForFuture(ctx.instance, adapterFuture);
+    ctx.adapter = wait(ctx, adapterFuture);
     ctx.adapterStatus = WGPURequestAdapterStatus_Success;
   } catch (const std::exception &ex) {
     promise->set_exception(std::make_exception_ptr(ex));
@@ -1038,7 +1040,7 @@ createContext(const WGPUInstanceDescriptor &desc = {},
   }
   try {
     auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor);
-    ctx.device = waitForFuture(ctx.instance, deviceFuture);
+    ctx.device = wait(ctx, deviceFuture);
     ctx.deviceStatus = WGPURequestDeviceStatus_Success;
   } catch (const std::exception &ex) {
     promise->set_exception(std::make_exception_ptr(ex));
@@ -1053,10 +1055,11 @@ createContext(const WGPUInstanceDescriptor &desc = {},
 /**
  * @brief Synchronously waits for and returns the created GPU context.
  *
- * This function invokes the asynchronous createContext() factory function to create a GPU
- * context, then waits for its completion using waitForContextFuture. The returned Context
- * holds handles to the WebGPU instance, adapter, device, and queue, and is used for subsequent
- * GPU operations.
+ * This function invokes the asynchronous createContextAsync() factory function
+ * to create a GPU context, then waits for its completion using
+ * waitForContextFuture. The returned Context holds handles to the WebGPU
+ * instance, adapter, device, and queue, and is used for subsequent GPU
+ * operations.
  *
  * @return Context The fully initialized GPU context.
  *
@@ -1065,8 +1068,11 @@ createContext(const WGPUInstanceDescriptor &desc = {},
  * // Now ctx can be used for GPU operations.
  * @endcode
  */
-inline Context waitForContext() {
-  std::future<Context> contextFuture = createContext();
+inline Context createContext(const WGPUInstanceDescriptor &desc = {},
+                             const WGPURequestAdapterOptions &adapterOpts = {},
+                             const WGPUDeviceDescriptor &devDescriptor = {}) {
+  std::future<Context> contextFuture =
+      createContextAsync(desc, adapterOpts, devDescriptor);
   return waitForContextFuture(contextFuture);
 }
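Both entry points converge on the same code path; a short usage sketch (the empty-descriptor calls are illustrative defaults):

```cpp
// One-liner: blocks until adapter and device are ready.
Context ctx = createContext();

// Equivalent two-step form when the caller wants to overlap other setup:
std::future<Context> pending = createContextAsync();
// ... unrelated initialization work can happen here ...
Context ctx2 = waitForContextFuture(pending);
```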
@@ -1294,8 +1300,8 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
  * toCPU(ctx, tensor, data, bufferSize);
  * @endcode
  */
-inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
-                               size_t bufferSize, CopyData &op) {
+inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
+                                    size_t bufferSize, CopyData &op) {
   // Submit the command buffer and release it.
   wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer);
   wgpuCommandBufferRelease(op.commandBuffer);
@@ -1340,8 +1346,8 @@ inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
  * @param[in] bufferSize Size of the data buffer in bytes
  * @param[out] data Pointer to the CPU memory to copy the data to
  */
-inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
-                               size_t bufferSize) {
+inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
+                                    size_t bufferSize) {
   // Create a promise that will later be satisfied when the async copy
   // completes.
   auto promise = std::make_shared<std::promise<void>>();
@@ -1395,26 +1401,8 @@ inline std::future<void> toCPU(Context &ctx, Tensor &tensor, void *data,
   return promise->get_future();
 }
 
-/**
- * @brief Overload of the toCPU function to copy data from a GPU buffer to CPU
- * memory for an array of floats instead of a pointer to a float buffer.
- * @param[in] ctx Context instance to manage the operation
- * @param[in] tensor Tensor instance representing the GPU buffer to copy from
- * @param[out] data Array of floats to copy the data to
- *
- * @code
- * std::future<void> toCPUFuture = toCPU(ctx, tensor, data);
- * WaitForFuture(ctx.instance, toCPUFuture);
- * @endcode
- */
-template <size_t N>
-inline std::future<void> toCPU(Context &ctx, Tensor &tensor,
-                               std::array<float, N> &data) {
-  return toCPU(ctx, tensor, data.data(), sizeof(data));
-}
-
-inline std::future<void> toCPU(Context &ctx, WGPUBuffer buffer, void *data,
-                               size_t size) {
+inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
+                                    size_t size) {
   // The size (in bytes) for the copy.
   uint64_t bufferSize = size;
 
@@ -1473,6 +1461,92 @@ inline std::future<void> toCPU(Context &ctx, WGPUBuffer buffer, void *data,
   return promise->get_future();
 }
 
+/**
+ * @brief Overload of the toCPU function to copy data from a GPU buffer to CPU
+ * memory for an array of floats instead of a pointer to a float buffer.
+ * @param[in] ctx Context instance to manage the operation
+ * @param[in] tensor Tensor instance representing the GPU buffer to copy from
+ * @param[out] data Array of floats to copy the data to
+ *
+ * @code
+ * std::future<void> toCPUFuture = toCPUAsync(ctx, tensor, data);
+ * wait(ctx, toCPUFuture);
+ * @endcode
+ */
+template <size_t N>
+inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
+                                    std::array<float, N> &data) {
+  return toCPUAsync(ctx, tensor, data.data(), sizeof(data));
+}
+
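The async form exists so a readback can overlap CPU-side work; a sketch, where `output`, `N`, and `prepareNextBatch` are illustrative names rather than library symbols:

```cpp
std::array<float, N> outputArr;
// Kick off the GPU -> CPU copy without blocking.
std::future<void> copy =
    toCPUAsync(ctx, output, outputArr.data(), sizeof(outputArr));
prepareNextBatch(); // hypothetical CPU work that overlaps the copy
wait(ctx, copy);    // block only once the data is actually needed
```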
+/**
+ * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU
+ * memory.
+ *
+ * This function synchronously waits for the asynchronous copy operation to
+ * complete, ensuring that the data is fully transferred from the GPU buffer to
+ * the CPU memory before returning.
+ *
+ * @param ctx Context instance to manage the operation
+ * @param tensor Tensor instance representing the GPU buffer to copy from
+ * @param data Pointer to the CPU memory to copy the data to
+ * @param bufferSize Size of the data buffer in bytes
+ *
+ * @code
+ * toCPU(ctx, tensor, data, bufferSize);
+ * @endcode
+ */
+inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
+  auto future = toCPUAsync(ctx, tensor, data, bufferSize);
+  wait(ctx, future);
+}
+
+/**
+ * @brief Synchronous wrapper for copying from a GPU buffer to CPU memory.
+ *
+ * This function synchronously waits for the asynchronous copy operation to
+ * complete, ensuring that the data is fully transferred from the GPU buffer to
+ * the CPU memory before returning.
+ *
+ * @param ctx Context instance to manage the operation
+ * @param buffer WGPUBuffer instance representing the GPU buffer to copy from
+ * @param data Pointer to the CPU memory to copy the data to
+ * @param size Size of the data buffer in bytes
+ *
+ * @code
+ * toCPU(ctx, buffer, data, size);
+ * @endcode
+ */
+inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
+  auto future = toCPUAsync(ctx, buffer, data, size);
+  wait(ctx, future);
+}
+
+/**
+ * @brief Synchronous wrapper for copying from a Tensor GPU buffer to CPU
+ * memory for an array of floats instead of a pointer to a float buffer.
+ *
+ * This function synchronously waits for the asynchronous copy operation to
+ * complete, ensuring that the data is fully transferred from the GPU buffer to
+ * the CPU memory before returning.
+ *
+ * @param ctx Context instance to manage the operation
+ * @param tensor Tensor instance representing the GPU buffer to copy from
+ * @param data Array of floats to copy the data to
+ *
+ * @code
+ * toCPU(ctx, tensor, data);
+ * @endcode
+ */
+template <size_t N>
+inline void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
+  auto future = toCPUAsync(ctx, tensor, data);
+  wait(ctx, future);
+}
+
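A round trip through the synchronous wrappers, assuming a kf32 tensor of `N` floats as in examples/hello_world:

```cpp
std::array<float, N> host;
host.fill(1.0f);
Tensor dev = createTensor(ctx, Shape{N}, kf32, host.data());
toGPU(ctx, host.data(), dev); // CPU -> GPU upload
std::array<float, N> readback;
toCPU(ctx, dev, readback.data(), sizeof(readback)); // blocks until copied
```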
 /**
  * @brief Copies data from CPU memory to a GPU buffer. The toGPU overloads are
  * effectively a convenience wrapper around the WebGPU API call
@@ -1617,9 +1691,9 @@ struct CompData {
 };
 
 /**
- * @brief A factory function to create a kernel on the GPU. The kernel is
- * created with the given WGSL code, input tensors, output tensor, and
- * optional parameters.
+ * @brief A factory function to create a kernel asynchronously on the GPU.
+ * The kernel is created with the given WGSL code, input tensors,
+ * output tensor, and optional parameters.
  *
  * Note that the values of the input tensors are not used here, only the
  * reference handles to the underlying buffers as well as the size of the
@@ -1639,18 +1713,19 @@ struct CompData {
  * @return Kernel instance representing the created kernel
  *
  * @code
- * std::future<Kernel> kernelFuture = createKernel(ctx, code, dataBindings,
+ * std::future<Kernel> kernelFuture = createKernelAsync(ctx, code, dataBindings,
 numInputs, output, nThreads, params, paramsSize);
- * Kernel kernel = WaitForFuture(ctx.instance, kernelFuture);
+ * Kernel kernel = wait(ctx, kernelFuture);
  * @endcode
  */
 inline std::future<Kernel>
-createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings,
-             size_t numTensors, const size_t *viewOffsets,
-             const Shape &totalWorkgroups, const void *params = nullptr,
-             size_t paramsSize = 0, CompilationInfo *compilationInfo = nullptr,
-             const char *cacheKey = nullptr) {
+createKernelAsync(Context &ctx, const KernelCode &code,
+                  const Tensor *dataBindings, size_t numTensors,
+                  const size_t *viewOffsets, const Shape &totalWorkgroups,
+                  const void *params = nullptr, size_t paramsSize = 0,
+                  CompilationInfo *compilationInfo = nullptr,
+                  const char *cacheKey = nullptr) {
   // Create a cache key by the pointer values of the data bindings and the
   // kernel code
   if (cacheKey != nullptr &&
@@ -1818,7 +1893,7 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings,
   // Register callback and then wait for the result.
   wgpuShaderModuleGetCompilationInfo(shaderModule, compilationCallbackInfo);
-    waitForFuture(ctx.instance, compFuture);
+    wait(ctx, compFuture);
   }
 
   // Now create the compute pipeline using the shader module.
@@ -1842,35 +1917,81 @@ createKernel(Context &ctx, const KernelCode &code, const Tensor *dataBindings,
   return outerFuture;
 }
 
-/**
- * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done.
+/**
+ * @brief Overload which wraps the createKernelAsync factory function to create
+ * a kernel on the GPU. This overload takes a pointer and size for the
+ * input tensors instead of a static collection and a void pointer for params
+ * instead of a static type.
  *
- * This callback is invoked when the GPU queue signals the completion of the
- * submitted workload for a kernel dispatch. It receives the work-done status
- * and a userdata pointer, which is expected to be a heap‑allocated pointer to a
- * std::promise<void>.
+ * @param[in] ctx Context instance to manage the kernel
+ * @param[in] code WGSL code for the kernel
+ * @param[in] dataBindings Pointer to a span of tensors bound to the kernel
+ * @param[in] numTensors Number of tensors in the dataBindings span
+ * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be
+ * a Shape of rank == 3.
+ * @param[in] params Optional parameters for the kernel. If the kernel does
+ * not have any parameters, use NoParam.
+ * @return Kernel instance representing the created kernel
  *
- * On success, the promise is fulfilled by calling set_value(). Otherwise, it is
- * set with an exception. After setting the promise state, the allocated memory
- * for the promise is freed.
+ * @code
+ * Kernel kernel = createKernel(ctx, code, dataBindings, numTensors,
+ *     viewOffsets, totalWorkgroups, params);
+ * @endcode
+ */
+inline Kernel createKernel(Context &ctx, const KernelCode &code,
+                           const Tensor *dataBindings, size_t numTensors,
+                           const size_t *viewOffsets,
+                           const Shape &totalWorkgroups,
+                           const void *params = nullptr, size_t paramsSize = 0,
+                           CompilationInfo *compilationInfo = nullptr,
+                           const char *cacheKey = nullptr) {
+  std::future<Kernel> kernelFuture = createKernelAsync(
+      ctx, code, dataBindings, numTensors, viewOffsets, totalWorkgroups, params,
+      paramsSize, compilationInfo, cacheKey);
+  return wait(ctx, kernelFuture);
+}
+
+/**
+ * @brief Overload which wraps the createKernelAsync factory function to create
+ * a kernel asynchronously on the GPU. This overload takes a static
+ * collection of input tensors instead of a pointer and a statically determined
+ * ParamsType instead of casting params to a void pointer.
 *
- * @param status The status of the work done. Expected to be
- * WGPUQueueWorkDoneStatus_Success on success.
- * @param userdata1 A heap allocated pointer to std::promise<void> which is set
- * when the work is done.
- * @param userdata2 Unused.
+ * @param[in] ctx Context instance to manage the kernel
+ * @param[in] code WGSL code for the kernel
+ * @param[in] dataBindings A Bindings of tensors whose GPU buffers are bound
+ * to the kernel as inputs and outputs.
+ * @param[in] totalWorkgroups Number of workgroups in the x, y, z grid, must be
+ * a Shape of rank == 3.
+ * @param[in] params Optional parameters for the kernel. If the kernel does
+ * not have any parameters, use NoParam.
+ * @return std::future<Kernel> A future that resolves to the created kernel
+ *
+ * @code
+ * std::future<Kernel> kernelFuture = createKernelAsync(ctx, code, tensorData,
+ *     totalWorkgroups, params);
+ * Kernel kernel = wait(ctx, kernelFuture);
+ * @endcode
  */
-inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status,
-                                   void *userdata1, void * /*userdata2*/) {
-  // Cast the userdata pointer back to our heap‑allocated promise.
-  auto *p = reinterpret_cast<std::promise<void> *>(userdata1);
-  if (status == WGPUQueueWorkDoneStatus_Success) {
-    p->set_value();
+template <typename ParamsType = NoParam, size_t numInputs>
+std::future<Kernel>
+createKernelAsync(Context &ctx, const KernelCode &code,
+                  const Bindings<numInputs> &dataBindings,
+                  const Shape &totalWorkgroups,
+                  const ParamsType &params = ParamsType{},
+                  CompilationInfo *compilationInfo = nullptr,
+                  const char *cacheKey = nullptr) {
+  if constexpr (!IsNoParam<ParamsType>) {
+    return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs,
+                             dataBindings.viewOffsets.data(), totalWorkgroups,
+                             reinterpret_cast<const void *>(&params),
+                             sizeof(ParamsType), compilationInfo, cacheKey);
   } else {
-    p->set_exception(std::make_exception_ptr(
-        std::runtime_error("Queue work did not complete successfully.")));
+    return createKernelAsync(ctx, code, dataBindings.data.data(), numInputs,
+                             dataBindings.viewOffsets.data(), totalWorkgroups,
+                             nullptr, 0, compilationInfo, cacheKey);
   }
-  delete p; // free the heap allocation
 }
 
 /**
@@ -1890,18 +2011,17 @@ inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status,
  * @return Kernel instance representing the created kernel
  *
  * @code
- * std::future<Kernel> kernelFuture = createKernel(ctx, code, tensorData,
- * output,totalWorkgroups, params); Kernel kernel = WaitForFuture(ctx.instance,
- * kernelFuture);
+ * Kernel kernel = createKernel(ctx, code, tensorData, totalWorkgroups,
+ * params);
  * @endcode
  */
 template <typename ParamsType = NoParam, size_t numInputs>
-std::future<Kernel> createKernel(Context &ctx, const KernelCode &code,
-                                 const Bindings<numInputs> &dataBindings,
-                                 const Shape &totalWorkgroups,
-                                 const ParamsType &params = ParamsType{},
-                                 CompilationInfo *compilationInfo = nullptr,
-                                 const char *cacheKey = nullptr) {
+Kernel createKernel(Context &ctx, const KernelCode &code,
+                    const Bindings<numInputs> &dataBindings,
+                    const Shape &totalWorkgroups,
+                    const ParamsType &params = ParamsType{},
+                    CompilationInfo *compilationInfo = nullptr,
+                    const char *cacheKey = nullptr) {
   if constexpr (!IsNoParam<ParamsType>) {
     return createKernel(ctx, code, dataBindings.data.data(), numInputs,
                         dataBindings.viewOffsets.data(), totalWorkgroups,
@@ -1914,6 +2034,37 @@ std::future<Kernel> createKernel(Context &ctx, const KernelCode &code,
   }
 }
 
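A sketch of the two creation paths through the Bindings overloads; `Params`, `kShaderCode`, `input`, `output`, and `N` are illustrative and must match the WGSL (compare examples/render/run.cpp):

```cpp
struct Params {
  float time;
};
// Synchronous: returns a ready-to-dispatch Kernel.
Kernel k = createKernel(ctx, {kShaderCode, 256, kf32},
                        Bindings{input, output}, {cdiv(N, 256), 1, 1},
                        Params{0.0f});
// Asynchronous: overlap shader compilation with other setup.
std::future<Kernel> kf =
    createKernelAsync(ctx, {kShaderCode, 256, kf32}, Bindings{input, output},
                      {cdiv(N, 256), 1, 1}, Params{0.0f});
Kernel k2 = wait(ctx, kf);
```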
+/**
+ * @brief Free‑standing callback for dispatchKernel’s asynchronous work‐done.
+ *
+ * This callback is invoked when the GPU queue signals the completion of the
+ * submitted workload for a kernel dispatch. It receives the work-done status
+ * and a userdata pointer, which is expected to be a heap‑allocated pointer to a
+ * std::promise<void>.
+ *
+ * On success, the promise is fulfilled by calling set_value(). Otherwise, it is
+ * set with an exception. After setting the promise state, the allocated memory
+ * for the promise is freed.
+ *
+ * @param status The status of the work done. Expected to be
+ * WGPUQueueWorkDoneStatus_Success on success.
+ * @param userdata1 A heap allocated pointer to std::promise<void> which is set
+ * when the work is done.
+ * @param userdata2 Unused.
+ */
+inline void dispatchKernelCallback(WGPUQueueWorkDoneStatus status,
+                                   void *userdata1, void * /*userdata2*/) {
+  // Cast the userdata pointer back to our heap‑allocated promise.
+  auto *p = reinterpret_cast<std::promise<void> *>(userdata1);
+  if (status == WGPUQueueWorkDoneStatus_Success) {
+    p->set_value();
+  } else {
+    p->set_exception(std::make_exception_ptr(
+        std::runtime_error("Queue work did not complete successfully.")));
+  }
+  delete p; // free the heap allocation
+}
+
 /**
  * @brief Asynchronously submits a kernel to the GPU queue for execution.
  * It also sets up a callback to notify when the kernel has finished executing
@@ -1930,10 +2081,10 @@ std::future<Kernel> createKernel(Context &ctx, const KernelCode &code,
  *
  * @code
  * std::future<void> dispatchFuture = dispatchKernelAsync(ctx, kernel);
- * WaitForFuture(ctx.instance, dispatchFuture);
+ * wait(ctx, dispatchFuture);
  * @endcode
  */
-inline std::future<void> dispatchKernel(Context &ctx, Kernel &kernel) {
+inline std::future<void> dispatchKernelAsync(Context &ctx, Kernel &kernel) {
   // If the kernel was used before, reset the command buffer.
   if (kernel->used) {
     resetCommandBuffer(ctx.device, kernel);
@@ -1962,6 +2113,23 @@ inline std::future<void> dispatchKernel(Context &ctx, Kernel &kernel) {
   return future;
 }
 
+/**
+ * @brief Synchronous wrapper for dispatchKernelAsync. This function submits
+ * the kernel to the GPU queue and waits for it to finish executing.
+ *
+ * @param[in] ctx Context instance to manage the kernel, from which the queue
+ * for the GPU is obtained
+ * @param[in] kernel Kernel instance to dispatch
+ *
+ * @code
+ * dispatchKernel(ctx, kernel);
+ * @endcode
+ */
+inline void dispatchKernel(Context &ctx, Kernel &kernel) {
+  auto future = dispatchKernelAsync(ctx, kernel);
+  wait(ctx, future);
+}
+
 } // namespace gpu
 
 #endif // GPU_H
diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp
index 21a0005..c183754 100644
--- a/numeric_types/half.cpp
+++ b/numeric_types/half.cpp
@@ -185,12 +185,11 @@ void testContainers() {
     testRoundTrip(h[3]);
   }
   {
-    Context ctx = waitForContext();
+    Context ctx = createContext();
     std::array<half, 8> h = {1.0f, 0.5f, 2.0f, 3.14f, 1.0, 2.0, 3.0, 4.0};
     Tensor devH = createTensor(ctx, {h.size()}, kf16, h.data());
     std::array<half, 8> h2;
-    std::future<void> toCPUFuture = toCPU(ctx, devH, h2.data(), sizeof(h2));
-    waitForFuture(ctx.instance, toCPUFuture);
+    toCPU(ctx, devH, h2.data(), sizeof(h2));
     for (int i = 0; i < 8; ++i) {
       printResult(h[i].data == h2[i].data, "Container round trip",
                   static_cast<float>(h[i]), static_cast<float>(h2[i]));
@@ -215,14 +214,13 @@ fn main(
   }
 }
 )";
-  std::future<Context> futureContext = createContext(
+  Context ctx = createContext(
       {}, {},
      /*device descriptor, enabling f16 in WGSL*/
       {
           .requiredFeatureCount = 1,
          .requiredFeatures =
               std::array<WGPUFeatureName, 1>{WGPUFeatureName_ShaderF16}.data(),
       });
-  Context ctx = waitForContextFuture(futureContext);
   static constexpr size_t N = 10000;
   std::array<half, N> inputArr, outputArr;
   for (int i = 0; i < N; ++i) {
@@ -230,20 +228,17 @@ fn main(
   }
   Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data());
   Tensor output = createTensor(ctx, Shape{N}, kf16);
-  std::future<Kernel> kernelFuture = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
-                           {cdiv(N, 256), 1, 1});
-  Kernel op = waitForFuture(ctx.instance, kernelFuture);
-  std::future<void> dispatchFuture = dispatchKernel(ctx, op);
-  waitForFuture(ctx.instance, dispatchFuture);
-  std::future<void> toCPUFuture = toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
-  waitForFuture(ctx.instance, toCPUFuture);
+  Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
+                           {cdiv(N, 256), 1, 1});
+  dispatchKernel(ctx, op);
+  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   for (int i = 0; i < 12; ++i) {
     printf("  gelu(%.2f) = %.2f\n", static_cast(inputArr[i]),
static_cast(outputArr[i])); } } -int testMain() { +int testHalfMain() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); From 95e587d71d25ab74207648ca91500a7594bff870 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Thu, 20 Feb 2025 16:34:35 -0600 Subject: [PATCH 04/17] refactors the byIdx context function and sets USE_DAWN_API compile def on native --- cmake/dawn.cmake | 2 + cmake/gpu.cmake | 2 + examples/hello_world/run.cpp | 6 + gpu.hpp | 308 ++++++++++++++++++++++------------- 4 files changed, 201 insertions(+), 117 deletions(-) diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index 2ead9ae..c6fed94 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -7,6 +7,8 @@ if(EMSCRIPTEN) set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) +else() + add_compile_definitions(USE_DAWN_API) endif() # Enable find for no dawn rebuilds with flutter run diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index 6cce9e6..f936991 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -32,7 +32,9 @@ add_library(gpu STATIC ${GPU_SOURCES} ${GPU_HEADERS}) set_target_properties(gpu PROPERTIES LINKER_LANGUAGE CXX) target_include_directories(gpu PUBLIC "${PROJECT_ROOT}") if(NOT EMSCRIPTEN) + target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/include/dawn/") + target_include_directories(gpu PUBLIC "${DAWN_DIR}/include/") else() target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") diff --git a/examples/hello_world/run.cpp b/examples/hello_world/run.cpp index 77549cf..b44934b 100644 --- a/examples/hello_world/run.cpp +++ b/examples/hello_world/run.cpp @@ -28,7 +28,13 @@ int main(int argc, char **argv) { printf("--------------\n\n"); // std::unique_ptr ctx = createContext(); + #ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); + auto adaptersList = listAdapters(ctx); + LOG(kDefLog, kInfo, "Available GPU adapters:\n%s", adaptersList.c_str()); + #else Context ctx = createContext(); + #endif static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { diff --git a/gpu.hpp b/gpu.hpp index e050c87..906371c 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -16,9 +16,8 @@ #include // std::pair #include -#ifndef __EMSCRIPTEN__ -#else +#ifdef __EMSCRIPTEN__ #include "emscripten/emscripten.h" #endif @@ -255,6 +254,26 @@ inline std::string toString(const Shape &shape) { */ inline std::string toString(size_t value) { return std::to_string(value); } +/** + * @brief Converts a WGPUStringView to an std::string. + * + * If the view's data is null, an empty string is returned. If the view's + * length equals WGPU_STRLEN, it is assumed to be null‑terminated; otherwise, + * the explicit length is used. + * + * @param strView The WGPUStringView to convert. + * @return std::string The resulting standard string. + */ +inline std::string formatWGPUStringView(WGPUStringView strView) { + if (!strView.data) { + return ""; + } + if (strView.length == WGPU_STRLEN) { + return std::string(strView.data); + } + return std::string(strView.data, strView.length); +} + /** * @brief simple in-place string replacement helper function for substituting * placeholders in a WGSL string template. 
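The new helper covers the three states a WGPUStringView can be in; a small sketch as assertions, with the string literal purely illustrative:

```cpp
#include <cassert>

const char *msg = "NVIDIA GeForce RTX"; // illustrative adapter description
WGPUStringView nullView{.data = nullptr, .length = 0};
WGPUStringView terminated{.data = msg, .length = WGPU_STRLEN}; // null-terminated
WGPUStringView sized{.data = msg, .length = 6}; // explicit length

assert(formatWGPUStringView(nullView).empty());
assert(formatWGPUStringView(terminated) == "NVIDIA GeForce RTX");
assert(formatWGPUStringView(sized) == "NVIDIA");
```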
@@ -1076,136 +1095,191 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {}, return waitForContextFuture(contextFuture); } -#ifdef USE_DAWN_API +#ifndef __EMSCRIPTEN__ +#if USE_DAWN_API /** - * @brief Factory function to create a GPU context, which aggregates WebGPU API - * handles to interact with the GPU including the instance, adapter, device, and - * queue. + * @brief Retrieves the list of available GPU adapters from the Dawn instance. * - * The function takes gpu index to support for multi GPUs. - * To activate this function, it needs not only webgpu's headers but also DAWN's - * headers. + * This function creates a Dawn instance using the provided context's instance + * handle, then enumerates and returns the available GPU adapters as a vector. * - * If dawn is used, it also sets up an error callback for device loss. + * @param ctx The Context containing the WebGPU instance handle. + * @return std::vector A vector of available GPU + * adapters. + * + * @code + * std::vector adapters = getAdapters(ctx); + * @endcode + */ +inline std::vector getAdapters(Context &ctx) { + dawn::native::Instance dawnInstance( + reinterpret_cast(ctx.instance)); + return dawnInstance.EnumerateAdapters(); +} + +/** + * @brief Formats the given vector of Dawn adapters into a single concatenated string. * - * @param[in] gpuIdx GPU index - * @param[in] desc Instance descriptor for the WebGPU instance (optional) - * @param[in] devDescriptor Device descriptor for the WebGPU device (optional) - * @return Context instance representing the created GPU context + * This function iterates over each Dawn adapter in the provided vector, retrieves its + * description using the WebGPU API, and converts the description from a WGPUStringView + * to an std::string using the formatWGPUStringView helper. The resulting descriptions + * are concatenated into a single string separated by newline characters. * + * @param adapters A vector of Dawn adapters obtained from a WebGPU instance. + * @return std::string A newline-delimited string listing each adapter's description. + * * @code - * Context ctx = createContextByGpuIdx(1); + * std::string adapterList = formatAdapters(adapters); * @endcode */ -inline Context -createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { - Context context; - { -#ifdef __EMSCRIPTEN__ - // Emscripten does not support the instance descriptor - // and throws an assertion error if it is not nullptr. - context.instance = wgpuCreateInstance(nullptr); -#else - context.instance = wgpuCreateInstance(&desc); -#endif - // check status - check(context.instance, "Initialize WebGPU", __FILE__, __LINE__); +inline std::string formatAdapters(const std::vector &adapters) { + std::string adapterList; + for (size_t i = 0; i < adapters.size(); ++i) { + auto adapterPtr = adapters[i].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + std::string desc = formatWGPUStringView(info.description); + adapterList += "GPU Adapter [" + std::to_string(i) + "]: " + desc + "\n"; + wgpuAdapterInfoFreeMembers(info); + } } + return adapterList; +} - LOG(kDefLog, kInfo, "Requesting adapter"); - { - std::vector adapters = - dawn::native::Instance( - reinterpret_cast(context.instance)) - .EnumerateAdapters(); - LOG(kDefLog, kInfo, "The number of GPUs=%d\n", adapters.size()); - // Note: Second gpu is not available on Macos, but the number of GPUs is 2 - // on Macos. 
- // Calling wgpuAdapterGetInfo function for the second gpu becomes - // segfault. When you check all GPUs on linux, uncomment out following - // codes. - // - // for (size_t i = 0; i < adapters.size(); i++) { - // WGPUAdapterInfo info {}; - // auto ptr = adapters[i].Get(); - // if (ptr && adapters[i]) { - // wgpuAdapterGetInfo(ptr, &info); - // LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", i, info.description); - // wgpuAdapterInfoFreeMembers(info); - // } - // } - - { - LOG(kDefLog, kInfo, "Use GPU(Adapter)[%d]\n", gpuIdx); - auto ptr = adapters[gpuIdx].Get(); - if (ptr) { - WGPUAdapterInfo info{}; - wgpuAdapterGetInfo(ptr, &info); - LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s\n", gpuIdx, - info.description); - wgpuAdapterInfoFreeMembers(info); - } - context.adapter = adapters[gpuIdx].Get(); - dawn::native::GetProcs().adapterAddRef(context.adapter); - } +/** + * @brief Lists the available GPU adapters in the current WebGPU instance. + * + * This function retrieves the list of available GPU adapters using the + * getAdapters helper function, then formats and returns the adapter + * descriptions as a single string using the formatAdapters helper function. + * + * @param ctx The Context containing the WebGPU instance handle. + * @return std::string A newline-delimited string listing each adapter's + * description. + * + * @code + * std::string adapterList = listAdapters(ctx); + * @endcode + */ +inline std::string listAdapters(Context &ctx) { + auto adapters = getAdapters(ctx); + return formatAdapters(adapters); +} + +/** + * @brief Asynchronously creates a GPU context using the specified GPU index. + * + * This function creates a WebGPU instance, retrieves the available GPU + * adapters, and selects the adapter at the specified index. It then requests a + * device from the selected adapter and sets up a logging callback for device + * errors. The function returns a future that will be fulfilled with the + * created Context once all operations are complete. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return std::future A future that will eventually hold the created + * Context. 
+ * + * @code + * std::future contextFuture = createContextByGpuIdxAsync(0); + * Context ctx = waitForContextFuture(contextFuture); + * @endcode + */ +inline std::future +createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + auto promise = std::make_shared>(); + Context ctx; + + ctx.instance = wgpuCreateInstance(&desc); + + if (!ctx.instance) { + promise->set_exception(std::make_exception_ptr( + std::runtime_error("Failed to create WebGPU instance."))); + return promise->get_future(); } + check(ctx.instance, "Initialize WebGPU", __FILE__, __LINE__); - LOG(kDefLog, kInfo, "Requesting device"); - { - struct DeviceData { - WGPUDevice device = nullptr; - bool requestEnded = false; - }; - DeviceData devData; - - auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status, - WGPUDevice device, WGPUStringView message, - void *pUserData, void *) { - DeviceData &devData = *reinterpret_cast(pUserData); - check(status == WGPURequestDeviceStatus_Success, - "Could not get WebGPU device.", __FILE__, __LINE__); - LOG(kDefLog, kTrace, "Device Request succeeded %x", - static_cast(device)); - devData.device = device; - devData.requestEnded = true; - }; + // Use helper functions to obtain and format the adapters. + auto adapters = getAdapters(ctx); - WGPURequestDeviceCallbackInfo deviceCallbackInfo = { - .mode = WGPUCallbackMode_AllowSpontaneous, - .callback = onDeviceRequestEnded, - .userdata1 = &devData, - .userdata2 = nullptr}; - wgpuAdapterRequestDevice(context.adapter, &devDescriptor, - deviceCallbackInfo); - - LOG(kDefLog, kInfo, "Waiting for device request to end"); - while (!devData.requestEnded) { - processEvents(context.instance); - } - LOG(kDefLog, kInfo, "Device request ended"); - assert(devData.requestEnded); - context.device = devData.device; - - WGPULoggingCallbackInfo loggingCallbackInfo = { - .nextInChain = nullptr, - .callback = - [](WGPULoggingType type, WGPUStringView message, void *userdata1, - void *userdata2) { - LOG(kDefLog, kError, "Device logging callback: %.*s", - static_cast(message.length), message.data); - if (type == WGPULoggingType_Error) { - throw std::runtime_error("Device error logged."); - } - }, - .userdata1 = nullptr, - .userdata2 = nullptr}; - wgpuDeviceSetLoggingCallback(context.device, loggingCallbackInfo); + if (gpuIdx >= adapters.size()) { + promise->set_exception( + std::make_exception_ptr(std::runtime_error("Invalid GPU index."))); + return promise->get_future(); + } + LOG(kDefLog, kInfo, "Using GPU Adapter[%d]", gpuIdx); + auto adapterPtr = adapters[gpuIdx].Get(); + if (adapterPtr) { + WGPUAdapterInfo info = {}; + wgpuAdapterGetInfo(adapterPtr, &info); + LOG(kDefLog, kInfo, "GPU(Adapter)[%d] = %s", gpuIdx, + formatWGPUStringView(info.description).c_str()); + wgpuAdapterInfoFreeMembers(info); + } + ctx.adapter = reinterpret_cast(adapterPtr); + dawn::native::GetProcs().adapterAddRef(ctx.adapter); + + LOG(kDefLog, kInfo, "Requesting device"); + // Request the device asynchronously (using our requestDeviceAsync helper). 
+ auto deviceFuture = requestDeviceAsync(ctx.adapter, devDescriptor); + try { + ctx.device = wait(ctx, deviceFuture); + ctx.deviceStatus = WGPURequestDeviceStatus_Success; + } catch (const std::exception &ex) { + promise->set_exception(std::make_exception_ptr(ex)); + return promise->get_future(); } - context.queue = wgpuDeviceGetQueue(context.device); - return context; + + WGPULoggingCallbackInfo loggingCallbackInfo{ + .nextInChain = nullptr, + .callback = + [](WGPULoggingType type, WGPUStringView message, void *userdata1, + void *userdata2) { + LOG(kDefLog, kError, "Device logging callback: %.*s", + static_cast(message.length), message.data); + if (type == WGPULoggingType_Error) { + throw std::runtime_error("Device error logged."); + } + }, + .userdata1 = nullptr, + .userdata2 = nullptr}; + wgpuDeviceSetLoggingCallback(ctx.device, loggingCallbackInfo); + ctx.queue = wgpuDeviceGetQueue(ctx.device); + promise->set_value(std::move(ctx)); + return promise->get_future(); } -#endif + +/** + * @brief Synchronously creates a GPU context using the specified GPU index. + * + * This function calls the asynchronous createContextByGpuIdxAsync function to + * create a GPU context, then waits for its completion using + * waitForContextFuture. The returned Context holds handles to the WebGPU + * instance, adapter, device, and queue, and is used for subsequent GPU + * operations. + * + * @param gpuIdx The index of the GPU adapter to use. + * @param desc Instance descriptor for the WebGPU instance (optional) + * @param devDescriptor Device descriptor for the WebGPU device (optional) + * @return Context The fully initialized GPU context. + * + * @code + * Context ctx = createContextByGpuIdx(0); + * @endcode + */ +inline Context createContextByGpuIdx(int gpuIdx, + const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { + std::future contextFuture = + createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); + return waitForContextFuture(contextFuture); +} + +#endif // USE_DAWN_API +#endif // __EMSCRIPTEN__ /** * @brief Callback function invoked upon completion of an asynchronous GPU From 70d980287f9a7cca8889e166d67b802bc4b69319 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:14:41 -0600 Subject: [PATCH 05/17] tests toCPU, adds offset, adds gpuflow doc, default cmakelists builds test/test_gpu.cpp --- CMakeLists.txt | 18 +++++ docs/gpuflow.md | 78 +++++++++++++++++++ gpu.hpp | 110 ++++++++++++++++---------- test/test_gpu.cpp | 193 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 357 insertions(+), 42 deletions(-) create mode 100644 docs/gpuflow.md create mode 100644 test/test_gpu.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 816cdf3..a17602e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,24 @@ endif() include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/dawn.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/gpu.cmake") +target_link_libraries(gpu PRIVATE webgpu_dawn) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test) + +add_executable(test_gpu ${CMAKE_CURRENT_SOURCE_DIR}/test/test_gpu.cpp) +target_link_libraries(test_gpu PRIVATE gpu) + +# Platform-specific post-build actions (e.g. 
copying DLLs for MSVC)
+if(MSVC)
+    add_custom_command(
+        TARGET test_gpu POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy
+            ${DAWN_BUILD_DIR}/$<CONFIG>/webgpu_dawn.dll
+            $<TARGET_FILE_DIR:test_gpu>
+        COMMENT "Copying webgpu_dawn.dll to the build directory"
+    )
+endif()
+
 add_library(gpud SHARED gpu.hpp)
 set_target_properties(gpud PROPERTIES LINKER_LANGUAGE CXX)
 target_link_libraries(gpud PRIVATE gpu)
diff --git a/docs/gpuflow.md b/docs/gpuflow.md
new file mode 100644
index 0000000..d4eb37a
--- /dev/null
+++ b/docs/gpuflow.md
@@ -0,0 +1,78 @@
+# GPU.cpp Lifecycle
+
+```mermaid
+flowchart TD
+    %% Data Preparation & Upload
+    subgraph "Data Preparation & Upload"
+        A["CPU Data"]
+        B["Define Data Properties<br> (shape, type, size)"]
+        C["Create GPU Buffer<br> (allocate raw buffer)"]
+        D["Create Tensor<br> (allocates Array with one<br> or more buffers<br> and associates Shape)"]
+
+        E["Upload Data via toGPU<br> (raw buffer)<br> toGPU(ctx, data, buffer, size)"]
+        F["Upload Data via toGPU<br> (Tensor overload)<br> toGPU(ctx, data, tensor)"]
+        G["Optional: Upload Kernel Parameters<br> toGPU(ctx, params, Kernel)"]
+    end
+
+    %% Buffer Setup & Bindings
+    subgraph "Buffer & Binding Setup"
+        H["Define Bindings<br> (Bindings, TensorView)"]
+        I["Map GPU buffers<br> to shader bindings<br> (Collection from Tensor<br> or single buffers)"]
+    end
+
+    %% Kernel Setup & Execution
+    subgraph "Kernel Setup & Execution"
+        J["Define KernelCode<br> (WGSL template, workgroup size, precision)"]
+        K["Create Kernel"]
+        L["Dispatch Kernel"]
+    end
+
+    %% GPU Execution & Result Readback
+    subgraph "GPU Execution & Result Readback"
+        M["Kernel Execution<br> (GPU shader runs)"]
+        N["Readback Data<br> (toCPU variants)"]
+    end
+
+    %% Context & Resources
+    O["Context<br> (Device, Queue,<br> TensorPool, KernelPool)"]
+
+    %% Flow Connections
+    A --> B
+    B --> C
+    B --> D
+    C --> E
+    D --> F
+    F --> H
+    E --> H
+    H --> I
+    I --> K
+    J --> K
+    G --- K
+    K --> L
+    L --> M
+    M --> N
+
+    %% Context shared by all stages
+    O --- D
+    O --- E
+    O --- F
+    O --- K
+    O --- L
+    O --- N
+```
+
+Rank 0: Scalar
+Rank 1: Vector
+Rank 2: Matrix
+Rank 3: 3D Tensor (or Cube)
+Rank 4: 4D Tensor
+Rank ..: Higher Dimensional Tensors
+
+
+• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`.
+• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};`
+• gpu::Bindings collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel.
+• The gpu::TensorPool (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup.
+• gpu::KernelCode contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration.
+• The gpu::createKernelAsync/gpu::createKernel functions (within the Execution Flow) use the gpu::Context, gpu::Bindings, and gpu::KernelCode to configure and construct a gpu::Kernel that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.).
+• gpu::KernelCode’s workgroup size (a gpu::Shape) defines the dispatch configuration, and the gpu::Kernel eventually uses the underlying gpu::Array (contains WGPUBuffer, WGPUBufferUsage, size_t) and gpu::Shape data from the created Tensor.
diff --git a/gpu.hpp b/gpu.hpp
index 906371c..931d646 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -16,7 +16,6 @@
 #include <utility> // std::pair
 #include 
-
 #ifdef __EMSCRIPTEN__
 #include "emscripten/emscripten.h"
 #endif
@@ -1106,7 +1105,7 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {},
  * @param ctx The Context containing the WebGPU instance handle.
  * @return std::vector<dawn::native::Adapter> A vector of available GPU
  * adapters.
- *
+ *
  * @code
  * std::vector<dawn::native::Adapter> adapters = getAdapters(ctx);
  * @endcode
@@ -1118,21 +1117,25 @@ inline std::vector<dawn::native::Adapter> getAdapters(Context &ctx) {
 }
 
 /**
- * @brief Formats the given vector of Dawn adapters into a single concatenated string.
+ * @brief Formats the given vector of Dawn adapters into a single concatenated
+ * string.
 *
- * This function iterates over each Dawn adapter in the provided vector, retrieves its
- * description using the WebGPU API, and converts the description from a WGPUStringView
- * to an std::string using the formatWGPUStringView helper. The resulting descriptions
- * are concatenated into a single string separated by newline characters.
+ * This function iterates over each Dawn adapter in the provided vector,
+ * retrieves its description using the WebGPU API, and converts the description
+ * from a WGPUStringView to an std::string using the formatWGPUStringView
+ * helper. The resulting descriptions are concatenated into a single string
+ * separated by newline characters.
 *
 * @param adapters A vector of Dawn adapters obtained from a WebGPU instance.
- * @return std::string A newline-delimited string listing each adapter's description.
+ * @return std::string A newline-delimited string listing each adapter's
+ * description.
+ * * @code * std::string adapterList = formatAdapters(adapters); * @endcode */ -inline std::string formatAdapters(const std::vector &adapters) { +inline std::string +formatAdapters(const std::vector &adapters) { std::string adapterList; for (size_t i = 0; i < adapters.size(); ++i) { auto adapterPtr = adapters[i].Get(); @@ -1157,7 +1160,7 @@ inline std::string formatAdapters(const std::vector &adap * @param ctx The Context containing the WebGPU instance handle. * @return std::string A newline-delimited string listing each adapter's * description. - * + * * @code * std::string adapterList = listAdapters(ctx); * @endcode @@ -1181,7 +1184,7 @@ inline std::string listAdapters(Context &ctx) { * @param devDescriptor Device descriptor for the WebGPU device (optional) * @return std::future A future that will eventually hold the created * Context. - * + * * @code * std::future contextFuture = createContextByGpuIdxAsync(0); * Context ctx = waitForContextFuture(contextFuture); @@ -1270,9 +1273,9 @@ createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, * Context ctx = createContextByGpuIdx(0); * @endcode */ -inline Context createContextByGpuIdx(int gpuIdx, - const WGPUInstanceDescriptor &desc = {}, - const WGPUDeviceDescriptor &devDescriptor = {}) { +inline Context +createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {}, + const WGPUDeviceDescriptor &devDescriptor = {}) { std::future contextFuture = createContextByGpuIdxAsync(gpuIdx, desc, devDescriptor); return waitForContextFuture(contextFuture); @@ -1365,17 +1368,19 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, /** * @brief Copies data from a GPU buffer to CPU memory. * @param[in] ctx Context instance to manage the operation - * @param[in] tensor Tensor instance representing the GPU buffer to copy from * @param[out] data Pointer to the CPU memory to copy the data to * @param[in] bufferSize Size of the data buffer in bytes * @param[in] op StagingBuffer instance to manage the operation + * @param[in] sourceOffset Offset in the GPU buffer to start copying from. * * @code * toCPU(ctx, tensor, data, bufferSize); * @endcode */ -inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, - size_t bufferSize, CopyData &op) { + +// NOTE: I think this one is redundant? CopyData not used externally. +inline std::future toCPUAsync(Context &ctx, void *data, size_t bufferSize, + CopyData &op, size_t sourceOffset = 0) { // Submit the command buffer and release it. wgpuQueueSubmit(ctx.queue, 1, &op.commandBuffer); wgpuCommandBufferRelease(op.commandBuffer); @@ -1388,8 +1393,8 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, CallbackData *cbData = new CallbackData{ op.readbackBuffer, // The GPU buffer to be read back. bufferSize, - data, // CPU memory destination. - promise // The promise to be signaled. + data, // CPU memory destination. + promise, // The promise to be signaled. }; // Set up the work-done callback to initiate the buffer mapping. @@ -1402,6 +1407,11 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Begin the asynchronous chain by registering the queue work-done callback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); + // Release the readback buffer as it is no longer needed. 
+  if (op.readbackBuffer) {
+    wgpuBufferRelease(op.readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
@@ -1417,11 +1427,13 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
  *
  * @param[in] ctx Context instance to manage the operation
  * @param[in] tensor Tensor instance representing the GPU buffer to copy from
- * @param[in] bufferSize Size of the data buffer in bytes
+ * @param[in] bufferSize Number of bytes to read back into `data`.
  * @param[out] data Pointer to the CPU memory to copy the data to
+ * @param[in] sourceOffset Offset in the GPU buffer to start copying from.
  */
 inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
-                       size_t bufferSize) {
+                       size_t bufferSize,
+                       size_t sourceOffset = 0) {
   // Create a promise that will later be satisfied when the async copy
   // completes.
   auto promise = std::make_shared<std::promise<void>>();
 
@@ -1430,7 +1442,7 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   WGPUBufferDescriptor readbackBufferDescriptor = {
       .label = {.data = nullptr, .length = 0},
       .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
-      .size = bufferSize,
+      .size = bufferSize, // Size of the readback buffer.
   };
   WGPUBuffer readbackBuffer =
       wgpuDeviceCreateBuffer(ctx.device, &readbackBufferDescriptor);
 
@@ -1438,8 +1450,9 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   // Create a command encoder and record a copy from the tensor GPU buffer
   WGPUCommandEncoder commandEncoder =
       wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
-  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer, 0,
-                                       readbackBuffer, 0, bufferSize);
+  wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, tensor.data.buffer,
+                                       sourceOffset, readbackBuffer, 0,
+                                       bufferSize);
   // Finish recording by creating a command buffer and release the encoder.
   WGPUCommandBuffer commandBuffer =
       wgpuCommandEncoderFinish(commandEncoder, nullptr);
@@ -1472,13 +1485,16 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   // queueWorkDoneCallback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
+  if (readbackBuffer) {
+    wgpuBufferRelease(readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
 inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
-                       size_t size) {
-  // The size (in bytes) for the copy.
-  uint64_t bufferSize = size;
+                       size_t bufferSize,
+                       size_t sourceOffset = 0) {
 
   // Create an operation structure (here we reuse CopyData solely for its
   // members that we need to create a readback buffer and command buffer).
@@ -1503,7 +1519,7 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
   {
     WGPUCommandEncoder commandEncoder =
         wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
-    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, 0,
+    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, buffer, sourceOffset,
                                          op.readbackBuffer, 0, bufferSize);
     op.commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
     wgpuCommandEncoderRelease(commandEncoder);
@@ -1516,10 +1532,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
 
   // Allocate callback data
   CallbackData *cbData = new CallbackData{
-      op.readbackBuffer,               // The readback buffer created above.
-      static_cast<size_t>(bufferSize), // Size of the copy.
-      data,                            // Destination CPU memory.
-      promise                          // Our promise to satisfy when done.
+      op.readbackBuffer, // The readback buffer created above.
+      bufferSize,        // Size of the copy.
+      data,              // Destination CPU memory.
+      promise            // Our promise to satisfy when done.
   };
 
   // Set up the queue work-done callback info.
@@ -1532,6 +1548,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
   // Start the asynchronous chain by registering the work-done callback.
   wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
+  if (op.readbackBuffer) {
+    wgpuBufferRelease(op.readbackBuffer);
+  }
+
   return promise->get_future();
 }
 
@@ -1548,9 +1568,11 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
  * @endcode
  */
 template <size_t N>
-inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
-                                    std::array<float, N> &data) {
-  return toCPUAsync(ctx, tensor, data.data(), sizeof(data));
+inline std::future<void>
+toCPUAsync(Context &ctx, Tensor &tensor, std::array<float, N> &data,
+           size_t sourceOffset = 0) {
+  return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset
+  );
 }
 
 /**
@@ -1571,8 +1593,10 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
  * toCPU(ctx, tensor, data, bufferSize, instance);
  * @endcode
  */
-inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
-  auto future = toCPUAsync(ctx, tensor, data, bufferSize);
+inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
+                  size_t sourceOffset = 0) {
+  auto future =
+      toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset);
   wait(ctx, future);
 }
 
@@ -1593,8 +1617,9 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
  * toCPU(ctx, buffer, data, size, instance);
  * @endcode
  */
-inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
-  auto future = toCPUAsync(ctx, buffer, data, size);
+inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size,
+                  size_t sourceOffset = 0) {
+  auto future = toCPUAsync(ctx, buffer, data, size, sourceOffset);
   wait(ctx, future);
 }
 
@@ -1616,8 +1641,9 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
  * @endcode
  */
 template <size_t N>
-inline void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data) {
-  auto future = toCPUAsync(ctx, tensor, data);
+inline void toCPU(Context &ctx, Tensor &tensor, std::array<float, N> &data,
+                  size_t sourceOffset = 0) {
+  auto future = toCPUAsync(ctx, tensor, data, sourceOffset);
   wait(ctx, future);
 }
 

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
new file mode 100644
index 0000000..2cc4290
--- /dev/null
+++ b/test/test_gpu.cpp
@@ -0,0 +1,193 @@
+//// filepath: /d:/Code/git/forks/gpu.cpp/test/test_gpu_integration.cpp
+#include "gpu.hpp"
+#include <array>
+#include <cstring>
+#include <cassert>
+#include <future>
+#include <memory>
+#include <vector>
+
+using namespace gpu;
+
+// A simple WGSL copy kernel that copies input to output.
+static const char *kCopyKernel = R"(
+@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
+@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
+@compute @workgroup_size({{workgroupSize}})
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+  let i: u32 = gid.x;
+  if (i < arrayLength(&inp)) {
+    out[i] = inp[i];
+  }
+}
+)";
+
+// Test using the overload that takes a Tensor.
+void testToCPUWithTensor() {
+    LOG(kDefLog, kInfo, "Running testToCPUWithTensor...");
+
+    // Create a real GPU context.
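+    // Two context-creation paths: USE_DAWN_API builds select a physical
+    // adapter by index via createContextByGpuIdx(0); other builds use the
+    // default adapter chosen by createContext().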
+    #ifdef USE_DAWN_API
+    Context ctx = createContextByGpuIdx(0);
+    #else
+    Context ctx = createContext();
+    #endif
+
+    constexpr size_t N = 1024;
+    std::array<float, N> inputData, outputData;
+    for (size_t i = 0; i < N; ++i) {
+        inputData[i] = static_cast<float>(i);
+        outputData[i] = 0.0f;
+    }
+
+    // Create input and output tensors.
+    Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data());
+    Tensor outputTensor = createTensor(ctx, Shape{N}, kf32);
+
+    // Create and dispatch the copy kernel.
+    Kernel copyKernel = createKernel(ctx, {kCopyKernel, 256, kf32},
+                                     Bindings{inputTensor, outputTensor},
+                                     {cdiv(N, 256), 1, 1});
+    dispatchKernel(ctx, copyKernel);
+
+    // Synchronously copy GPU output to CPU using the tensor overload.
+    toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData));
+
+    // Verify the output matches the input.
+    for (size_t i = 0; i < N; ++i) {
+        LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
+        LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+        assert(outputData[i] == inputData[i]);
+    }
+    LOG(kDefLog, kInfo, "testToCPUWithTensor passed.");
+}
+
+// Test using the overload that takes a raw GPU buffer.
+// We reuse the Tensor's underlying buffer for this test.
+void testToCPUWithBuffer() {
+    LOG(kDefLog, kInfo, "Running testToCPUWithBuffer...");
+
+    #ifdef USE_DAWN_API
+    Context ctx = createContextByGpuIdx(0);
+    #else
+    Context ctx = createContext();
+    #endif
+
+    constexpr size_t N = 1024;
+    std::array<float, N> data, outputData;
+    for (size_t i = 0; i < N; ++i) {
+        data[i] = static_cast<float>(i * 2);
+        outputData[i] = 0.0f;
+    }
+
+    // Create a tensor to allocate a GPU buffer and initialize it.
+    Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data());
+
+    // Now extract the raw GPU buffer from the tensor.
+    WGPUBuffer gpuBuffer = tensor.data.buffer;
+
+    // Use the WGPUBuffer overload. This call returns a future.
+    auto future = toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0);
+    wait(ctx, future);
+
+    // Verify that the CPU output matches the original data.
+    for (size_t i = 0; i < N; ++i) {
+        LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+        assert(outputData[i] == data[i]);
+    }
+    LOG(kDefLog, kInfo, "testToCPUWithBuffer passed.");
+}
+
+void testToCPUWithTensorSourceOffset() {
+    LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset...");
+#ifdef USE_DAWN_API
+    Context ctx = createContextByGpuIdx(0);
+#else
+    Context ctx = createContext();
+#endif
+
+    constexpr size_t numElements = 25;
+    constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements
+    constexpr size_t copyCount = 10;           // Number of floats to copy
+    size_t copySize = copyCount * sizeof(float);
+
+    // Create an input array with known data.
+    std::array<float, numElements> inputData{};
+    for (size_t i = 0; i < numElements; ++i) {
+        inputData[i] = static_cast<float>(i + 50); // Arbitrary values
+    }
+    // Create a tensor from the full data.
+    Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data());
+
+    // Allocate a destination CPU buffer exactly as large as the data we want to copy.
+    std::vector<float> cpuOutput(copyCount, -1.0f);
+
+    // Set sourceOffset to skip the first few float elements
+    size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float);
+    // Call the tensor overload with sourceOffset and destOffset = 0.
+    auto future = toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes);
+    wait(ctx, future);
+
+    // Verify the copied data matches the expected subset.
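+    // Example of the offset arithmetic: sourceOffsetElements = 5, so the byte
+    // offset is 5 * sizeof(float) = 20 and cpuOutput[0] should equal
+    // inputData[5], cpuOutput[1] should equal inputData[6], and so on.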
+    for (size_t i = 0; i < copyCount; ++i) {
+        float expected = inputData[sourceOffsetElements + i];
+        float actual = cpuOutput[i];
+        LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+        LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+        assert(expected == actual);
+    }
+    LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed.");
+}
+
+void testToCPUWithBufferSourceOffset() {
+    LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset...");
+#ifdef USE_DAWN_API
+    Context ctx = createContextByGpuIdx(0);
+#else
+    Context ctx = createContext();
+#endif
+
+    constexpr size_t numElements = 30;
+    constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements
+    constexpr size_t copyCount = 12;           // Number of floats to copy
+    size_t copySize = copyCount * sizeof(float);
+
+    // Create an input array with arbitrary data.
+    std::array<float, numElements> inputData{};
+    for (size_t i = 0; i < numElements; ++i) {
+        inputData[i] = static_cast<float>(i + 100);
+    }
+    // Create a tensor to initialize a GPU buffer.
+    Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data());
+    // Extract the raw GPU buffer from the tensor.
+    WGPUBuffer buffer = tensor.data.buffer;
+
+    // Allocate a destination CPU buffer exactly as large as needed.
+    std::vector<float> cpuOutput(copyCount, -2.0f);
+    size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float);
+
+    // Call the buffer overload with sourceOffset and destOffset = 0.
+    auto future = toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes);
+    wait(ctx, future);
+
+    // Verify that the copied data matches the expected subset.
+    for (size_t i = 0; i < copyCount; ++i) {
+        float expected = inputData[sourceOffsetElements + i];
+        float actual = cpuOutput[i];
+        LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+        LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+        assert(expected == actual);
+    }
+    LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed.");
+}
+
+int main() {
+    LOG(kDefLog, kInfo, "Running GPU integration tests...");
+    testToCPUWithTensor();
+    testToCPUWithBuffer();
+    testToCPUWithTensorSourceOffset();
+    testToCPUWithBufferSourceOffset();
+    LOG(kDefLog, kInfo, "All tests passed.");
+    return 0;
+}
\ No newline at end of file

From 16feb9e9f32e8cc2bbc12019a448c856a061d19f Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Fri, 21 Feb 2025 22:16:44 -0600
Subject: [PATCH 06/17] remove path

---
 test/test_gpu.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index 2cc4290..0954e44 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -1,4 +1,3 @@
-//// filepath: /d:/Code/git/forks/gpu.cpp/test/test_gpu_integration.cpp
 #include "gpu.hpp"
 #include <array>
 #include <cstring>

From e61e80917a73406e8fb8af5a94c743982231f51b Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Fri, 21 Feb 2025 22:18:53 -0600
Subject: [PATCH 07/17] format

---
 test/test_gpu.cpp | 294 +++++++++++++++++++++++-----------------------
 1 file changed, 149 insertions(+), 145 deletions(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index 0954e44..48aa1bc 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -1,7 +1,7 @@
 #include "gpu.hpp"
 #include <array>
-#include <cstring>
 #include <cassert>
+#include <cstring>
 #include <future>
 #include <memory>
 #include <vector>
@@ -24,169 +24,173 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
 
 // Test using the overload that takes a Tensor.
 void testToCPUWithTensor() {
-    LOG(kDefLog, kInfo, "Running testToCPUWithTensor...");
-
-    // Create a real GPU context.
- #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); - #else - Context ctx = createContext(); - #endif - - constexpr size_t N = 1024; - std::array inputData, outputData; - for (size_t i = 0; i < N; ++i) { - inputData[i] = static_cast(i); - outputData[i] = 0.0f; - } - - // Create input and output tensors. - Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); - Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); - - // Create and dispatch the copy kernel. - Kernel copyKernel = createKernel(ctx, {kCopyKernel, 256, kf32}, - Bindings{inputTensor, outputTensor}, - {cdiv(N, 256), 1, 1}); - dispatchKernel(ctx, copyKernel); - - // Synchronously copy GPU output to CPU using the tensor overload. - toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); - - // Verify the output matches the input. - for (size_t i = 0; i < N; ++i) { - LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); - LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); - assert(outputData[i] == inputData[i]); - } - LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); + LOG(kDefLog, kInfo, "Running testToCPUWithTensor..."); + +// Create a real GPU context. +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i); + outputData[i] = 0.0f; + } + + // Create input and output tensors. + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + Tensor outputTensor = createTensor(ctx, Shape{N}, kf32); + + // Create and dispatch the copy kernel. + Kernel copyKernel = + createKernel(ctx, {kCopyKernel, 256, kf32}, + Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Synchronously copy GPU output to CPU using the tensor overload. + toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData)); + + // Verify the output matches the input. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]); + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithTensor passed."); } // Test using the overload that takes a raw GPU buffer. // We reuse the Tensor's underlying buffer for this test. void testToCPUWithBuffer() { - LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); - - #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); - #else - Context ctx = createContext(); - #endif - - constexpr size_t N = 1024; - std::array data, outputData; - for (size_t i = 0; i < N; ++i) { - data[i] = static_cast(i * 2); - outputData[i] = 0.0f; - } - - // Create a tensor to allocate a GPU buffer and initialize it. - Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); - - // Now extract the raw GPU buffer from the tensor. - WGPUBuffer gpuBuffer = tensor.data.buffer; - - // Use the WGPUBuffer overload. This call returns a future. - auto future = toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); - wait(ctx, future); - - // Verify that the CPU output matches the original data. 
- for (size_t i = 0; i < N; ++i) { - LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); - assert(outputData[i] == data[i]); - } - LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); + LOG(kDefLog, kInfo, "Running testToCPUWithBuffer..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array data, outputData; + for (size_t i = 0; i < N; ++i) { + data[i] = static_cast(i * 2); + outputData[i] = 0.0f; + } + + // Create a tensor to allocate a GPU buffer and initialize it. + Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data()); + + // Now extract the raw GPU buffer from the tensor. + WGPUBuffer gpuBuffer = tensor.data.buffer; + + // Use the WGPUBuffer overload. This call returns a future. + auto future = + toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0); + wait(ctx, future); + + // Verify that the CPU output matches the original data. + for (size_t i = 0; i < N; ++i) { + LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]); + assert(outputData[i] == data[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithBuffer passed."); } void testToCPUWithTensorSourceOffset() { - LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); + LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset..."); #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); + Context ctx = createContextByGpuIdx(0); #else - Context ctx = createContext(); + Context ctx = createContext(); #endif - constexpr size_t numElements = 25; - constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements - constexpr size_t copyCount = 10; // Number of floats to copy - size_t copySize = copyCount * sizeof(float); - - // Create an input array with known data. - std::array inputData{}; - for (size_t i = 0; i < numElements; ++i) { - inputData[i] = static_cast(i + 50); // Arbitrary values - } - // Create a tensor from the full data. - Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); - - // Allocate a destination CPU buffer exactly as large as the data we want to copy. - std::vector cpuOutput(copyCount, -1.0f); - - // Set sourceOffset to skip the first few float elements - size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); - // Call the tensor overload with sourceOffset and destOffset = 0. - auto future = toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); - wait(ctx, future); - - // Verify the copied data matches the expected subset. - for (size_t i = 0; i < copyCount; ++i) { - float expected = inputData[sourceOffsetElements + i]; - float actual = cpuOutput[i]; - LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); - LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); - assert(expected == actual); - } - LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); + constexpr size_t numElements = 25; + constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements + constexpr size_t copyCount = 10; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with known data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 50); // Arbitrary values + } + // Create a tensor from the full data. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + + // Allocate a destination CPU buffer exactly as large as the data we want to + // copy. 
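+  // The -1.0f fill value acts as a sentinel: any element still equal to
+  // -1.0f after the readback was never written by the copy.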
+ std::vector cpuOutput(copyCount, -1.0f); + + // Set sourceOffset to skip the first few float elements + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + // Call the tensor overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify the copied data matches the expected subset. + for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed."); } void testToCPUWithBufferSourceOffset() { - LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); + LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset..."); #ifdef USE_DAWN_API - Context ctx = createContextByGpuIdx(0); + Context ctx = createContextByGpuIdx(0); #else - Context ctx = createContext(); + Context ctx = createContext(); #endif - constexpr size_t numElements = 30; - constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements - constexpr size_t copyCount = 12; // Number of floats to copy - size_t copySize = copyCount * sizeof(float); - - // Create an input array with arbitrary data. - std::array inputData{}; - for (size_t i = 0; i < numElements; ++i) { - inputData[i] = static_cast(i + 100); - } - // Create a tensor to initialize a GPU buffer. - Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); - // Extract the raw GPU buffer from the tensor. - WGPUBuffer buffer = tensor.data.buffer; - - // Allocate a destination CPU buffer exactly as large as needed. - std::vector cpuOutput(copyCount, -2.0f); - size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); - - // Call the buffer overload with sourceOffset and destOffset = 0. - auto future = toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); - wait(ctx, future); - - // Verify that the copied data matches the expected subset. - for (size_t i = 0; i < copyCount; ++i) { - float expected = inputData[sourceOffsetElements + i]; - float actual = cpuOutput[i]; - LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); - LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); - assert(expected == actual); - } - LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); + constexpr size_t numElements = 30; + constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements + constexpr size_t copyCount = 12; // Number of floats to copy + size_t copySize = copyCount * sizeof(float); + + // Create an input array with arbitrary data. + std::array inputData{}; + for (size_t i = 0; i < numElements; ++i) { + inputData[i] = static_cast(i + 100); + } + // Create a tensor to initialize a GPU buffer. + Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data()); + // Extract the raw GPU buffer from the tensor. + WGPUBuffer buffer = tensor.data.buffer; + + // Allocate a destination CPU buffer exactly as large as needed. + std::vector cpuOutput(copyCount, -2.0f); + size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float); + + // Call the buffer overload with sourceOffset and destOffset = 0. + auto future = + toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes); + wait(ctx, future); + + // Verify that the copied data matches the expected subset. 
+ for (size_t i = 0; i < copyCount; ++i) { + float expected = inputData[sourceOffsetElements + i]; + float actual = cpuOutput[i]; + LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual); + LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected); + assert(expected == actual); + } + LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed."); } int main() { - LOG(kDefLog, kInfo, "Running GPU integration tests..."); - testToCPUWithTensor(); - testToCPUWithBuffer(); - testToCPUWithTensorSourceOffset(); - testToCPUWithBufferSourceOffset(); - LOG(kDefLog, kInfo, "All tests passed."); - return 0; -} \ No newline at end of file + LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testToCPUWithTensor(); + testToCPUWithBuffer(); + testToCPUWithTensorSourceOffset(); + testToCPUWithBufferSourceOffset(); + LOG(kDefLog, kInfo, "All tests passed."); + return 0; +} From ad8698dc1cb10ac89f020e6920d680328a6200ae Mon Sep 17 00:00:00 2001 From: MichealReed Date: Fri, 21 Feb 2025 22:22:25 -0600 Subject: [PATCH 08/17] doc formatting --- docs/gpuflow.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/gpuflow.md b/docs/gpuflow.md index d4eb37a..420397d 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -61,18 +61,18 @@ flowchart TD O --- N ``` +• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. +• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` +• `gpu::Bindings` collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. +• The `gpu::TensorPool` (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. +• `gpu::KernelCode` contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. +• The `gpu::createKernelAsync/gpu::createKernel` functions (within the Execution Flow) use the `gpu::Context`, `gpu::Bindings`, and `gpu::KernelCode` to configure and construct a `gpu::Kernel` that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.). +• `gpu::KernelCode`’s workgroup size (a `gpu::Shape`) defines the dispatch configuration, and the `gpu::Kernel` eventually uses the underlying `gpu::Array` (contains` WGPUBuffer, WGPUBufferUsage, size_t`) and `gpu::Shape` data from the created Tensor. + +`gpu::Tensor` Ranks: Rank 0: Scalar Rank 1: Vector Rank 2: Matrix Rank 3: 3D Tensor (or Cube) Rank 4: 4D Tensor -Rank ..: Higher Dimensional Tensors - - -• The `gpu::Array` (which wraps a GPU buffer with usage and size) and the `gpu::Shape` (which defines dimensions and rank) are combined—via the creation process—to produce a `gpu::Tensor`. -• A `gpu::TensorView` provides a non‑owning view into a slice of a `gpu::Tensor`. Ex. `TensorView view = {tensor, 0, 256};` -• gpu::Bindings collect multiple Tensors (or TensorViews) along with view offset/size information for use in a kernel. -• The gpu::TensorPool (managed by the Context) is responsible for the lifetime of tensors and GPU resource cleanup. -• gpu::KernelCode contains the WGSL shader template plus metadata (workgroup size, precision, label, and entry point) that drive the kernel configuration. 
-• The gpu::createKernelAsync/gpu::createKernel functions (within the Execution Flow) use the gpu::Context, gpu::Bindings, and gpu::KernelCode to configure and construct a gpu::Kernel that manages all the underlying GPU resources (buffers, bind groups, compute pipeline, etc.).
-• gpu::KernelCode’s workgroup size (a gpu::Shape) defines the dispatch configuration, and the gpu::Kernel eventually uses the underlying gpu::Array (contains WGPUBuffer, WGPUBufferUsage, size_t) and gpu::Shape data from the created Tensor.
+Rank (max 8): Higher Dimensional Tensors
\ No newline at end of file

From 025af2a8f4621ba9612354a6e524044da2188ac3 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Fri, 21 Feb 2025 22:29:25 -0600
Subject: [PATCH 09/17] doc nits

---
 docs/gpuflow.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/gpuflow.md b/docs/gpuflow.md
index 420397d..fee9d4c 100644
--- a/docs/gpuflow.md
+++ b/docs/gpuflow.md
@@ -11,7 +11,7 @@ flowchart TD
     E["Upload Data via toGPU<br/>(raw buffer)<br/>toGPU(ctx, data, buffer, size)"]
     F["Upload Data via toGPU<br/>(Tensor overload)<br/>toGPU(ctx, data, tensor)"]
-    G["Optional: Upload Kernel Parameters<br/>toGPU(ctx, params, Kernel)"]
+    G["Optional:<br/>Kernel Parameters<br/>
toGPU(ctx, params, Kernel)"] end %% Buffer Setup & Bindings From 3776dcd50152ba4fc18ca9029006bd9e9588dca7 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 11:33:30 -0600 Subject: [PATCH 10/17] set project root on root cmakelists --- CMakeLists.txt | 2 +- cmake/gpu.cmake | 1 - docs/gpuflow.md | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a17602e..85911a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # and cmake/gpu.cmake for more details cmake_minimum_required(VERSION 3.28) project(gpu) - +set(PROJECT_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # export compile_commands.json to use with # LSP set(CMAKE_CXX_STANDARD 20) diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index f936991..d991a18 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -39,4 +39,3 @@ else() target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/") target_include_directories(gpu PUBLIC "${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/") endif() - diff --git a/docs/gpuflow.md b/docs/gpuflow.md index fee9d4c..d13a228 100644 --- a/docs/gpuflow.md +++ b/docs/gpuflow.md @@ -75,4 +75,4 @@ Rank 1: Vector Rank 2: Matrix Rank 3: 3D Tensor (or Cube) Rank 4: 4D Tensor -Rank (max 8): Higher Dimensional Tensors \ No newline at end of file +Rank (max 8): Higher Dimensional Tensors From d58e1911b5e015ea073c4e4350d3378a2edf80bd Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 16:15:51 -0600 Subject: [PATCH 11/17] fix linux issue with callback info --- gpu.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 931d646..8c661bc 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -1354,11 +1355,12 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, __LINE__); // Set up the buffer mapping callback information. - WGPUBufferMapCallbackInfo mapCallbackInfo; - mapCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous; - mapCallbackInfo.callback = bufferMapCallback; - mapCallbackInfo.userdata1 = cbData; - mapCallbackInfo.userdata2 = nullptr; + WGPUBufferMapCallbackInfo mapCallbackInfo = { + .mode = WGPUCallbackMode_AllowSpontaneous, + .callback = bufferMapCallback, + .userdata1 = cbData, // Pass the callback data. + .userdata2 = nullptr // No additional user data. + }; // Begin the asynchronous mapping of the readback buffer. wgpuBufferMapAsync(cbData->buffer, WGPUMapMode_Read, 0, cbData->bufferSize, From 498ba74b73962d8b647b844fc570cf758ebaf467 Mon Sep 17 00:00:00 2001 From: MichealReed Date: Sat, 22 Feb 2025 17:54:16 -0600 Subject: [PATCH 12/17] should not release readback buffer --- gpu.hpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/gpu.hpp b/gpu.hpp index 8c661bc..4854338 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -1486,11 +1486,7 @@ inline std::future toCPUAsync(Context &ctx, Tensor &tensor, void *data, // Register the callback. The async chain continues inside // queueWorkDoneCallback. wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo); - - if (readbackBuffer) { - wgpuBufferRelease(readbackBuffer); - } - + return promise->get_future(); } @@ -1550,10 +1546,6 @@ inline std::future toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data, // Start the asynchronous chain by registering the work-done callback. 
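   // The chain: the work-done callback fires once the submitted copy has
   // executed on the queue, then maps the readback buffer; bufferMapCallback
   // memcpys the mapped range into the caller's pointer and fulfills the
   // promise behind the returned future.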
 wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
 
-  if (op.readbackBuffer) {
-    wgpuBufferRelease(op.readbackBuffer);
-  }
-
   return promise->get_future();
 }
 

From 2db9be10fb3f0298294ba199d71eca894746e3a6 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sat, 22 Feb 2025 18:07:09 -0600
Subject: [PATCH 13/17] clean up callback syntax

---
 gpu.hpp | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/gpu.hpp b/gpu.hpp
index 4854338..b057514 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -1306,7 +1306,7 @@ createContextByGpuIdx(int gpuIdx, const WGPUInstanceDescriptor &desc = {},
  */
 inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message,
                               void *userdata1, void * /*userdata2*/) {
-  CallbackData *cbData = reinterpret_cast<CallbackData *>(userdata1);
+  const CallbackData *cbData = static_cast<const CallbackData *>(userdata1);
   // Check that mapping succeeded.
   check(status == WGPUMapAsyncStatus_Success, "Map readbackBuffer", __FILE__,
         __LINE__);
@@ -1349,17 +1349,17 @@ inline void bufferMapCallback(WGPUMapAsyncStatus status, WGPUStringView message,
  */
 inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
                                   void *userdata1, void * /*userdata2*/) {
-  CallbackData *cbData = reinterpret_cast<CallbackData *>(userdata1);
+  const CallbackData *cbData = static_cast<const CallbackData *>(userdata1);
   // Ensure the queue work finished successfully.
   check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done", __FILE__,
         __LINE__);
 
   // Set up the buffer mapping callback information.
   WGPUBufferMapCallbackInfo mapCallbackInfo = {
-      .mode = WGPUCallbackMode_AllowSpontaneous,
-      .callback = bufferMapCallback,
-      .userdata1 = cbData, // Pass the callback data.
-      .userdata2 = nullptr // No additional user data.
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = bufferMapCallback,
+      .userdata1 = const_cast<CallbackData *>(cbData), // Pass the callback data.
+      .userdata2 = nullptr // No additional user data.
   };
 
   // Begin the asynchronous mapping of the readback buffer.
@@ -1400,11 +1400,11 @@ inline std::future<void> toCPUAsync(Context &ctx, void *data, size_t bufferSize,
   };
 
   // Set up the work-done callback to initiate the buffer mapping.
-  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo;
-  workDoneCallbackInfo.mode = WGPUCallbackMode_AllowSpontaneous;
-  workDoneCallbackInfo.callback = queueWorkDoneCallback;
-  workDoneCallbackInfo.userdata1 = cbData; // Pass the callback data.
-  workDoneCallbackInfo.userdata2 = nullptr;
+  WGPUQueueWorkDoneCallbackInfo workDoneCallbackInfo = {
+      .mode = WGPUCallbackMode_AllowSpontaneous,
+      .callback = queueWorkDoneCallback,
+      .userdata1 = const_cast<CallbackData *>(cbData),
+      .userdata2 = nullptr};
 
   // Begin the asynchronous chain by registering the queue work-done callback.
@@ -1486,7 +1486,7 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
   // Register the callback. The async chain continues inside
   // queueWorkDoneCallback.
 wgpuQueueOnSubmittedWorkDone(ctx.queue, workDoneCallbackInfo);
-  
+
   return promise->get_future();
 }
 
@@ -1562,11 +1562,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
  * @endcode
  */
 template <size_t N>
-inline std::future<void>
-toCPUAsync(Context &ctx, Tensor &tensor, std::array<float, N> &data,
-           size_t sourceOffset = 0) {
-  return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset
-  );
+inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
+                                    std::array<float, N> &data,
+                                    size_t sourceOffset = 0) {
+  return toCPUAsync(ctx, tensor, data.data(), sizeof(data), sourceOffset);
 }
 
 /**
@@ -1589,8 +1588,7 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
  */
 inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
                   size_t sourceOffset = 0) {
-  auto future =
-      toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset);
+  auto future = toCPUAsync(ctx, tensor, data, bufferSize, sourceOffset);
   wait(ctx, future);
 }
 

From 752a53a3d426fb5bb87a89f31b601817adea25c7 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sat, 22 Feb 2025 18:38:09 -0600
Subject: [PATCH 14/17] add stress test

---
 test/test_gpu.cpp | 78 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 69 insertions(+), 9 deletions(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index 48aa1bc..99a1af6 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -1,12 +1,34 @@
 #include "gpu.hpp"
 #include <array>
 #include <cassert>
+#include <chrono>
 #include <cstring>
 #include <future>
 #include <memory>
 #include <vector>
 
 using namespace gpu;
+using namespace std::chrono;
+
+
+// Forward declarations:
+void testToCPUWithTensor();
+void testToCPUWithBuffer();
+void testToCPUWithTensorSourceOffset();
+void testToCPUWithBufferSourceOffset();
+void stressTestToCPU();
+
+int main() {
+  LOG(kDefLog, kInfo, "Running GPU integration tests...");
+  testToCPUWithTensor();
+  testToCPUWithBuffer();
+  testToCPUWithTensorSourceOffset();
+  testToCPUWithBufferSourceOffset();
+  stressTestToCPU();
+  LOG(kDefLog, kInfo, "All tests passed.");
+  return 0;
+}
+
 
 // A simple WGSL copy kernel that copies input to output.
 static const char *kCopyKernel = R"(
@@ -22,6 +44,7 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
 }
 )";
 
+
 // Test using the overload that takes a Tensor.
 void testToCPUWithTensor() {
@@ -185,12 +208,49 @@ void testToCPUWithBufferSourceOffset() {
   LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed.");
 }
 
-int main() {
-  LOG(kDefLog, kInfo, "Running GPU integration tests...");
-  testToCPUWithTensor();
-  testToCPUWithBuffer();
-  testToCPUWithTensorSourceOffset();
-  testToCPUWithBufferSourceOffset();
-  LOG(kDefLog, kInfo, "All tests passed.");
-  return 0;
-}
+void stressTestToCPU() {
+  LOG(kDefLog, kInfo, "Running stressTestToCPU for 2 seconds...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  // Create a persistent tensor with some test data.
+  std::vector<float> inputData(N, 0.0f);
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<float>(i);
+  }
+  Tensor tensor = createTensor(ctx, Shape{N}, kf32, inputData.data());
+
+  // Prepare to run for two seconds.
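+  // Each iteration reads back the full tensor: 1024 floats = 4096 bytes per
+  // operation, so the throughput figure below counts completed 4 KiB
+  // readbacks per second.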
+  auto startTime = high_resolution_clock::now();
+  std::vector<std::future<void>> futures;
+  size_t opCount = 0;
+  while (high_resolution_clock::now() - startTime < seconds(2)) {
+    // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes)
+    auto outputData = std::make_shared<std::vector<float>>(N, 0.0f);
+    // Use the tensor overload; we’re copying the entire tensor (sourceOffset = 0)
+    LOG(kDefLog, kInfo, "Copying %zu bytes from GPU to CPU...", N * sizeof(float));
+    // log count
+    LOG(kDefLog, kInfo, "opCount = %zu", opCount);
+    auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0);
+    futures.push_back(std::move(fut));
+    ++opCount;
+  }
+
+  // Wait for all submitted operations to complete.
+  for (auto &f : futures) {
+    wait(ctx, f);
+  }
+
+  auto endTime = high_resolution_clock::now();
+  auto totalMs = duration_cast<milliseconds>(endTime - startTime).count();
+  double throughput = (opCount / (totalMs / 1000.0));
+
+  LOG(kDefLog, kInfo, "Stress test completed:\n"
+      " %zu GPU to CPU operations in %lld ms\n"
+      " Throughput: %.2f ops/sec", opCount, totalMs, throughput);
+}
\ No newline at end of file

From 5f82ff4d9e0fdd1de7f2ccf8e0a0a6d8e981b2fb Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sat, 22 Feb 2025 19:07:20 -0600
Subject: [PATCH 15/17] linux has a segfault if wait for events after.

---
 test/test_gpu.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index 99a1af6..aa42b83 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -227,7 +227,6 @@ void stressTestToCPU() {
 
   // Prepare to run for two seconds.
   auto startTime = high_resolution_clock::now();
-  std::vector<std::future<void>> futures;
   size_t opCount = 0;
   while (high_resolution_clock::now() - startTime < seconds(2)) {
     // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes)
@@ -237,14 +236,9 @@ void stressTestToCPU() {
     // log count
     LOG(kDefLog, kInfo, "opCount = %zu", opCount);
     auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0);
-    futures.push_back(std::move(fut));
+    wait(ctx, fut);
     ++opCount;
   }
-
-  // Wait for all submitted operations to complete.
-  for (auto &f : futures) {
-    wait(ctx, f);
-  }
 
   auto endTime = high_resolution_clock::now();
   auto totalMs = duration_cast<milliseconds>(endTime - startTime).count();

From 28dabf277eebb9fb5541870014287a9d7f533036 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sun, 23 Feb 2025 10:22:27 -0600
Subject: [PATCH 16/17] EOF newline

---
 test/test_gpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp
index aa42b83..b855712 100644
--- a/test/test_gpu.cpp
+++ b/test/test_gpu.cpp
@@ -247,4 +247,4 @@ void stressTestToCPU() {
   LOG(kDefLog, kInfo, "Stress test completed:\n"
       " %zu GPU to CPU operations in %lld ms\n"
       " Throughput: %.2f ops/sec", opCount, totalMs, throughput);
-}
\ No newline at end of file
+}

From 39c816ca6b4ba0dff8808b680e0cf8f7b36973d4 Mon Sep 17 00:00:00 2001
From: MichealReed
Date: Sat, 1 Mar 2025 17:34:42 -0600
Subject: [PATCH 17/17] added sleeptime optional arg

---
 gpu.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu.hpp b/gpu.hpp
index b057514..69ed0e9 100644
--- a/gpu.hpp
+++ b/gpu.hpp
@@ -869,7 +869,7 @@ template <typename T> T wait(Context &ctx, std::future<T> &f) {
  * Context ctx = waitForContextFuture(contextFuture);
  * @endcode
  */
-template <typename T> T waitForContextFuture(std::future<T> &f) {
+template <typename T> T waitForContextFuture(std::future<T> &f, size_t sleepTime = 10) {
 #ifdef __EMSCRIPTEN__
   while (f.wait_for(std::chrono::milliseconds(0)) !=
          std::future_status::ready) {
@@ -879,7 +879,7 @@ template <typename T> T waitForContextFuture(std::future<T> &f) {
 #else
   while (f.wait_for(std::chrono::milliseconds(0)) !=
          std::future_status::ready) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+    std::this_thread::sleep_for(std::chrono::milliseconds(sleepTime));
  }
   return f.get();
 #endif
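
For reference, a minimal end-to-end sketch of the async API this series converges on. This is an illustration under assumptions, not code from the patches: `createContextByGpuIdxAsync` may be gated behind `USE_DAWN_API` depending on the build, and the WGSL copy shader, buffer sizes, and the 1 ms poll interval are all stand-ins.

```cpp
#include "gpu.hpp"
#include <array>
#include <future>

using namespace gpu;

// Same shape as the test kernel; {{precision}} and {{workgroupSize}} are
// substituted by createKernel from the KernelCode metadata.
static const char *kCopy = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<{{precision}}>;
@group(0) @binding(1) var<storage, read_write> out: array<{{precision}}>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  if (gid.x < arrayLength(&inp)) {
    out[gid.x] = inp[gid.x];
  }
}
)";

int main() {
  // Poll the context future every 1 ms instead of the default 10 ms.
  std::future<Context> ctxFuture = createContextByGpuIdxAsync(0);
  Context ctx = waitForContextFuture(ctxFuture, /*sleepTime=*/1);

  constexpr size_t N = 256;
  std::array<float, N> host{};
  Tensor input = createTensor(ctx, Shape{N}, kf32, host.data());
  Tensor output = createTensor(ctx, Shape{N}, kf32);

  Kernel k = createKernel(ctx, {kCopy, 256, kf32}, Bindings{input, output},
                          {cdiv(N, 256), 1, 1});
  dispatchKernel(ctx, k);

  // Read back asynchronously; sourceOffset defaults to 0.
  auto copied = toCPUAsync(ctx, output, host.data(), sizeof(host));
  wait(ctx, copied);
  return 0;
}
```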