16
16
#include < utility> // std::pair
17
17
#include < vector>
18
18
19
-
20
19
#ifdef __EMSCRIPTEN__
21
20
#include " emscripten/emscripten.h"
22
21
#endif
@@ -1106,7 +1105,7 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {},
1106
1105
* @param ctx The Context containing the WebGPU instance handle.
1107
1106
* @return std::vector<dawn::native::Adapter> A vector of available GPU
1108
1107
* adapters.
1109
- *
1108
+ *
1110
1109
* @code
1111
1110
* std::vector<dawn::native::Adapter> adapters = getAdapters(ctx);
1112
1111
* @endcode
@@ -1118,21 +1117,25 @@ inline std::vector<dawn::native::Adapter> getAdapters(Context &ctx) {
1118
1117
}
1119
1118
1120
1119
/* *
1121
- * @brief Formats the given vector of Dawn adapters into a single concatenated string.
1120
+ * @brief Formats the given vector of Dawn adapters into a single concatenated
1121
+ * string.
1122
1122
*
1123
- * This function iterates over each Dawn adapter in the provided vector, retrieves its
1124
- * description using the WebGPU API, and converts the description from a WGPUStringView
1125
- * to an std::string using the formatWGPUStringView helper. The resulting descriptions
1126
- * are concatenated into a single string separated by newline characters.
1123
+ * This function iterates over each Dawn adapter in the provided vector,
1124
+ * retrieves its description using the WebGPU API, and converts the description
1125
+ * from a WGPUStringView to an std::string using the formatWGPUStringView
1126
+ * helper. The resulting descriptions are concatenated into a single string
1127
+ * separated by newline characters.
1127
1128
*
1128
1129
* @param adapters A vector of Dawn adapters obtained from a WebGPU instance.
1129
- * @return std::string A newline-delimited string listing each adapter's description.
1130
- *
1130
+ * @return std::string A newline-delimited string listing each adapter's
1131
+ * description.
1132
+ *
1131
1133
* @code
1132
1134
* std::string adapterList = formatAdapters(adapters);
1133
1135
* @endcode
1134
1136
*/
1135
- inline std::string formatAdapters (const std::vector<dawn::native::Adapter> &adapters) {
1137
+ inline std::string
1138
+ formatAdapters (const std::vector<dawn::native::Adapter> &adapters) {
1136
1139
std::string adapterList;
1137
1140
for (size_t i = 0 ; i < adapters.size (); ++i) {
1138
1141
auto adapterPtr = adapters[i].Get ();
@@ -1157,7 +1160,7 @@ inline std::string formatAdapters(const std::vector<dawn::native::Adapter> &adap
1157
1160
* @param ctx The Context containing the WebGPU instance handle.
1158
1161
* @return std::string A newline-delimited string listing each adapter's
1159
1162
* description.
1160
- *
1163
+ *
1161
1164
* @code
1162
1165
* std::string adapterList = listAdapters(ctx);
1163
1166
* @endcode
@@ -1181,7 +1184,7 @@ inline std::string listAdapters(Context &ctx) {
1181
1184
* @param devDescriptor Device descriptor for the WebGPU device (optional)
1182
1185
* @return std::future<Context> A future that will eventually hold the created
1183
1186
* Context.
1184
- *
1187
+ *
1185
1188
* @code
1186
1189
* std::future<Context> contextFuture = createContextByGpuIdxAsync(0);
1187
1190
* Context ctx = waitForContextFuture(contextFuture);
@@ -1270,9 +1273,9 @@ createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {},
1270
1273
* Context ctx = createContextByGpuIdx(0);
1271
1274
* @endcode
1272
1275
*/
1273
- inline Context createContextByGpuIdx ( int gpuIdx,
1274
- const WGPUInstanceDescriptor &desc = {},
1275
- const WGPUDeviceDescriptor &devDescriptor = {}) {
1276
+ inline Context
1277
+ createContextByGpuIdx ( int gpuIdx, const WGPUInstanceDescriptor &desc = {},
1278
+ const WGPUDeviceDescriptor &devDescriptor = {}) {
1276
1279
std::future<Context> contextFuture =
1277
1280
createContextByGpuIdxAsync (gpuIdx, desc, devDescriptor);
1278
1281
return waitForContextFuture<Context>(contextFuture);
@@ -1365,17 +1368,19 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
1365
1368
/* *
1366
1369
* @brief Copies data from a GPU buffer to CPU memory.
1367
1370
* @param[in] ctx Context instance to manage the operation
1368
- * @param[in] tensor Tensor instance representing the GPU buffer to copy from
1369
1371
* @param[out] data Pointer to the CPU memory to copy the data to
1370
1372
* @param[in] bufferSize Size of the data buffer in bytes
1371
1373
* @param[in] op StagingBuffer instance to manage the operation
1374
+ * @param[in] sourceOffset Offset in the GPU buffer to start copying from.
1372
1375
*
1373
1376
* @code
1374
1377
* std::future<void> future = toCPUAsync(ctx, data, bufferSize, op);
1375
1378
* @endcode
1376
1379
*/
1377
- inline std::future<void > toCPUAsync (Context &ctx, Tensor &tensor, void *data,
1378
- size_t bufferSize, CopyData &op) {
1380
+
1381
+ // NOTE(review): the overload below may be redundant - CopyData does not appear to be used externally; confirm before removing.
1382
+ inline std::future<void > toCPUAsync (Context &ctx, void *data, size_t bufferSize,
1383
+ CopyData &op, size_t sourceOffset = 0 ) {
1379
1384
// Submit the command buffer and release it.
1380
1385
wgpuQueueSubmit (ctx.queue , 1 , &op.commandBuffer );
1381
1386
wgpuCommandBufferRelease (op.commandBuffer );
@@ -1388,8 +1393,8 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
1388
1393
CallbackData *cbData = new CallbackData{
1389
1394
op.readbackBuffer , // The GPU buffer to be read back.
1390
1395
bufferSize,
1391
- data, // CPU memory destination.
1392
- promise // The promise to be signaled.
1396
+ data, // CPU memory destination.
1397
+ promise, // The promise to be signaled.
1393
1398
};
1394
1399
1395
1400
// Set up the work-done callback to initiate the buffer mapping.
@@ -1402,6 +1407,11 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
1402
1407
// Begin the asynchronous chain by registering the queue work-done callback.
1403
1408
wgpuQueueOnSubmittedWorkDone (ctx.queue , workDoneCallbackInfo);
1404
1409
1410
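+ // Drop this function's reference to the readback buffer. NOTE(review): cbData still carries this handle for the mapping callback - confirm the buffer stays alive until then.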
1411
+ if (op.readbackBuffer ) {
1412
+ wgpuBufferRelease (op.readbackBuffer );
1413
+ }
1414
+
1405
1415
return promise->get_future ();
1406
1416
}
1407
1417
@@ -1417,11 +1427,13 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
1417
1427
*
1418
1428
* @param[in] ctx Context instance to manage the operation
1419
1429
* @param[in] tensor Tensor instance representing the GPU buffer to copy from
1420
- * @param[in] bufferSize Size of the data buffer in bytes
1430
+ * @param[in] bufferSize Number of bytes to copy from the GPU buffer into data.
1421
1431
* @param[out] data Pointer to the CPU memory to copy the data to
1432
+ * @param[in] sourceOffset Offset in the GPU buffer to start copying from.
1422
1433
*/
1423
1434
inline std::future<void > toCPUAsync (Context &ctx, Tensor &tensor, void *data,
1424
- size_t bufferSize) {
1435
+ size_t bufferSize,
1436
+ size_t sourceOffset = 0 ) {
1425
1437
// Create a promise that will later be satisfied when the async copy
1426
1438
// completes.
1427
1439
auto promise = std::make_shared<std::promise<void >>();
@@ -1430,16 +1442,17 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
1430
1442
WGPUBufferDescriptor readbackBufferDescriptor = {
1431
1443
.label = {.data = nullptr , .length = 0 },
1432
1444
.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
1433
- .size = bufferSize,
1445
+ .size = bufferSize, // Size of the readback buffer.
1434
1446
};
1435
1447
WGPUBuffer readbackBuffer =
1436
1448
wgpuDeviceCreateBuffer (ctx.device , &readbackBufferDescriptor);
1437
1449
1438
1450
// Create a command encoder and record a copy from the tensor GPU buffer
1439
1451
WGPUCommandEncoder commandEncoder =
1440
1452
wgpuDeviceCreateCommandEncoder (ctx.device , nullptr );
1441
- wgpuCommandEncoderCopyBufferToBuffer (commandEncoder, tensor.data .buffer , 0 ,
1442
- readbackBuffer, 0 , bufferSize);
1453
+ wgpuCommandEncoderCopyBufferToBuffer (commandEncoder, tensor.data .buffer ,
1454
+ sourceOffset, readbackBuffer, 0 ,
1455
+ bufferSize);
1443
1456
// Finish recording by creating a command buffer and release the encoder.
1444
1457
WGPUCommandBuffer commandBuffer =
1445
1458
wgpuCommandEncoderFinish (commandEncoder, nullptr );
@@ -1472,13 +1485,16 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
1472
1485
// queueWorkDoneCallback.
1473
1486
wgpuQueueOnSubmittedWorkDone (ctx.queue , workDoneCallbackInfo);
1474
1487
1488
+ if (readbackBuffer) {
1489
+ wgpuBufferRelease (readbackBuffer);
1490
+ }
1491
+
1475
1492
return promise->get_future ();
1476
1493
}
1477
1494
1478
1495
inline std::future<void > toCPUAsync (Context &ctx, WGPUBuffer buffer, void *data,
1479
- size_t size) {
1480
- // The size (in bytes) for the copy.
1481
- uint64_t bufferSize = size;
1496
+ size_t bufferSize,
1497
+ size_t sourceOffset = 0 ) {
1482
1498
1483
1499
// Create an operation structure (here we reuse CopyData solely for its
1484
1500
// members that we need to create a readback buffer and command buffer).
@@ -1503,7 +1519,7 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
1503
1519
{
1504
1520
WGPUCommandEncoder commandEncoder =
1505
1521
wgpuDeviceCreateCommandEncoder (ctx.device , nullptr );
1506
- wgpuCommandEncoderCopyBufferToBuffer (commandEncoder, buffer, 0 ,
1522
+ wgpuCommandEncoderCopyBufferToBuffer (commandEncoder, buffer, sourceOffset ,
1507
1523
op.readbackBuffer , 0 , bufferSize);
1508
1524
op.commandBuffer = wgpuCommandEncoderFinish (commandEncoder, nullptr );
1509
1525
wgpuCommandEncoderRelease (commandEncoder);
@@ -1516,10 +1532,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
1516
1532
1517
1533
// Allocate callback data
1518
1534
CallbackData *cbData = new CallbackData{
1519
- op.readbackBuffer , // The readback buffer created above.
1520
- static_cast < size_t >( bufferSize), // Size of the copy.
1521
- data, // Destination CPU memory .
1522
- promise // Our promise to satisfy when done.
1535
+ op.readbackBuffer , // The readback buffer created above.
1536
+ bufferSize, // Size of the copy.
1537
+ data, // Destination CPU memory.
1538
+ promise // Our promise to satisfy when done.
1523
1539
};
1524
1540
1525
1541
// Set up the queue work-done callback info.
@@ -1532,6 +1548,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
1532
1548
// Start the asynchronous chain by registering the work-done callback.
1533
1549
wgpuQueueOnSubmittedWorkDone (ctx.queue , workDoneCallbackInfo);
1534
1550
1551
+ if (op.readbackBuffer ) {
1552
+ wgpuBufferRelease (op.readbackBuffer );
1553
+ }
1554
+
1535
1555
return promise->get_future ();
1536
1556
}
1537
1557
@@ -1548,9 +1568,11 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
1548
1568
* @endcode
1549
1569
*/
1550
1570
template <size_t N>
1551
- inline std::future<void > toCPUAsync (Context &ctx, Tensor &tensor,
1552
- std::array<float , N> &data) {
1553
- return toCPUAsync (ctx, tensor, data.data (), sizeof (data));
1571
+ inline std::future<void >
1572
+ toCPUAsync (Context &ctx, Tensor &tensor, std::array<float , N> &data,
1573
+ size_t sourceOffset = 0 ) {
1574
+ return toCPUAsync (ctx, tensor, data.data (), sizeof (data), sourceOffset
1575
+ );
1554
1576
}
1555
1577
1556
1578
/* *
@@ -1571,8 +1593,10 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
1571
1593
* toCPU(ctx, tensor, data, bufferSize);
1572
1594
* @endcode
1573
1595
*/
1574
- inline void toCPU (Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
1575
- auto future = toCPUAsync (ctx, tensor, data, bufferSize);
1596
+ inline void toCPU (Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
1597
+ size_t sourceOffset = 0 ) {
1598
+ auto future =
1599
+ toCPUAsync (ctx, tensor, data, bufferSize, sourceOffset);
1576
1600
wait (ctx, future);
1577
1601
}
1578
1602
@@ -1593,8 +1617,9 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
1593
1617
* toCPU(ctx, buffer, data, size);
1594
1618
* @endcode
1595
1619
*/
1596
- inline void toCPU (Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
1597
- auto future = toCPUAsync (ctx, buffer, data, size);
1620
+ inline void toCPU (Context &ctx, WGPUBuffer buffer, void *data, size_t size,
1621
+ size_t sourceOffset = 0 ) {
1622
+ auto future = toCPUAsync (ctx, buffer, data, size, sourceOffset);
1598
1623
wait (ctx, future);
1599
1624
}
1600
1625
@@ -1616,8 +1641,9 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
1616
1641
* @endcode
1617
1642
*/
1618
1643
template <size_t N>
1619
- inline void toCPU (Context &ctx, Tensor &tensor, std::array<float , N> &data) {
1620
- auto future = toCPUAsync (ctx, tensor, data);
1644
+ inline void toCPU (Context &ctx, Tensor &tensor, std::array<float , N> &data,
1645
+ size_t sourceOffset = 0 ) {
1646
+ auto future = toCPUAsync (ctx, tensor, data, sourceOffset);
1621
1647
wait (ctx, future);
1622
1648
}
1623
1649
0 commit comments