Update the alpaka tests to use the new helper functions

cms-sw · Sep 30, 2022 · 58de740 · 58de740
1 parent acba0e7
commit 58de740
Showing 1 changed file with 18 additions and 10 deletions.
diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
@@ -8,37 +8,45 @@
 #include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/config.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
 
 #include "TestAlgo.h"
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE {
 
+  using namespace cms::alpakatools;
+
   class TestAlgoKernel {
   public:
-    template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
+    template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, int32_t size) const {
-      // this example accepts an arbitrary number of blocks and threads, and always uses 1 element per thread
+      // global index of the thread within the grid
       const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
-      const int32_t stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
       const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
 
+      // set this only once in the whole kernel grid
       if (thread == 0) {
         view.r() = 1.;
       }
-      for (auto i = thread; i < size; i += stride) {
+
+      // make a strided loop over the kernel grid, covering up to "size" elements
+      for (int32_t i : elements_with_stride(acc, size)) {
         view[i] = {0., 0., 0., i, matrix * i};
       }
     }
   };
 
   void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection) const {
-    auto const& deviceProperties = alpaka::getAccDevProps<Acc1D>(alpaka::getDev(queue));
-    uint32_t maxThreadsPerBlock = deviceProperties.m_blockThreadExtentMax[0];
+    // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
+    uint32_t items = 64;
+
+    // use as many groups as needed to cover the whole problem
+    uint32_t groups = divide_up_by(collection->metadata().size(), items);
 
-    uint32_t threadsPerBlock = maxThreadsPerBlock;
-    uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
-    uint32_t elementsPerThread = 1;
-    auto workDiv = WorkDiv1D{blocksPerGrid, threadsPerBlock, elementsPerThread};
+    // map items to
+    //   - threads with a single element per thread on a GPU backend
+    //   - elements within a single thread on a CPU backend
+    auto workDiv = make_workdiv<Acc1D>(groups, items);
 
     alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), collection->metadata().size());
   }