Skip to content

Commit

Permalink
Update the alpaka tests to use the new helper functions
Browse files Browse the repository at this point in the history
  • Loading branch information
fwyzard committed Sep 30, 2022
1 parent acba0e7 commit 58de740
Showing 1 changed file with 18 additions and 10 deletions.
28 changes: 18 additions & 10 deletions HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
Expand Up @@ -8,37 +8,45 @@
#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

#include "TestAlgo.h"

namespace ALPAKA_ACCELERATOR_NAMESPACE {

using namespace cms::alpakatools;

class TestAlgoKernel {
public:
template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>>
ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, int32_t size) const {
// this example accepts an arbitrary number of blocks and threads, and always uses 1 element per thread
// global index of the thread within the grid
const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
const int32_t stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u];
const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};

// set this only once in the whole kernel grid
if (thread == 0) {
view.r() = 1.;
}
for (auto i = thread; i < size; i += stride) {

// make a strided loop over the kernel grid, covering up to "size" elements
for (int32_t i : elements_with_stride(acc, size)) {
view[i] = {0., 0., 0., i, matrix * i};
}
}
};

void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection) const {
auto const& deviceProperties = alpaka::getAccDevProps<Acc1D>(alpaka::getDev(queue));
uint32_t maxThreadsPerBlock = deviceProperties.m_blockThreadExtentMax[0];
// use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
uint32_t items = 64;

// use as many groups as needed to cover the whole problem
uint32_t groups = divide_up_by(collection->metadata().size(), items);

uint32_t threadsPerBlock = maxThreadsPerBlock;
uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
uint32_t elementsPerThread = 1;
auto workDiv = WorkDiv1D{blocksPerGrid, threadsPerBlock, elementsPerThread};
// map items to
// - threads with a single element per thread on a GPU backend
// - elements within a single thread on a CPU backend
auto workDiv = make_workdiv<Acc1D>(groups, items);

alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), collection->metadata().size());
}
Expand Down

0 comments on commit 58de740

Please sign in to comment.