Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add helper functions for alpaka kernels #39546

Merged
merged 3 commits into from Oct 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
42 changes: 42 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/vec.h
@@ -0,0 +1,42 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_vec_h
#define HeterogeneousCore_AlpakaInterface_interface_vec_h

#include <type_traits>

#include <alpaka/alpaka.hpp>

namespace alpaka {

//! \return The element-wise minimum of one or more vectors.
ALPAKA_NO_HOST_ACC_WARNING
template <typename TDim,
          typename TVal,
          typename... Vecs,
          typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
  Vec<TDim, TVal> result;
  if constexpr (TDim::value > 0) {
    for (typename TDim::value_type d = 0; d < TDim::value; ++d) {
      // start from the first vector, then fold each remaining candidate in;
      // ties keep the earlier value, matching std::min semantics
      TVal smallest = p[d];
      ((smallest = (qs[d] < smallest) ? qs[d] : smallest), ...);
      result[d] = smallest;
    }
  }
  return result;
}

//! \return The element-wise maximum of one or more vectors.
ALPAKA_NO_HOST_ACC_WARNING
template <typename TDim,
          typename TVal,
          typename... Vecs,
          typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
  Vec<TDim, TVal> result;
  if constexpr (TDim::value > 0) {
    for (typename TDim::value_type d = 0; d < TDim::value; ++d) {
      // start from the first vector, then fold each remaining candidate in;
      // ties keep the earlier value, matching std::max semantics
      TVal largest = p[d];
      ((largest = (largest < qs[d]) ? qs[d] : largest), ...);
      result[d] = largest;
    }
  }
  return result;
}

} // namespace alpaka

#endif // HeterogeneousCore_AlpakaInterface_interface_vec_h
266 changes: 266 additions & 0 deletions HeterogeneousCore/AlpakaInterface/interface/workdivision.h
@@ -0,0 +1,266 @@
#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h
#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h

#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"

namespace cms::alpakatools {

using namespace alpaka_common;

// If the first argument is not a multiple of the second argument, round it up to the next multiple
inline constexpr Idx round_up_by(Idx value, Idx divisor) {
  // bump the value just below the next multiple, then truncate down to it
  const Idx bumped = value + divisor - 1;
  return bumped / divisor * divisor;
}

// Return the integer division of the first argument by the second argument, rounded up to the next integer
inline constexpr Idx divide_up_by(Idx value, Idx divisor) {
  // bump the value just below the next multiple so the truncating division rounds up
  const Idx bumped = value + divisor - 1;
  return bumped / divisor;
}

// Create an accelerator-dependent work division for 1-dimensional kernels.
//
// "blocks" is the number of groups in the kernel grid; "elements" is the number
// of items each group should process. On a GPU backend the items map to one
// thread each; on a CPU backend they map to per-thread elements handled by a
// single thread per block.
template <typename TAcc,
          typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
  if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) {
    // On GPU backends, each thread is looking at a single element:
    // - the number of threads per block is "elements";
    // - the number of elements per thread is always 1.
    return WorkDiv<Dim1D>(blocks, elements, Idx{1});
  } else
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
// NOTE: use #ifdef (not #if) here, consistently with the CUDA branch above and
// the N-dimensional overload: the backend macro may be defined without a value,
// in which case "#if ALPAKA_ACC_GPU_HIP_ENABLED" is a preprocessor error.
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
  if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) {
    // On GPU backends, each thread is looking at a single element:
    // - the number of threads per block is "elements";
    // - the number of elements per thread is always 1.
    return WorkDiv<Dim1D>(blocks, elements, Idx{1});
  } else
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
  {
    // On CPU backends, run serially with a single thread per block:
    // - the number of threads per block is always 1;
    // - the number of elements per thread is "elements".
    return WorkDiv<Dim1D>(blocks, Idx{1}, elements);
  }
}

// Create the accelerator-dependent workdiv for N-dimensional kernels.
//
// "blocks" and "elements" are per-dimension vectors: the number of groups in the
// kernel grid, and the number of items each group should process along each
// dimension. The mapping of items to threads vs. per-thread elements depends on
// the backend, as described in the branches below.
template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
                                               const Vec<alpaka::Dim<TAcc>>& elements) {
  using Dim = alpaka::Dim<TAcc>;
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
  if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) {
    // On GPU backends, each thread is looking at a single element:
    // - the number of threads per block is "elements";
    // - the number of elements per thread is always 1.
    return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
  } else
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
  if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) {
    // On GPU backends, each thread is looking at a single element:
    // - the number of threads per block is "elements";
    // - the number of elements per thread is always 1.
    return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
  } else
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
  {
    // On CPU backends, run serially with a single thread per block:
    // - the number of threads per block is always 1;
    // - the number of elements per thread is "elements".
    return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements);
  }
}

// elements_with_stride
//
// A 1-dimensional iterable range that distributes "extent" items over the whole
// kernel grid: each thread starts at its own chunk of "elements_" consecutive
// items and advances by the full grid stride until the extent is exhausted.
// Intended usage inside a kernel:
//
//   for (Idx i : elements_with_stride(acc, size)) { ... }
template <typename TAcc,
          typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
class elements_with_stride {
public:
  // Default extent: a single full stride, i.e. each thread handles exactly one
  // chunk of "elements_" items.
  ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc)
      : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
        first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
        stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
        extent_{stride_} {}

  // Loop over "extent" items, distributed over the whole kernel grid.
  ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent)
      : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]},
        first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
        stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_},
        extent_{extent} {}

  class iterator {
    friend class elements_with_stride;

    ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first)
        : elements_{elements},
          stride_{stride},
          extent_{extent},
          first_{std::min(first, extent)},
          index_{first_},
          last_{std::min(first + elements, extent)} {}

  public:
    ALPAKA_FN_ACC inline Idx operator*() const { return index_; }

    // pre-increment the iterator
    ALPAKA_FN_ACC inline iterator& operator++() {
      // increment the index along the elements processed by the current thread
      ++index_;
      if (index_ < last_)
        return *this;

      // advance the thread index by the grid stride.
      // FIX: stride_ is already expressed in elements (grid threads times
      // elements per thread, see the constructors above), so it must NOT be
      // multiplied by elements_ again; doing so would skip elements whenever a
      // thread processes more than one chunk.
      first_ += stride_;
      index_ = first_;
      last_ = std::min(first_ + elements_, extent_);
      if (index_ < extent_)
        return *this;

      // the iterator has reached or passed the end of the extent, clamp it to the extent
      first_ = extent_;
      index_ = extent_;
      last_ = extent_;
      return *this;
    }

    // post-increment the iterator
    ALPAKA_FN_ACC inline iterator operator++(int) {
      iterator old = *this;
      ++(*this);
      return old;
    }

    ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
      return (index_ == other.index_) and (first_ == other.first_);
    }

    ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }

  private:
    // non-const to support iterator copy and assignment
    Idx elements_;
    Idx stride_;
    Idx extent_;
    // modified by the pre/post-increment operator
    Idx first_;
    Idx index_;
    Idx last_;
  };

  ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }

  ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }

private:
  const Idx elements_;
  const Idx first_;
  const Idx stride_;
  const Idx extent_;
};

// elements_with_stride_nd
//
// The N-dimensional counterpart of elements_with_stride: iterates element-wise
// along the last (fastest) dimension with a grid stride, and strides the outer
// dimensions one step at a time.
// NOTE(review): for the outer dimensions the iterator advances by the full
// per-dimension stride without visiting the per-thread elements individually -
// presumably callers use 1 element per thread along the outer dimensions;
// confirm before relying on elements > 1 there.
template <typename TAcc,
          typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and (alpaka::Dim<TAcc>::value > 0)>>
class elements_with_stride_nd {
public:
  using Dim = alpaka::Dim<TAcc>;
  using Vec = alpaka::Vec<Dim, Idx>;

  // Default extent: a single full stride along every dimension.
  ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc)
      : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
        first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
        stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
        extent_{stride_} {}

  // Loop over an N-dimensional "extent", distributed over the whole kernel grid.
  ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent)
      : elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)},
        first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_},
        stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_},
        extent_{extent} {}

  class iterator {
    friend class elements_with_stride_nd;
    constexpr static const auto last_dimension = Dim::value - 1;

    ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first)
        : elements_{elements},
          stride_{stride},
          extent_{extent},
          first_{alpaka::elementwise_min(first, extent)},
          index_{first_},
          last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {}

  public:
    ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

    // pre-increment the iterator
    ALPAKA_FN_ACC inline iterator& operator++() {
      // increment the index along the elements processed by the current thread
      ++index_[last_dimension];
      if (index_[last_dimension] < last_)
        return *this;

      // advance the thread index along the last dimension by the grid stride.
      // FIX: stride_ is already expressed in elements (grid threads times
      // elements per thread, see the constructors above), so it must NOT be
      // multiplied by elements_ again; doing so would skip elements.
      first_[last_dimension] += stride_[last_dimension];
      index_[last_dimension] = first_[last_dimension];
      last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
      if (index_[last_dimension] < extent_[last_dimension])
        return *this;

      // increment the thread index along the outer dimensions with the grid stride.
      // FIX: iterate with a strictly-positive unsigned counter instead of
      // "dimension >= 0": the index type is unsigned, so a decrement past zero
      // would wrap around and access first_[] out of bounds.
      if constexpr (last_dimension > 0) {
        for (auto outer = last_dimension; outer > 0; --outer) {
          auto dimension = outer - 1;
          first_[dimension] += stride_[dimension];
          index_[dimension] = first_[dimension];
          if (index_[dimension] < extent_[dimension])
            return *this;
        }
      }

      // the iterator has reached or passed the end of the extent, clamp it to the extent
      first_ = extent_;
      index_ = extent_;
      last_ = extent_[last_dimension];
      return *this;
    }

    // post-increment the iterator
    ALPAKA_FN_ACC inline iterator operator++(int) {
      iterator old = *this;
      ++(*this);
      return old;
    }

    ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
      return (index_ == other.index_) and (first_ == other.first_);
    }

    ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }

  private:
    // non-const to support iterator copy and assignment
    Vec elements_;
    Vec stride_;
    Vec extent_;
    // modified by the pre/post-increment operator
    Vec first_;
    Vec index_;
    Idx last_;
  };

  ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); }

  ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); }

private:
  const Vec elements_;
  const Vec first_;
  const Vec stride_;
  const Vec extent_;
};

} // namespace cms::alpakatools

#endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h
6 changes: 6 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -0,0 +1,6 @@
<!--
  Catch2 unit test for the alpaka vector helpers in HeterogeneousCore/AlpakaInterface.
  NOTE(review): the ALPAKA_BACKENDS flag presumably builds the test once per
  enabled alpaka backend - confirm against the SCRAM build rules.
-->
<bin name="alpakaTestVec" file="alpaka/testVec.cc">
  <use name="alpaka"/>
  <use name="catch2"/>
  <use name="HeterogeneousCore/AlpakaInterface"/>
  <flags ALPAKA_BACKENDS="1"/>
</bin>
25 changes: 25 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/alpaka/testVec.cc
@@ -0,0 +1,25 @@
#include <alpaka/alpaka.hpp>

#define CATCH_CONFIG_MAIN
#include <catch.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/vec.h"

static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec) "]";

// Standard checks of alpaka::elementwise_min, instantiated for the backend
// selected by ALPAKA_TYPE_ALIAS_NAME.
// FIX: removed GitHub review-comment residue that the scrape interleaved
// between the TEST_CASE header and its body; the code could not compile as is.
TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec), s_tag) {
  SECTION("Vec1D") {
    // 1-dimensional vectors: the minimum is taken on the single component
    alpaka_common::Vec1D s1 = 100;
    alpaka_common::Vec1D s2 = 42;
    alpaka_common::Vec1D si = 42;
    REQUIRE(alpaka::elementwise_min(s1, s2) == si);
  }

  SECTION("Vec3D") {
    // 3-dimensional vectors: the minimum is taken independently per component
    alpaka_common::Vec3D v1{100, 10, 10};
    alpaka_common::Vec3D v2{42, 42, 1};
    alpaka_common::Vec3D vi{42, 10, 1};
    REQUIRE(alpaka::elementwise_min(v1, v2) == vi);
  }
}
28 changes: 18 additions & 10 deletions HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
Expand Up @@ -8,37 +8,45 @@
#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

#include "TestAlgo.h"

namespace ALPAKA_ACCELERATOR_NAMESPACE {

using namespace cms::alpakatools;

// Example kernel filling a portabletest::TestDeviceCollection.
// FIX: reconstructed the post-merge version of this block - the scraped diff
// contained both the removed lines (the old enable_if spelling, the "stride"
// variable and the manual strided for loop) and the added ones, plus GitHub
// review-comment text spliced inside the loop, so the code could not compile.
class TestAlgoKernel {
public:
  template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>>
  ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, int32_t size) const {
    // global index of the thread within the grid
    const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
    const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};

    // set this only once in the whole kernel grid
    if (thread == 0) {
      view.r() = 1.;
    }

    // make a strided loop over the kernel grid, covering up to "size" elements
    for (int32_t i : elements_with_stride(acc, size)) {
      view[i] = {0., 0., 0., i, matrix * i};
    }
  }
};

// Launch TestAlgoKernel over the whole collection.
// FIX: reconstructed the post-merge version of this block - the scraped diff
// still contained the removed lines (deviceProperties / maxThreadsPerBlock and
// the hand-rolled WorkDiv1D), leaving two conflicting workDiv definitions.
void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection) const {
  // use 64 items per group (this value is arbitrary, but it's a reasonable starting point)
  uint32_t items = 64;

  // use as many groups as needed to cover the whole problem
  uint32_t groups = divide_up_by(collection->metadata().size(), items);

  // map items to
  //   - threads with a single element per thread on a GPU backend
  //   - elements within a single thread on a CPU backend
  auto workDiv = make_workdiv<Acc1D>(groups, items);

  alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), collection->metadata().size());
}
Expand Down