New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add helper functions for alpaka kernels #39546
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#ifndef HeterogeneousCore_AlpakaInterface_interface_vec_h | ||
#define HeterogeneousCore_AlpakaInterface_interface_vec_h | ||
|
||
#include <algorithm>
#include <type_traits>

#include <alpaka/alpaka.hpp>
|
||
namespace alpaka { | ||
|
||
//! \return The element-wise minimum of one or more vectors.
ALPAKA_NO_HOST_ACC_WARNING
template <typename TDim,
          typename TVal,
          typename... Vecs,
          typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
ALPAKA_FN_HOST_ACC constexpr auto elementwise_min(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
  // Start from the first vector, then fold the remaining ones in, one component at a time.
  Vec<TDim, TVal> result = p;
  if constexpr (TDim::value > 0) {
    for (typename TDim::value_type d = 0; d < TDim::value; ++d) {
      // comma-fold over the parameter pack: result[d] = min(result[d], q1[d], q2[d], ...)
      ((result[d] = std::min(result[d], qs[d])), ...);
    }
  }
  return result;
}
|
||
//! \return The element-wise maximum of one or more vectors.
ALPAKA_NO_HOST_ACC_WARNING
template <typename TDim,
          typename TVal,
          typename... Vecs,
          typename = std::enable_if_t<(std::is_same_v<Vec<TDim, TVal>, Vecs> && ...)>>
ALPAKA_FN_HOST_ACC constexpr auto elementwise_max(Vec<TDim, TVal> const& p, Vecs const&... qs) -> Vec<TDim, TVal> {
  // Start from the first vector, then fold the remaining ones in, one component at a time.
  Vec<TDim, TVal> result = p;
  if constexpr (TDim::value > 0) {
    for (typename TDim::value_type d = 0; d < TDim::value; ++d) {
      // comma-fold over the parameter pack: result[d] = max(result[d], q1[d], q2[d], ...)
      ((result[d] = std::max(result[d], qs[d])), ...);
    }
  }
  return result;
}
|
||
} // namespace alpaka | ||
|
||
#endif // HeterogeneousCore_AlpakaInterface_interface_vec_h |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,266 @@ | ||
#ifndef HeterogeneousCore_AlpakaInterface_interface_workdivision_h | ||
#define HeterogeneousCore_AlpakaInterface_interface_workdivision_h | ||
|
||
#include <type_traits> | ||
|
||
#include <alpaka/alpaka.hpp> | ||
|
||
#include "HeterogeneousCore/AlpakaInterface/interface/config.h" | ||
#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" | ||
#include "HeterogeneousCore/AlpakaInterface/interface/vec.h" | ||
|
||
namespace cms::alpakatools { | ||
|
||
using namespace alpaka_common; | ||
|
||
// If the first argument is not a multiple of the second argument, round it up to the next multiple.
// "divisor" must be non-zero.
// Written with div/mod instead of "(value + divisor - 1) / divisor * divisor" so that the
// intermediate sum cannot overflow Idx when "value" is close to its maximum.
inline constexpr Idx round_up_by(Idx value, Idx divisor) {
  return (value % divisor == 0) ? value : (value / divisor + 1) * divisor;
}
|
||
// Return the integer division of the first argument by the second argument, rounded up to the
// next integer.
// "divisor" must be non-zero.
// Written with div/mod instead of "(value + divisor - 1) / divisor" so that the intermediate
// sum cannot overflow Idx when "value" is close to its maximum.
inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return value / divisor + (value % divisor == 0 ? 0 : 1); }
|
||
// Create an accelerator-dependent work division for 1-dimensional kernels | ||
template <typename TAcc, | ||
typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>> | ||
inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) { | ||
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED | ||
if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) { | ||
// On GPU backends, each thread is looking at a single element: | ||
// - the number of threads per block is "elements"; | ||
// - the number of elements per thread is always 1. | ||
return WorkDiv<Dim1D>(blocks, elements, Idx{1}); | ||
} else | ||
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED | ||
#if ALPAKA_ACC_GPU_HIP_ENABLED | ||
if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) { | ||
// On GPU backends, each thread is looking at a single element: | ||
// - the number of threads per block is "elements"; | ||
// - the number of elements per thread is always 1. | ||
return WorkDiv<Dim1D>(blocks, elements, Idx{1}); | ||
} else | ||
#endif // ALPAKA_ACC_GPU_HIP_ENABLED | ||
{ | ||
// On CPU backends, run serially with a single thread per block: | ||
// - the number of threads per block is always 1; | ||
// - the number of elements per thread is "elements". | ||
return WorkDiv<Dim1D>(blocks, Idx{1}, elements); | ||
} | ||
} | ||
|
||
// Create the accelerator-dependent workdiv for N-dimensional kernels | ||
template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>> | ||
inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks, | ||
const Vec<alpaka::Dim<TAcc>>& elements) { | ||
using Dim = alpaka::Dim<TAcc>; | ||
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED | ||
if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) { | ||
// On GPU backends, each thread is looking at a single element: | ||
// - the number of threads per block is "elements"; | ||
// - the number of elements per thread is always 1. | ||
return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones()); | ||
} else | ||
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED | ||
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED | ||
if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) { | ||
// On GPU backends, each thread is looking at a single element: | ||
// - the number of threads per block is "elements"; | ||
// - the number of elements per thread is always 1. | ||
return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones()); | ||
} else | ||
#endif // ALPAKA_ACC_GPU_HIP_ENABLED | ||
{ | ||
// On CPU backends, run serially with a single thread per block: | ||
// - the number of threads per block is always 1; | ||
// - the number of elements per thread is "elements". | ||
return WorkDiv<Dim>(blocks, Vec<Dim>::ones(), elements); | ||
} | ||
} | ||
|
||
template <typename TAcc, | ||
typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>> | ||
class elements_with_stride { | ||
public: | ||
ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc) | ||
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]}, | ||
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_}, | ||
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_}, | ||
extent_{stride_} {} | ||
|
||
ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc, Idx extent) | ||
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]}, | ||
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_}, | ||
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u] * elements_}, | ||
extent_{extent} {} | ||
|
||
class iterator { | ||
friend class elements_with_stride; | ||
|
||
ALPAKA_FN_ACC inline iterator(Idx elements, Idx stride, Idx extent, Idx first) | ||
: elements_{elements}, | ||
stride_{stride}, | ||
extent_{extent}, | ||
first_{std::min(first, extent)}, | ||
index_{first_}, | ||
last_{std::min(first + elements, extent)} {} | ||
|
||
public: | ||
ALPAKA_FN_ACC inline Idx operator*() const { return index_; } | ||
|
||
// pre-increment the iterator | ||
ALPAKA_FN_ACC inline iterator& operator++() { | ||
// increment the index along the elements processed by the current thread | ||
++index_; | ||
if (index_ < last_) | ||
return *this; | ||
|
||
// increment the thread index with the grid stride | ||
first_ += stride_ * elements_; | ||
index_ = first_; | ||
last_ = std::min(first_ + elements_, extent_); | ||
if (index_ < extent_) | ||
return *this; | ||
|
||
// the iterator has reached or passed the end of the extent, clamp it to the extent | ||
first_ = extent_; | ||
index_ = extent_; | ||
last_ = extent_; | ||
return *this; | ||
} | ||
|
||
// post-increment the iterator | ||
ALPAKA_FN_ACC inline iterator operator++(int) { | ||
iterator old = *this; | ||
++(*this); | ||
return old; | ||
} | ||
|
||
ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { | ||
return (index_ == other.index_) and (first_ == other.first_); | ||
} | ||
|
||
ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } | ||
|
||
private: | ||
// non-const to support iterator copy and assignment | ||
Idx elements_; | ||
Idx stride_; | ||
Idx extent_; | ||
// modified by the pre/post-increment operator | ||
Idx first_; | ||
Idx index_; | ||
Idx last_; | ||
}; | ||
|
||
ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } | ||
|
||
ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } | ||
|
||
private: | ||
const Idx elements_; | ||
const Idx first_; | ||
const Idx stride_; | ||
const Idx extent_; | ||
}; | ||
|
||
template <typename TAcc, | ||
typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and (alpaka::Dim<TAcc>::value > 0)>> | ||
class elements_with_stride_nd { | ||
public: | ||
using Dim = alpaka::Dim<TAcc>; | ||
using Vec = alpaka::Vec<Dim, Idx>; | ||
|
||
ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc) | ||
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}, | ||
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}, | ||
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}, | ||
extent_{stride_} {} | ||
|
||
ALPAKA_FN_ACC inline elements_with_stride_nd(TAcc const& acc, Vec extent) | ||
: elements_{alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)}, | ||
first_{alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc) * elements_}, | ||
stride_{alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc) * elements_}, | ||
extent_{extent} {} | ||
|
||
class iterator { | ||
friend class elements_with_stride_nd; | ||
constexpr static const auto last_dimension = Dim::value - 1; | ||
|
||
ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first) | ||
: elements_{elements}, | ||
stride_{stride}, | ||
extent_{extent}, | ||
first_{alpaka::elementwise_min(first, extent)}, | ||
index_{first_}, | ||
last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {} | ||
|
||
public: | ||
ALPAKA_FN_ACC inline Vec operator*() const { return index_; } | ||
|
||
// pre-increment the iterator | ||
ALPAKA_FN_ACC inline iterator& operator++() { | ||
// increment the index along the elements processed by the current thread | ||
++index_[last_dimension]; | ||
if (index_[last_dimension] < last_) | ||
return *this; | ||
|
||
// increment the thread index along with the last dimension with the grid stride | ||
first_[last_dimension] += stride_[last_dimension] * elements_[last_dimension]; | ||
index_[last_dimension] = first_[last_dimension]; | ||
last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]); | ||
if (index_[last_dimension] < extent_[last_dimension]) | ||
return *this; | ||
|
||
// increment the thread index along the outer dimensions with the grid stride | ||
if constexpr (last_dimension > 0) | ||
for (auto dimension = last_dimension - 1; dimension >= 0; --dimension) { | ||
first_[dimension] += stride_[dimension]; | ||
index_[dimension] = first_[dimension]; | ||
if (index_[dimension] < extent_[dimension]) | ||
return *this; | ||
} | ||
|
||
// the iterator has reached or passed the end of the extent, clamp it to the extent | ||
first_ = extent_; | ||
index_ = extent_; | ||
last_ = extent_[last_dimension]; | ||
return *this; | ||
} | ||
|
||
// post-increment the iterator | ||
ALPAKA_FN_ACC inline iterator operator++(int) { | ||
iterator old = *this; | ||
++(*this); | ||
return old; | ||
} | ||
|
||
ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { | ||
return (index_ == other.index_) and (first_ == other.first_); | ||
} | ||
|
||
ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } | ||
|
||
private: | ||
// non-const to support iterator copy and assignment | ||
Vec elements_; | ||
Vec stride_; | ||
Vec extent_; | ||
// modified by the pre/post-increment operator | ||
Vec first_; | ||
Vec index_; | ||
Idx last_; | ||
}; | ||
|
||
ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } | ||
|
||
ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } | ||
|
||
private: | ||
const Vec elements_; | ||
const Vec first_; | ||
const Vec stride_; | ||
const Vec extent_; | ||
}; | ||
|
||
} // namespace cms::alpakatools | ||
|
||
#endif // HeterogeneousCore_AlpakaInterface_interface_workdivision_h |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
<bin name="alpakaTestVec" file="alpaka/testVec.cc"> | ||
<use name="alpaka"/> | ||
<use name="catch2"/> | ||
<use name="HeterogeneousCore/AlpakaInterface"/> | ||
<flags ALPAKA_BACKENDS="1"/> | ||
</bin> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#include <alpaka/alpaka.hpp> | ||
|
||
#define CATCH_CONFIG_MAIN | ||
#include <catch.hpp> | ||
|
||
#include "HeterogeneousCore/AlpakaInterface/interface/config.h" | ||
#include "HeterogeneousCore/AlpakaInterface/interface/vec.h" | ||
|
||
// Catch2 tag used to group all the test cases for the current alpaka backend.
static constexpr auto s_tag = "[" ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec) "]";

// Check alpaka::elementwise_min for 1-dimensional and 3-dimensional vectors.
TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestVec), s_tag) {
  SECTION("Vec1D") {
    // scalar (1-dimensional) vectors
    alpaka_common::Vec1D s1 = 100;
    alpaka_common::Vec1D s2 = 42;
    alpaka_common::Vec1D si = 42;  // expected element-wise minimum
    REQUIRE(alpaka::elementwise_min(s1, s2) == si);
  }

  SECTION("Vec3D") {
    // 3-dimensional vectors; the minimum is taken independently in each component
    alpaka_common::Vec3D v1{100, 10, 10};
    alpaka_common::Vec3D v2{42, 42, 1};
    alpaka_common::Vec3D vi{42, 10, 1};  // expected element-wise minimum
    REQUIRE(alpaka::elementwise_min(v1, v2) == vi);
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,37 +8,45 @@ | |
#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h" | ||
#include "HeterogeneousCore/AlpakaInterface/interface/config.h" | ||
#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" | ||
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" | ||
|
||
#include "TestAlgo.h" | ||
|
||
namespace ALPAKA_ACCELERATOR_NAMESPACE { | ||
|
||
using namespace cms::alpakatools; | ||
|
||
class TestAlgoKernel { | ||
public: | ||
template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>> | ||
template <typename TAcc, typename = std::enable_if_t<is_accelerator_v<TAcc>>> | ||
ALPAKA_FN_ACC void operator()(TAcc const& acc, portabletest::TestDeviceCollection::View view, int32_t size) const { | ||
// this example accepts an arbitrary number of blocks and threads, and always uses 1 element per thread | ||
// global index of the thread within the grid | ||
const int32_t thread = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]; | ||
const int32_t stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0u]; | ||
const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}}; | ||
|
||
// set this only once in the whole kernel grid | ||
if (thread == 0) { | ||
view.r() = 1.; | ||
} | ||
for (auto i = thread; i < size; i += stride) { | ||
|
||
// make a strided loop over the kernel grid, covering up to "size" elements | ||
for (int32_t i : elements_with_stride(acc, size)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a thought for future, not providing a custom "element-parallel for loop" makes it less clean to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mhm, I see. We do have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By the way, from a quick poll during the hackathon people seem to prefer the explicit for loop approach, and I think it does give a bit more flexibility. However, we do have loops that take the body as a lambda (or other function object) in pixeltrack-standalone, so we should re-check them and import them in CMSSW as well. |
||
view[i] = {0., 0., 0., i, matrix * i}; | ||
} | ||
} | ||
}; | ||
|
||
void TestAlgo::fill(Queue& queue, portabletest::TestDeviceCollection& collection) const { | ||
auto const& deviceProperties = alpaka::getAccDevProps<Acc1D>(alpaka::getDev(queue)); | ||
uint32_t maxThreadsPerBlock = deviceProperties.m_blockThreadExtentMax[0]; | ||
// use 64 items per group (this value is arbitrary, but it's a reasonable starting point) | ||
uint32_t items = 64; | ||
|
||
// use as many groups as needed to cover the whole problem | ||
uint32_t groups = divide_up_by(collection->metadata().size(), items); | ||
|
||
uint32_t threadsPerBlock = maxThreadsPerBlock; | ||
uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock; | ||
uint32_t elementsPerThread = 1; | ||
auto workDiv = WorkDiv1D{blocksPerGrid, threadsPerBlock, elementsPerThread}; | ||
// map items to | ||
// - threads with a single element per thread on a GPU backend | ||
// - elements within a single thread on a CPU backend | ||
auto workDiv = make_workdiv<Acc1D>(groups, items); | ||
|
||
alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernel{}, collection.view(), collection->metadata().size()); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test does not seem to have any backend-dependence, i.e. currently it could also be placed directly in
test
. Do you foresee this test to gain a backend-dependence in the future?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's say that it's a test of the alpaka-specific test machinery... we did not have any so far!