Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix N-dimensional strided loops and extend the alpaka tests #40598

Merged
Merged 3 commits on Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
205 changes: 125 additions & 80 deletions HeterogeneousCore/AlpakaInterface/interface/workdivision.h
Expand Up @@ -19,27 +19,34 @@ namespace cms::alpakatools {
// Return the integer division of the first argument by the second argument, rounded up to the next integer
// (i.e. ceil(value / divisor) for exact rational division).
// Computed as quotient plus a remainder test, instead of (value + divisor - 1) / divisor,
// to avoid a potential overflow of value + divisor - 1 when value is close to the maximum of Idx.
inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return value / divisor + (value % divisor != 0 ? 1 : 0); }

// Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped.
// The primary template defaults to true: unless specialised below, a backend is assumed to run a single
// thread per block, with each thread looping serially over its elements.
template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
struct requires_single_thread_per_block : public std::true_type {};

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
// On the CUDA GPU backend each thread looks at a single element, so many threads per block are used.
template <typename TDim>
struct requires_single_thread_per_block<alpaka::AccGpuCudaRt<TDim, Idx>> : public std::false_type {};
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
// On the HIP/ROCm GPU backend each thread looks at a single element, so many threads per block are used.
template <typename TDim>
struct requires_single_thread_per_block<alpaka::AccGpuHipRt<TDim, Idx>> : public std::false_type {};
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

// Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped.
// Convenience variable template mirroring requires_single_thread_per_block<TAcc>::value.
template <typename TAcc, typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc>>>
inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block<TAcc>::value;

// Create an accelerator-dependent work division for 1-dimensional kernels
template <typename TAcc,
typename = std::enable_if_t<cms::alpakatools::is_accelerator_v<TAcc> and alpaka::Dim<TAcc>::value == 1>>
inline WorkDiv<Dim1D> make_workdiv(Idx blocks, Idx elements) {
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim1D, Idx>>) {
// On GPU backends, each thread is looking at a single element:
// - the number of threads per block is "elements";
// - the number of elements per thread is always 1.
return WorkDiv<Dim1D>(blocks, elements, Idx{1});
} else
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#if ALPAKA_ACC_GPU_HIP_ENABLED
if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim1D, Idx>>) {
if constexpr (not requires_single_thread_per_block_v<TAcc>) {
// On GPU backends, each thread is looking at a single element:
// - the number of threads per block is "elements";
// - the number of elements per thread is always 1.
return WorkDiv<Dim1D>(blocks, elements, Idx{1});
} else
#endif // ALPAKA_ACC_GPU_HIP_ENABLED
{
} else {
// On CPU backends, run serially with a single thread per block:
// - the number of threads per block is always 1;
// - the number of elements per thread is "elements".
Expand All @@ -52,23 +59,12 @@ namespace cms::alpakatools {
inline WorkDiv<alpaka::Dim<TAcc>> make_workdiv(const Vec<alpaka::Dim<TAcc>>& blocks,
const Vec<alpaka::Dim<TAcc>>& elements) {
using Dim = alpaka::Dim<TAcc>;
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
if constexpr (std::is_same_v<TAcc, alpaka::AccGpuCudaRt<Dim, Idx>>) {
if constexpr (not requires_single_thread_per_block_v<TAcc>) {
// On GPU backends, each thread is looking at a single element:
// - the number of threads per block is "elements";
// - the number of elements per thread is always 1.
return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
} else
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
if constexpr (std::is_same_v<TAcc, alpaka::AccGpuHipRt<Dim, Idx>>) {
// On GPU backends, each thread is looking at a single element:
// - the number of threads per block is "elements";
// - the number of elements per thread is always 1.
return WorkDiv<Dim>(blocks, elements, Vec<Dim>::ones());
} else
#endif // ALPAKA_ACC_GPU_HIP_ENABLED
{
} else {
// On CPU backends, run serially with a single thread per block:
// - the number of threads per block is always 1;
// - the number of elements per thread is "elements".
Expand Down Expand Up @@ -108,10 +104,12 @@ namespace cms::alpakatools {

// pre-increment the iterator
ALPAKA_FN_ACC inline iterator& operator++() {
// increment the index along the elements processed by the current thread
++index_;
if (index_ < last_)
return *this;
if constexpr (requires_single_thread_per_block_v<TAcc>) {
// increment the index along the elements processed by the current thread
++index_;
if (index_ < last_)
return *this;
}

// increment the thread index with the grid stride
first_ += stride_;
Expand Down Expand Up @@ -183,76 +181,123 @@ namespace cms::alpakatools {

class iterator {
friend class elements_with_stride_nd;
constexpr static const auto last_dimension = Dim::value - 1;

ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first)
: elements_{elements},
stride_{stride},
extent_{extent},
first_{alpaka::elementwise_min(first, extent)},
index_{first_},
last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {}

public:
ALPAKA_FN_ACC inline Vec operator*() const { return index_; }

// pre-increment the iterator
ALPAKA_FN_ACC inline iterator& operator++() {
// increment the index along the elements processed by the current thread
++index_[last_dimension];
if (index_[last_dimension] < last_)
return *this;

// increment the thread index along with the last dimension with the grid stride
first_[last_dimension] += stride_[last_dimension];
index_[last_dimension] = first_[last_dimension];
last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]);
if (index_[last_dimension] < extent_[last_dimension])
return *this;

// increment the thread index along the outer dimensions with the grid stride
if constexpr (last_dimension > 0)
for (auto dimension = last_dimension - 1; dimension >= 0; --dimension) {
first_[dimension] += stride_[dimension];
index_[dimension] = first_[dimension];
if (index_[dimension] < extent_[dimension])
return *this;
}

// the iterator has reached or passed the end of the extent, clamp it to the extent
first_ = extent_;
index_ = extent_;
last_ = extent_[last_dimension];
ALPAKA_FN_ACC constexpr inline iterator operator++() {
increment();
return *this;
}

// post-increment the iterator
ALPAKA_FN_ACC inline iterator operator++(int) {
ALPAKA_FN_ACC constexpr inline iterator operator++(int) {
iterator old = *this;
++(*this);
increment();
return old;
}

ALPAKA_FN_ACC inline bool operator==(iterator const& other) const {
return (index_ == other.index_) and (first_ == other.first_);
}
ALPAKA_FN_ACC constexpr inline bool operator==(iterator const& other) const { return (index_ == other.index_); }

ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); }
ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); }

private:
// non-const to support iterator copy and assignment
Vec elements_;
Vec stride_;
Vec extent_;
// private, explicit constructor
ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first)
: loop_{loop},
thread_{alpaka::elementwise_min(first, loop->extent_)},
range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)},
index_{thread_} {}

template <size_t I>
ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() {
bool overflow = false;
++index_[I];
if (index_[I] >= range_[I]) {
index_[I] = thread_[I];
overflow = true;
}
return overflow;
}

template <size_t N>
ALPAKA_FN_ACC inline constexpr bool do_elements_loops() {
if constexpr (N == 0) {
// overflow
return true;
} else {
if (not nth_elements_loop<N - 1>()) {
return false;
} else {
return do_elements_loops<N - 1>();
}
}
}

template <size_t I>
ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() {
bool overflow = false;
thread_[I] += loop_->stride_[I];
if (thread_[I] >= loop_->extent_[I]) {
thread_[I] = loop_->first_[I];
overflow = true;
}
index_[I] = thread_[I];
range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]);
return overflow;
}

template <size_t N>
ALPAKA_FN_ACC inline constexpr bool do_strided_loops() {
if constexpr (N == 0) {
// overflow
return true;
} else {
if (not nth_strided_loop<N - 1>()) {
return false;
} else {
return do_strided_loops<N - 1>();
}
}
}

// increment the iterator
ALPAKA_FN_ACC inline constexpr void increment() {
if constexpr (requires_single_thread_per_block_v<TAcc>) {
// linear N-dimensional loops over the elements associated to the thread;
// do_elements_loops<>() returns true if any of those loops overflows
if (not do_elements_loops<Dim::value>()) {
// the elements loops did not overflow, return the next index
return;
}
}

// strided N-dimensional loop over the threads in the kernel launch grid;
// do_strided_loops<>() returns true if any of those loops overflows
if (not do_strided_loops<Dim::value>()) {
// the strided loops did not overflow, return the next index
return;
}

// the iterator has reached or passed the end of the extent, clamp it to the extent
thread_ = loop_->extent_;
range_ = loop_->extent_;
index_ = loop_->extent_;
}

// const pointer to the elements_with_stride_nd that the iterator refers to
const elements_with_stride_nd* loop_;

// modified by the pre/post-increment operator
Vec first_;
Vec index_;
Idx last_;
Vec thread_; // first element processed by this thread
Vec range_; // last element processed by this thread
Vec index_; // current element processed by this thread
};

// NOTE(review): the scraped diff kept both the old value-based and the new pointer-based
// begin()/end(), producing duplicate definitions; only the new versions are kept here.
// Return an iterator to the first index assigned to this thread.
ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; }

// Return a past-the-end iterator, clamped to the extent.
ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; }

private:
const Vec elements_;
Expand Down
82 changes: 82 additions & 0 deletions HeterogeneousCore/AlpakaInterface/test/alpaka/testKernel.dev.cc
Expand Up @@ -37,6 +37,17 @@ struct VectorAddKernel1D {
}
};

// Element-wise addition of two 3-dimensional arrays, using an N-dimensional strided loop.
struct VectorAddKernel3D {
  template <typename TAcc, typename T>
  ALPAKA_FN_ACC void operator()(
      TAcc const& acc, T const* __restrict__ in1, T const* __restrict__ in2, T* __restrict__ out, Vec3D size) const {
    // loop over every 3D index assigned to this thread
    for (auto point : cms::alpakatools::elements_with_stride_nd(acc, size)) {
      // linearise the 3D index, with dimension 2 as the fastest-varying one
      auto flat = point[2] + size[2] * (point[1] + size[1] * point[0]);
      out[flat] = in1[flat] + in2[flat];
    }
  }
};

TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag) {
SECTION("VectorAddKernel") {
// get the list of devices on the current platform
Expand Down Expand Up @@ -125,3 +136,74 @@ TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel), s_tag)
}
}
}

TEST_CASE("Standard checks of " ALPAKA_TYPE_ALIAS_NAME(alpakaTestKernel3D), s_tag) {
  SECTION("VectorAddKernel3D") {
    // enumerate the devices available on the current platform
    auto const& devices = cms::alpakatools::devices<Platform>();
    if (devices.empty()) {
      std::cout << "No devices available on the platform " << EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE)
                << ", the test will be skipped.\n";
      return;
    }

    // gaussian random number generator, seeded from the hardware entropy source
    std::random_device seed{};
    std::default_random_engine engine{seed()};
    std::normal_distribution<float> gaussian{0., 1.};

    // tolerance used when comparing the results
    constexpr float epsilon = 0.000001;

    // 3-dimensional and linearised buffer size
    constexpr Vec3D ndsize = {50, 125, 16};
    constexpr size_t size = ndsize.prod();

    // host-side input and output buffers
    auto in1_h = cms::alpakatools::make_host_buffer<float[]>(size);
    auto in2_h = cms::alpakatools::make_host_buffer<float[]>(size);
    auto out_h = cms::alpakatools::make_host_buffer<float[]>(size);

    // initialise the inputs with random values and the output with zeros
    for (size_t i = 0; i < size; ++i) {
      in1_h[i] = gaussian(engine);
      in2_h[i] = gaussian(engine);
      out_h[i] = 0.;
    }

    // repeat the test on every available device
    for (auto const& device : devices) {
      std::cout << "Test 3D vector addition on " << alpaka::getName(device) << '\n';
      auto queue = Queue(device);

      // device-side input and output buffers
      auto in1_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
      auto in2_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
      auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);

      // upload the input data; the size is known from the buffer objects
      alpaka::memcpy(queue, in1_d, in1_h);
      alpaka::memcpy(queue, in2_d, in2_h);

      // zero the device-side output buffer; the size is known from the buffer objects
      alpaka::memset(queue, out_d, 0.);

      // launch the 3-dimensional kernel
      auto workdiv = cms::alpakatools::make_workdiv<Acc3D>({5, 5, 1}, {4, 4, 4});
      alpaka::exec<Acc3D>(queue, workdiv, VectorAddKernel3D{}, in1_d.data(), in2_d.data(), out_d.data(), ndsize);

      // download the results back to the host
      alpaka::memcpy(queue, out_h, out_d);

      // wait for all the enqueued operations to complete
      alpaka::wait(queue);

      // compare the results with the expected values, within the tolerance
      for (size_t i = 0; i < size; ++i) {
        float expected = in1_h[i] + in2_h[i];
        REQUIRE(out_h[i] < expected + epsilon);
        REQUIRE(out_h[i] > expected - epsilon);
      }
    }
  }
}