[OpenMP] Replace utilities with 'gpuintrin.h' definitions #131644

jhuber6 · 2025-03-17T16:55:52Z

Summary:
Port more instructions. AMD version is at https://gist.github.com/jhuber6/235d7ee95f747c75f9a3cfd8eedac6aa

Summary: This patch ports the runtime to use `gpuintrin.h` instead of calling the builtins for most things. The `lanemask_gt` stuff was left for now with a fallback.

Summary: Follows from the previous one in llvm#131631.

llvmbot · 2025-03-17T16:56:29Z

@llvm/pr-subscribers-offload

Author: Joseph Huber (jhuber6)

Changes

[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers
[OpenMP] Replace utilities with 'gpuintrin.h' definitions

Full diff: https://github.com/llvm/llvm-project/pull/131644.diff

4 Files Affected:

(modified) offload/DeviceRTL/include/Mapping.h (+3-3)
(modified) offload/DeviceRTL/src/DeviceUtils.cpp (+15-84)
(modified) offload/DeviceRTL/src/Mapping.cpp (+46-209)
(modified) offload/DeviceRTL/src/Misc.cpp (+10-37)

diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index f892a025159d4..8ba018b5314aa 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -19,9 +19,9 @@ namespace ompx {
 namespace mapping {
 
 enum {
-  DIM_X = 0,
-  DIM_Y = 1,
-  DIM_Z = 2,
+  DIM_X = __GPU_X_DIM,
+  DIM_Y = __GPU_Y_DIM,
+  DIM_Z = __GPU_Z_DIM,
 };
 
 inline constexpr uint32_t MaxThreadsPerTeam = 1024;
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index d8109537832e9..d6f8c499c8904 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -14,117 +14,48 @@
 #include "Debug.h"
 #include "Interface.h"
 #include "Mapping.h"
+#include "gpuintrin.h"
 
 using namespace ompx;
 
-namespace impl {
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  static_assert(sizeof(unsigned long) == 8, "");
-  *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
-  *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
-  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
-}
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
-                    int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
-  int Self = mapping::getThreadIdInWarp();
-  int Index = SrcLane + (Self & ~(Width - 1));
-  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
-                    int32_t Width) {
-  int Self = mapping::getThreadIdInWarp();
-  int Index = Self + LaneDelta;
-  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
-  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
-  return Mask & __builtin_amdgcn_ballot_w64(Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) {
-  return __builtin_amdgcn_is_shared(
-      (const __attribute__((address_space(0))) void *)Ptr);
-}
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
-  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
-  int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
-  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
-  return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-
-#endif
-///}
-} // namespace impl
-
 uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
-  return impl::Pack(LowBits, HighBits);
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
 }
 
 void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
-  impl::Unpack(Val, &LowBits, &HighBits);
+  static_assert(sizeof(unsigned long) == 8, "");
+  LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
+  HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
 }
 
 int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
                        int32_t Width) {
-  return impl::shuffle(Mask, Var, SrcLane, Width);
+  return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
 }
 
 int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
                            int32_t Width) {
-  return impl::shuffleDown(Mask, Var, Delta, Width);
+  int32_t Self = mapping::getThreadIdInWarp();
+  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+  return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
 }
 
 int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
                            int32_t Width) {
-  uint32_t Lo, Hi;
-  utils::unpack(Var, Lo, Hi);
-  Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
-  Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
-  return utils::pack(Lo, Hi);
+  int32_t Self = mapping::getThreadIdInWarp();
+  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+  return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
 }
 
 uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
-  return impl::ballotSync(Mask, Pred);
+  return __gpu_ballot(Mask, Pred);
 }
 
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
 
 extern "C" {
 int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
-  return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
+  return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
 }
 
 int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 641be81cca3ed..a9e027727b04b 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -14,198 +14,12 @@
 #include "DeviceUtils.h"
 #include "Interface.h"
 #include "State.h"
+#include "gpuintrin.h"
 
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 using namespace ompx;
 
-namespace ompx {
-namespace impl {
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workgroup_size_x();
-  case 1:
-    return __builtin_amdgcn_workgroup_size_y();
-  case 2:
-    return __builtin_amdgcn_workgroup_size_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
-  return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  if (Lane == (mapping::getWarpSize() - 1))
-    return 0;
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
-  return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workitem_id_x();
-  case 1:
-    return __builtin_amdgcn_workitem_id_y();
-  case 2:
-    return __builtin_amdgcn_workitem_id_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
-  return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
-         __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workgroup_id_x();
-  case 1:
-    return __builtin_amdgcn_workgroup_id_y();
-  case 2:
-    return __builtin_amdgcn_workgroup_id_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-  case 1:
-    return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
-  case 2:
-    return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
-  return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_ntid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_ntid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_ntid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_tid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_tid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_tid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_ctaid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_ctaid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_ctaid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_nctaid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_nctaid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_nctaid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
-  return impl::getNumberOfThreadsInBlock(0) *
-         impl::getNumberOfBlocksInKernel(0) *
-         impl::getNumberOfThreadsInBlock(1) *
-         impl::getNumberOfBlocksInKernel(1) *
-         impl::getNumberOfThreadsInBlock(2) *
-         impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
-  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
-         mapping::getWarpSize();
-}
-
-#endif
-///}
-
-} // namespace impl
-} // namespace ompx
-
-/// We have to be deliberate about the distinction of `mapping::` and `impl::`
-/// below to avoid repeating assumptions or including irrelevant ones.
-///{
-
 static bool isInLastWarp() {
   uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
                      ~(mapping::getWarpSize() - 1);
@@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() {
   return utils::popc(Active & LaneMaskLT) == 0;
 }
 
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
 
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+LaneMaskTy mapping::lanemaskLT() {
+#ifdef __NVPTX__
+  return __nvvm_read_ptx_sreg_lanemask_lt();
+#else
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+  return Mask & Ballot;
+#endif
+}
 
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+LaneMaskTy mapping::lanemaskGT() {
+#ifdef __NVPTX__
+  return __nvvm_read_ptx_sreg_lanemask_gt();
+#else
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+  return Mask & Ballot;
+#endif
+}
 
 uint32_t mapping::getThreadIdInWarp() {
-  uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
-  ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
+  uint32_t ThreadIdInWarp = __gpu_lane_id();
+  ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
   return ThreadIdInWarp;
 }
 
 uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
-  uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
+  uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
   return ThreadIdInBlock;
 }
 
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
 
 uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
   uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
   // If we are in SPMD mode, remove one warp.
-  return BlockSize - (!IsSPMD * impl::getWarpSize());
+  return BlockSize - (!IsSPMD * mapping::getWarpSize());
 }
 uint32_t mapping::getMaxTeamThreads() {
   return mapping::getMaxTeamThreads(mapping::isSPMDMode());
 }
 
 uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
-  return impl::getNumberOfThreadsInBlock(Dim);
+  return __gpu_num_threads(Dim);
 }
 
 uint32_t mapping::getNumberOfThreadsInKernel() {
-  return impl::getNumberOfThreadsInKernel();
+  return mapping::getNumberOfThreadsInBlock(0) *
+         mapping::getNumberOfBlocksInKernel(0) *
+         mapping::getNumberOfThreadsInBlock(1) *
+         mapping::getNumberOfBlocksInKernel(1) *
+         mapping::getNumberOfThreadsInBlock(2) *
+         mapping::getNumberOfBlocksInKernel(2);
 }
 
 uint32_t mapping::getWarpIdInBlock() {
-  uint32_t WarpID = impl::getWarpIdInBlock();
-  ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
+  uint32_t WarpID =
+      mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
+  ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
   return WarpID;
 }
 
 uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
-  uint32_t BlockId = impl::getBlockIdInKernel(Dim);
-  ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
+  uint32_t BlockId = __gpu_block_id(Dim);
+  ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
   return BlockId;
 }
 
 uint32_t mapping::getNumberOfWarpsInBlock() {
-  uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
-  ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
-  return NumberOfWarpsInBlocks;
+  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
+         mapping::getWarpSize();
 }
 
 uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
-  uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
-  ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
-  return NumberOfBlocks;
+  return __gpu_num_blocks(Dim);
 }
 
 uint32_t mapping::getNumberOfProcessorElements() {
@@ -326,11 +163,11 @@ extern "C" {
 }
 
 [[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
-  return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
+  return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
 }
 
 [[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
-  return impl::getWarpSize();
+  return mapping::getWarpSize();
 }
 }
 
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index 734e937f03920..a89f8b2a74531 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -20,41 +20,6 @@
 namespace ompx {
 namespace impl {
 
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-double getWTick() {
-  // The number of ticks per second for the AMDGPU clock varies by card and can
-  // only be retrieved by querying the driver. We rely on the device environment
-  // to inform us what the proper frequency is.
-  return 1.0 / config::getClockFrequency();
-}
-
-double getWTime() {
-  return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
-}
-
-#endif
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-double getWTick() {
-  // Timer precision is 1ns
-  return ((double)1E-9);
-}
-
-double getWTime() {
-  uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
-  return static_cast<double>(nsecs) * getWTick();
-}
-
-#endif
-
 /// Lookup a device-side function using a host pointer /p HstPtr using the table
 /// provided by the device plugin. The table is an ordered pair of host and
 /// device pointers sorted on the value of the host pointer.
@@ -112,9 +77,17 @@ int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
 
 int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
 
-double omp_get_wtick(void) { return ompx::impl::getWTick(); }
+double omp_get_wtick(void) {
+  // The number of ticks per second for the AMDGPU clock varies by card and can
+  // only be retrieved by querying the driver. We rely on the device environment
+  // to inform us what the proper frequency is. NVPTX uses a nanosecond
+  // resolution, we could omit the global read but this makes it consistent.
+  return 1.0 / ompx::config::getClockFrequency();
+}
 
-double omp_get_wtime(void) { return ompx::impl::getWTime(); }
+double omp_get_wtime(void) {
+  return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
+}
 
 void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
   return ompx::impl::indirectCallLookup(HstPtr);

jhuber6 · 2025-03-17T17:39:24Z

@ronlieb @jplehr AMD version https://gist.github.com/jhuber6/235d7ee95f747c75f9a3cfd8eedac6aa

JonChesterfield · 2025-03-18T20:37:30Z

This PR has two commits in it, looks like the first one is #131631

JonChesterfield

Great, thanks!

jhuber6 added 2 commits March 17, 2025 10:43

[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers

71d68ed

Summary: This patch ports the runtime to use `gpuintrin.h` instead of calling the builtins for most things. The `lanemask_gt` stuff was left for now with a fallback.

[OpenMP] Replace utilities with 'gpuintrin.h' definitions

881a69a

Summary: Follows from the previous one in llvm#131631.

jhuber6 requested review from jdoerfert, JonChesterfield, jplehr and shiltian March 17, 2025 16:55

llvmbot added the offload label Mar 17, 2025

jhuber6 mentioned this pull request Mar 18, 2025

[openmp][nfc] Use clang gpuintrin for some dispatch to target intrinsics #131907

Closed

Merge branch 'main' into utils

78cf263

JonChesterfield approved these changes Mar 18, 2025

View reviewed changes

jhuber6 merged commit cb493d2 into llvm:main Mar 19, 2025
9 checks passed

jhuber6 deleted the utils branch March 19, 2025 15:47

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[OpenMP] Replace utilities with 'gpuintrin.h' definitions #131644

[OpenMP] Replace utilities with 'gpuintrin.h' definitions #131644

Uh oh!

jhuber6 commented Mar 17, 2025 •

edited

Loading

Uh oh!

llvmbot commented Mar 17, 2025

Uh oh!

jhuber6 commented Mar 17, 2025

Uh oh!

JonChesterfield commented Mar 18, 2025

Uh oh!

JonChesterfield left a comment

Uh oh!

Uh oh!

Uh oh!

[OpenMP] Replace utilities with 'gpuintrin.h' definitions #131644

[OpenMP] Replace utilities with 'gpuintrin.h' definitions #131644

Uh oh!

Conversation

jhuber6 commented Mar 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Mar 17, 2025

Uh oh!

jhuber6 commented Mar 17, 2025

Uh oh!

JonChesterfield commented Mar 18, 2025

Uh oh!

JonChesterfield left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

jhuber6 commented Mar 17, 2025 •

edited

Loading