Skip to content

Commit ead2d86

Browse files
committed
Revert "[OpenMP] Ensure memory fences are created with barriers for AMDGPUs"
This reverts commit 36d6217.
1 parent 36d6217 commit ead2d86

File tree

5 files changed

+47
-154
lines changed

5 files changed

+47
-154
lines changed

openmp/libomptarget/DeviceRTL/include/Synchronization.h

Lines changed: 28 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,34 @@
1616

1717
namespace ompx {
1818

19+
namespace synchronize {
20+
21+
/// Initialize the synchronization machinery. Must be called by all threads.
22+
void init(bool IsSPMD);
23+
24+
/// Synchronize all threads in a warp identified by \p Mask.
25+
void warp(LaneMaskTy Mask);
26+
27+
/// Synchronize all threads in a block.
28+
void threads();
29+
30+
/// Synchronizing threads is allowed even if they all hit different instances of
31+
/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
32+
/// restrictive in that it requires all threads to hit the same instance. The
33+
/// noinline is removed by the openmp-opt pass and helps to preserve the
34+
/// information till then.
35+
///{
36+
#pragma omp begin assumes ext_aligned_barrier
37+
38+
/// Synchronize all threads in a block, they are reaching the same
39+
/// instruction (hence all threads in the block are "aligned").
40+
__attribute__((noinline)) void threadsAligned();
41+
42+
#pragma omp end assumes
43+
///}
44+
45+
} // namespace synchronize
46+
1947
namespace atomic {
2048

2149
enum OrderingTy {
@@ -83,38 +111,6 @@ ATOMIC_FP_OP(double)
83111

84112
} // namespace atomic
85113

86-
namespace synchronize {
87-
88-
/// Initialize the synchronization machinery. Must be called by all threads.
89-
void init(bool IsSPMD);
90-
91-
/// Synchronize all threads in a warp identified by \p Mask.
92-
void warp(LaneMaskTy Mask);
93-
94-
/// Synchronize all threads in a block and perform a fence before and after the
95-
/// barrier according to \p Ordering. Note that the fence might be part of the
96-
/// barrier.
97-
void threads(atomic::OrderingTy Ordering);
98-
99-
/// Synchronizing threads is allowed even if they all hit different instances of
100-
/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
101-
/// restrictive in that it requires all threads to hit the same instance. The
102-
/// noinline is removed by the openmp-opt pass and helps to preserve the
103-
/// information till then.
104-
///{
105-
#pragma omp begin assumes ext_aligned_barrier
106-
107-
/// Synchronize all threads in a block, they are reaching the same instruction
108-
/// (hence all threads in the block are "aligned"). Also perform a fence before
109-
/// and after the barrier according to \p Ordering. Note that the
110-
/// fence might be part of the barrier if the target offers this.
111-
__attribute__((noinline)) void threadsAligned(atomic::OrderingTy Ordering);
112-
113-
#pragma omp end assumes
114-
///}
115-
116-
} // namespace synchronize
117-
118114
namespace fence {
119115

120116
/// Memory fence with \p Ordering semantics for the team.

openmp/libomptarget/DeviceRTL/src/Kernel.cpp

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ static void genericStateMachine(IdentTy *Ident) {
4040
ParallelRegionFnTy WorkFn = nullptr;
4141

4242
// Wait for the signal that we have a new work function.
43-
synchronize::threads(atomic::seq_cst);
43+
synchronize::threads();
4444

4545
// Retrieve the work function from the runtime.
4646
bool IsActive = __kmpc_kernel_parallel(&WorkFn);
@@ -56,7 +56,7 @@ static void genericStateMachine(IdentTy *Ident) {
5656
__kmpc_kernel_end_parallel();
5757
}
5858

59-
synchronize::threads(atomic::seq_cst);
59+
synchronize::threads();
6060

6161
} while (true);
6262
}
@@ -74,7 +74,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
7474
Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
7575
if (IsSPMD) {
7676
inititializeRuntime(/* IsSPMD */ true);
77-
synchronize::threadsAligned(atomic::relaxed);
77+
synchronize::threadsAligned();
7878
} else {
7979
inititializeRuntime(/* IsSPMD */ false);
8080
// No need to wait since only the main threads will execute user
@@ -83,10 +83,6 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
8383

8484
if (IsSPMD) {
8585
state::assumeInitialState(IsSPMD);
86-
87-
// Synchronize to ensure the assertions above are in an aligned region.
88-
// The barrier is eliminated later.
89-
synchronize::threadsAligned(atomic::relaxed);
9086
return -1;
9187
}
9288

@@ -136,11 +132,7 @@ void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) {
136132
FunctionTracingRAII();
137133
const bool IsSPMD =
138134
Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
139-
140-
synchronize::threadsAligned(atomic::acq_rel);
141135
state::assumeInitialState(IsSPMD);
142-
synchronize::threadsAligned(atomic::relaxed);
143-
144136
if (IsSPMD)
145137
return;
146138

openmp/libomptarget/DeviceRTL/src/Parallelism.cpp

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
113113
if (mapping::isSPMDMode()) {
114114
// Avoid the race between the read of the `icv::Level` above and the write
115115
// below by synchronizing all threads here.
116-
synchronize::threadsAligned(atomic::seq_cst);
116+
synchronize::threadsAligned();
117117
{
118118
// Note that the order here is important. `icv::Level` has to be updated
119119
// last or the other updates will cause a thread specific state to be
@@ -128,36 +128,28 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
128128

129129
// Synchronize all threads after the main thread (TId == 0) set up the
130130
// team state properly.
131-
synchronize::threadsAligned(atomic::acq_rel);
131+
synchronize::threadsAligned();
132132

133133
state::ParallelTeamSize.assert_eq(NumThreads, ident,
134134
/* ForceTeamState */ true);
135135
icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
136136
icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);
137137

138-
// Ensure we synchronize before we run user code to avoid invalidating the
139-
// assumptions above.
140-
synchronize::threadsAligned(atomic::relaxed);
141-
142138
if (TId < NumThreads)
143139
invokeMicrotask(TId, 0, fn, args, nargs);
144140

145141
// Synchronize all threads at the end of a parallel region.
146-
synchronize::threadsAligned(atomic::seq_cst);
142+
synchronize::threadsAligned();
147143
}
148144

149145
// Synchronize all threads to make sure every thread exits the scope above;
150146
// otherwise the following assertions and the assumption in
151147
// __kmpc_target_deinit may not hold.
152-
synchronize::threadsAligned(atomic::acq_rel);
148+
synchronize::threadsAligned();
153149

154150
state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
155151
icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
156152
icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
157-
158-
// Ensure we synchronize to create an aligned region around the assumptions.
159-
synchronize::threadsAligned(atomic::relaxed);
160-
161153
return;
162154
}
163155

@@ -251,9 +243,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
251243
/* ForceTeamState */ true);
252244

253245
// Master signals work to activate workers.
254-
synchronize::threads(atomic::seq_cst);
246+
synchronize::threads();
255247
// Master waits for workers to signal.
256-
synchronize::threads(atomic::seq_cst);
248+
synchronize::threads();
257249
}
258250

259251
if (nargs)

openmp/libomptarget/DeviceRTL/src/Synchronization.cpp

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,8 @@ void fenceTeam(atomic::OrderingTy Ordering);
123123
void fenceKernel(atomic::OrderingTy Ordering);
124124
void fenceSystem(atomic::OrderingTy Ordering);
125125
void syncWarp(__kmpc_impl_lanemask_t);
126-
void syncThreads(atomic::OrderingTy Ordering);
127-
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
126+
void syncThreads();
127+
void syncThreadsAligned() { syncThreads(); }
128128
void unsetLock(omp_lock_t *);
129129
int testLock(omp_lock_t *);
130130
void initLock(omp_lock_t *);
@@ -261,16 +261,8 @@ void syncWarp(__kmpc_impl_lanemask_t) {
261261
// AMDGCN doesn't need to sync threads in a warp
262262
}
263263

264-
void syncThreads(atomic::OrderingTy Ordering) {
265-
if (Ordering != atomic::relaxed)
266-
fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
267-
268-
__builtin_amdgcn_s_barrier();
269-
270-
if (Ordering != atomic::relaxed)
271-
fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
272-
}
273-
void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
264+
void syncThreads() { __builtin_amdgcn_s_barrier(); }
265+
void syncThreadsAligned() { syncThreads(); }
274266

275267
// TODO: Don't have wavefront lane locks. Possibly can't have them.
276268
void unsetLock(omp_lock_t *) { __builtin_trap(); }
@@ -335,12 +327,12 @@ void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
335327

336328
void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
337329

338-
void syncThreads(atomic::OrderingTy Ordering) {
330+
void syncThreads() {
339331
constexpr int BarrierNo = 8;
340332
asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
341333
}
342334

343-
void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
335+
void syncThreadsAligned() { __syncthreads(); }
344336

345337
constexpr uint32_t OMP_SPIN = 1000;
346338
constexpr uint32_t UNSET = 0;
@@ -389,13 +381,9 @@ void synchronize::init(bool IsSPMD) {
389381

390382
void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
391383

392-
void synchronize::threads(atomic::OrderingTy Ordering) {
393-
impl::syncThreads(Ordering);
394-
}
384+
void synchronize::threads() { impl::syncThreads(); }
395385

396-
void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
397-
impl::syncThreadsAligned(Ordering);
398-
}
386+
void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
399387

400388
void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
401389

@@ -516,13 +504,13 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
516504
__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
517505
int32_t TId) {
518506
FunctionTracingRAII();
519-
synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
507+
synchronize::threadsAligned();
520508
}
521509

522510
__attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc,
523511
int32_t TId) {
524512
FunctionTracingRAII();
525-
synchronize::threads(atomic::OrderingTy::seq_cst);
513+
synchronize::threads();
526514
}
527515

528516
int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {

openmp/libomptarget/test/offloading/barrier_fence.c

Lines changed: 0 additions & 75 deletions
This file was deleted.

0 commit comments

Comments
 (0)