Skip to content

Commit b09a5e5

Browse files
committed
Revert "Add mean_anyway to hpc config"
My bad, wrong repo, so sorry. This reverts commit 0b9350f.
1 parent 2a67014 commit b09a5e5

File tree

8 files changed

+27
-172
lines changed

8 files changed

+27
-172
lines changed

openmp/libomptarget/DeviceRTL/src/Mapping.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
#include "Utils.h"
1717

1818
#pragma omp begin declare target device_type(nohost)
19-
extern const uint16_t __oclc_ABI_version;
19+
2020
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
2121

2222
using namespace _OMP;

openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@
1111
// identifier) and contains more up to date values for the enum checked here.
1212
// rtl.cpp uses the system elf.h.
1313
#include "llvm/BinaryFormat/ELF.h"
14-
using namespace llvm::ELF;
1514

1615
const char *get_elf_mach_gfx_name(uint32_t EFlags) {
1716
using namespace llvm::ELF;
@@ -79,8 +78,3 @@ const char *get_elf_mach_gfx_name(uint32_t EFlags) {
7978
return "--unknown gfx";
8079
}
8180
}
82-
83-
const uint16_t implicitArgsSize(uint16_t Version) {
84-
return Version < ELFABIVERSION_AMDGPU_HSA_V5 ? IMPLICITARGS::COV4_SIZE
85-
: IMPLICITARGS::COV5_SIZE;
86-
}

openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h

Lines changed: 0 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -12,49 +12,4 @@
1212

1313
const char *get_elf_mach_gfx_name(uint32_t EFlags);
1414

15-
enum IMPLICITARGS : uint16_t {
16-
COV4_SIZE = 56,
17-
COV4_HOSTCALL_PTR_OFFSET = 24,
18-
HOSTCALL_PTR_SIZE = 8,
19-
20-
COV5_SIZE = 256,
21-
22-
COV5_BLOCK_COUNT_X_OFFSET = 0,
23-
COV5_BLOCK_COUNT_X_SIZE = 4,
24-
25-
COV5_BLOCK_COUNT_Y_OFFSET = 4,
26-
COV5_BLOCK_COUNT_Y_SIZE = 4,
27-
28-
COV5_BLOCK_COUNT_Z_OFFSET = 8,
29-
COV5_BLOCK_COUNT_Z_SIZE = 4,
30-
31-
COV5_GROUP_SIZE_X_OFFSET = 12,
32-
COV5_GROUP_SIZE_X_SIZE = 2,
33-
34-
COV5_GROUP_SIZE_Y_OFFSET = 14,
35-
COV5_GROUP_SIZE_Y_SIZE = 2,
36-
37-
COV5_GROUP_SIZE_Z_OFFSET = 16,
38-
COV5_GROUP_SIZE_Z_SIZE = 2,
39-
40-
COV5_REMAINDER_X_OFFSET = 18,
41-
COV5_REMAINDER_X_SIZE = 2,
42-
43-
COV5_REMAINDER_Y_OFFSET = 20,
44-
COV5_REMAINDER_Y_SIZE = 2,
45-
46-
COV5_REMAINDER_Z_OFFSET = 22,
47-
COV5_REMAINDER_Z_SIZE = 2,
48-
49-
COV5_GRID_DIMS_OFFSET = 64,
50-
COV5_GRID_DIMS_SIZE = 2,
51-
52-
COV5_HOSTCALL_PTR_OFFSET = 80,
53-
54-
COV5_HEAPV1_PTR_OFFSET = 96,
55-
COV5_HEAPV1_PTR_SIZE = 8
56-
};
57-
58-
const uint16_t implicitArgsSize(uint16_t Version);
59-
6015
#endif

openmp/libomptarget/plugins/amdgpu/impl/internal.h

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,17 @@
3333

3434
#define MAX_NUM_KERNELS (1024 * 16)
3535

36+
typedef struct impl_implicit_args_s {
37+
uint64_t offset_x;
38+
uint64_t offset_y;
39+
uint64_t offset_z;
40+
uint64_t hostcall_ptr;
41+
uint64_t unused0;
42+
uint64_t unused1;
43+
uint64_t unused2;
44+
} impl_implicit_args_t;
45+
static_assert(sizeof(impl_implicit_args_t) == 56, "");
46+
3647
// ---------------------- Kernel Start -------------
3748
typedef struct atl_kernel_info_s {
3849
uint64_t kernel_object;

openmp/libomptarget/plugins/amdgpu/impl/system.cpp

Lines changed: 4 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -67,17 +67,6 @@ class KernelArgMD {
6767
HiddenMultiGridSyncArg,
6868
HiddenHostcallBuffer,
6969
HiddenHeapV1,
70-
HiddenBlockCountX,
71-
HiddenBlockCountY,
72-
HiddenBlockCountZ,
73-
HiddenGroupSizeX,
74-
HiddenGroupSizeY,
75-
HiddenGroupSizeZ,
76-
HiddenRemainderX,
77-
HiddenRemainderY,
78-
HiddenRemainderZ,
79-
HiddenGridDims,
80-
HiddenQueuePtr,
8170
Unknown
8271
};
8372

@@ -113,19 +102,7 @@ static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
113102
{"hidden_multigrid_sync_arg",
114103
KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
115104
{"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
116-
{"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1},
117-
{"hidden_block_count_x", KernelArgMD::ValueKind::HiddenBlockCountX},
118-
{"hidden_block_count_y", KernelArgMD::ValueKind::HiddenBlockCountY},
119-
{"hidden_block_count_z", KernelArgMD::ValueKind::HiddenBlockCountZ},
120-
{"hidden_group_size_x", KernelArgMD::ValueKind::HiddenGroupSizeX},
121-
{"hidden_group_size_y", KernelArgMD::ValueKind::HiddenGroupSizeY},
122-
{"hidden_group_size_z", KernelArgMD::ValueKind::HiddenGroupSizeZ},
123-
{"hidden_remainder_x", KernelArgMD::ValueKind::HiddenRemainderX},
124-
{"hidden_remainder_y", KernelArgMD::ValueKind::HiddenRemainderY},
125-
{"hidden_remainder_z", KernelArgMD::ValueKind::HiddenRemainderZ},
126-
{"hidden_grid_dims", KernelArgMD::ValueKind::HiddenGridDims},
127-
{"hidden_queue_ptr", KernelArgMD::ValueKind::HiddenQueuePtr},
128-
};
105+
{"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}};
129106

130107
namespace core {
131108

@@ -187,17 +164,6 @@ static bool isImplicit(KernelArgMD::ValueKind value_kind) {
187164
case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
188165
case KernelArgMD::ValueKind::HiddenHostcallBuffer:
189166
case KernelArgMD::ValueKind::HiddenHeapV1:
190-
case KernelArgMD::ValueKind::HiddenBlockCountX:
191-
case KernelArgMD::ValueKind::HiddenBlockCountY:
192-
case KernelArgMD::ValueKind::HiddenBlockCountZ:
193-
case KernelArgMD::ValueKind::HiddenGroupSizeX:
194-
case KernelArgMD::ValueKind::HiddenGroupSizeY:
195-
case KernelArgMD::ValueKind::HiddenGroupSizeZ:
196-
case KernelArgMD::ValueKind::HiddenRemainderX:
197-
case KernelArgMD::ValueKind::HiddenRemainderY:
198-
case KernelArgMD::ValueKind::HiddenRemainderZ:
199-
case KernelArgMD::ValueKind::HiddenGridDims:
200-
case KernelArgMD::ValueKind::HiddenQueuePtr:
201167
return true;
202168
default:
203169
return false;
@@ -507,29 +473,26 @@ static hsa_status_t get_code_object_custom_metadata(
507473
size_t new_offset = lcArg.offset_;
508474
size_t padding = new_offset - offset;
509475
offset = new_offset;
510-
476+
DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_,
477+
lcArg.offset_);
511478
offset += lcArg.size_;
512479

513480
// check if the arg is a hidden/implicit arg
514481
// this logic assumes that all hidden args are 8-byte aligned
515482
if (!isImplicit(lcArg.valueKind_)) {
516483
info.explicit_argument_count++;
517484
kernel_explicit_args_size += lcArg.size_;
518-
DP("Explicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
519-
lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
520485
} else {
521486
info.implicit_argument_count++;
522487
hasHiddenArgs = true;
523-
DP("Implicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
524-
lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
525488
}
526489
kernel_explicit_args_size += padding;
527490
}
528491
}
529492

530493
// TODO: Probably don't want this arithmetic
531494
info.kernel_segment_size =
532-
(!hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
495+
(hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
533496
DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
534497
kernel_segment_size, info.kernel_segment_size);
535498

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Lines changed: 9 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -124,10 +124,9 @@ struct KernelArgPool {
124124
uint32_t KernargSegmentSize;
125125
void *KernargRegion = nullptr;
126126
std::queue<int> FreeKernargSegments;
127-
uint16_t CodeObjectVersion;
128127

129128
uint32_t kernargSizeIncludingImplicit() {
130-
return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
129+
return KernargSegmentSize + sizeof(impl_implicit_args_t);
131130
}
132131

133132
~KernelArgPool() {
@@ -144,10 +143,8 @@ struct KernelArgPool {
144143
KernelArgPool(const KernelArgPool &) = delete;
145144
KernelArgPool(KernelArgPool &&) = delete;
146145

147-
KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
148-
uint16_t CodeObjectVersion)
149-
: KernargSegmentSize(KernargSegmentSize),
150-
CodeObjectVersion(CodeObjectVersion) {
146+
KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
147+
: KernargSegmentSize(KernargSegmentSize) {
151148

152149
// impl uses one pool per kernel for all gpus, with a fixed upper size
153150
// preserving that exact scheme here, including the queue<int>
@@ -231,16 +228,16 @@ struct KernelTy {
231228
KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
232229
int32_t DeviceId, void *CallStackAddr, const char *Name,
233230
uint32_t KernargSegmentSize,
234-
hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
231+
hsa_amd_memory_pool_t &KernArgMemoryPool)
235232
: ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
236233
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
237234
DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
238235

239236
std::string N(Name);
240237
if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
241-
KernelArgPoolMap.insert(std::make_pair(
242-
N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
243-
KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
238+
KernelArgPoolMap.insert(
239+
std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
240+
KernargSegmentSize, KernArgMemoryPool))));
244241
}
245242
}
246243
};
@@ -477,7 +474,6 @@ class RTLDeviceInfoTy : HSALifetime {
477474
std::vector<int> WarpSize;
478475
std::vector<std::string> GPUName;
479476
std::vector<std::string> TargetID;
480-
uint16_t CodeObjectVersion;
481477

482478
// OpenMP properties
483479
std::vector<int> NumTeams;
@@ -491,7 +487,6 @@ class RTLDeviceInfoTy : HSALifetime {
491487

492488
// Resource pools
493489
SignalPoolT FreeSignalPool;
494-
std::vector<void *> PreallocatedDeviceHeap;
495490

496491
bool HostcallRequired = false;
497492

@@ -866,6 +861,7 @@ class RTLDeviceInfoTy : HSALifetime {
866861
"Unexpected device id!");
867862
FuncGblEntries[DeviceId].emplace_back();
868863
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
864+
// KernelArgPoolMap.clear();
869865
E.Entries.clear();
870866
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
871867
}
@@ -1036,7 +1032,6 @@ class RTLDeviceInfoTy : HSALifetime {
10361032
SymbolInfoTable.resize(NumberOfDevices);
10371033
DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
10381034
DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
1039-
PreallocatedDeviceHeap.resize(NumberOfDevices);
10401035

10411036
Err = setupDevicePools(HSAAgents);
10421037
if (Err != HSA_STATUS_SUCCESS) {
@@ -1366,27 +1361,6 @@ static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
13661361
return PacketId;
13671362
}
13681363

1369-
const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
1370-
char *ImageBegin = (char *)Image->ImageStart;
1371-
size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
1372-
1373-
StringRef Buffer = StringRef(ImageBegin, ImageSize);
1374-
auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1375-
/*InitContent=*/false);
1376-
if (!ElfOrErr) {
1377-
REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
1378-
return 1;
1379-
}
1380-
1381-
if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
1382-
auto Header = ELFObj->getELFFile().getHeader();
1383-
uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
1384-
DP("ELFABIVERSION Version: %u\n", Version);
1385-
return Version;
1386-
}
1387-
return 0;
1388-
}
1389-
13901364
int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
13911365
ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
13921366
int32_t ThreadLimit, uint64_t LoopTripcount) {
@@ -1464,7 +1438,6 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
14641438
}
14651439
uint64_t PacketId = acquireAvailablePacketId(Queue);
14661440

1467-
uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
14681441
const uint32_t Mask = Queue->size - 1; // size is a power of 2
14691442
hsa_kernel_dispatch_packet_t *Packet =
14701443
(hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -2187,40 +2160,6 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
21872160
return Res;
21882161
}
21892162

2190-
static void preAllocateHeapMemoryForCov5() {
2191-
void *DevPtr;
2192-
for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) {
2193-
DevPtr = nullptr;
2194-
size_t PreAllocSize = 131072; // 128KB per device
2195-
2196-
hsa_amd_memory_pool_t MemoryPool =
2197-
DeviceInfo().DeviceCoarseGrainedMemoryPools[I];
2198-
hsa_status_t Err =
2199-
hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr);
2200-
if (Err != HSA_STATUS_SUCCESS) {
2201-
DP("Error allocating preallocated heap device memory: %s\n",
2202-
get_error_string(Err));
2203-
}
2204-
2205-
Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], NULL,
2206-
DevPtr);
2207-
if (Err != HSA_STATUS_SUCCESS) {
2208-
DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
2209-
get_error_string(Err));
2210-
}
2211-
2212-
uint64_t Rounded =
2213-
sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
2214-
Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t));
2215-
if (Err != HSA_STATUS_SUCCESS) {
2216-
DP("Error zero-initializing preallocated heap device memory:%s\n",
2217-
get_error_string(Err));
2218-
}
2219-
2220-
DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr;
2221-
}
2222-
}
2223-
22242163
__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
22252164
__tgt_device_image *Image) {
22262165
// This function loads the device image onto gpu[DeviceId] and does other
@@ -2255,12 +2194,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
22552194
if (!elfMachineIdIsAmdgcn(Image))
22562195
return NULL;
22572196

2258-
DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
2259-
if (DeviceInfo().CodeObjectVersion >=
2260-
llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
2261-
preAllocateHeapMemoryForCov5();
2262-
}
2263-
22642197
{
22652198
auto Env =
22662199
DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2584,8 +2517,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
25842517

25852518
KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
25862519
CallStackAddr, E->name, KernargSegmentSize,
2587-
DeviceInfo().KernArgPool,
2588-
DeviceInfo().CodeObjectVersion));
2520+
DeviceInfo().KernArgPool));
25892521
__tgt_offload_entry Entry = *E;
25902522
Entry.addr = (void *)&KernelsList.back();
25912523
DeviceInfo().addOffloadEntry(DeviceId, Entry);

openmp/runtime/cmake/LibompHandleFlags.cmake

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ function(libomp_get_ldflags ldflags)
100100
libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
101101
IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG)
102102
libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
103-
libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
103+
libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
104104
libomp_append(ldflags_local "-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
105105
libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
106106
libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)

openmp/runtime/cmake/config-ix.cmake

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ if(WIN32)
131131
elseif(NOT APPLE)
132132
libomp_check_linker_flag(-Wl,-x LIBOMP_HAVE_X_FLAG)
133133
libomp_check_linker_flag(-Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
134-
libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
134+
libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
135135
libomp_check_linker_flag("-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
136136
libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
137137
libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)

0 commit comments

Comments
 (0)