@@ -124,9 +124,10 @@ struct KernelArgPool {
   uint32_t KernargSegmentSize;
   void *KernargRegion = nullptr;
   std::queue<int> FreeKernargSegments;
+  uint16_t CodeObjectVersion;
 
   uint32_t kernargSizeIncludingImplicit() {
-    return KernargSegmentSize + sizeof(impl_implicit_args_t);
+    return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
   }
 
   ~KernelArgPool() {
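
For readers following along, implicitArgsSize() is referenced in the hunk above but defined elsewhere in the plugin. A minimal sketch of such a helper is shown below; the 56-byte (COV4) and 256-byte (COV5) figures are assumptions based on the usual AMDGPU implicit-kernarg layouts, not lines taken from this patch.

// Hypothetical sketch of an implicitArgsSize() helper: choose the
// implicit-kernarg block size from the ELF ABI version. The sizes are
// assumed (56 bytes for the COV4-style impl_implicit_args_t, 256 bytes
// for the COV5 block); the real values live in the plugin's headers.
#include <cstdint>
#include "llvm/BinaryFormat/ELF.h"

static uint32_t implicitArgsSize(uint16_t Version) {
  return Version < llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5 ? 56u : 256u;
}
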
@@ -143,8 +144,10 @@ struct KernelArgPool {
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
-      : KernargSegmentSize(KernargSegmentSize) {
+  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
+                uint16_t CodeObjectVersion)
+      : KernargSegmentSize(KernargSegmentSize),
+        CodeObjectVersion(CodeObjectVersion) {
 
     // impl uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue<int>
@@ -228,16 +231,16 @@ struct KernelTy {
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
            int32_t DeviceId, void *CallStackAddr, const char *Name,
            uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool)
+           hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
     std::string N(Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-      KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-                                KernargSegmentSize, KernArgMemoryPool))));
+      KernelArgPoolMap.insert(std::make_pair(
+          N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                 KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
     }
   }
 };
@@ -474,6 +477,7 @@ class RTLDeviceInfoTy : HSALifetime {
   std::vector<int> WarpSize;
   std::vector<std::string> GPUName;
   std::vector<std::string> TargetID;
+  uint16_t CodeObjectVersion;
 
   // OpenMP properties
   std::vector<int> NumTeams;
@@ -487,6 +491,7 @@ class RTLDeviceInfoTy : HSALifetime {
 
   // Resource pools
   SignalPoolT FreeSignalPool;
+  std::vector<void *> PreallocatedDeviceHeap;
 
   bool HostcallRequired = false;
 
@@ -861,7 +866,6 @@ class RTLDeviceInfoTy : HSALifetime {
            "Unexpected device id!");
     FuncGblEntries[DeviceId].emplace_back();
     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-    // KernelArgPoolMap.clear();
     E.Entries.clear();
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
@@ -1032,6 +1036,7 @@ class RTLDeviceInfoTy : HSALifetime {
     SymbolInfoTable.resize(NumberOfDevices);
     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+    PreallocatedDeviceHeap.resize(NumberOfDevices);
 
     Err = setupDevicePools(HSAAgents);
     if (Err != HSA_STATUS_SUCCESS) {
@@ -1361,6 +1366,27 @@ static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
   return PacketId;
 }
 
+const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
+  char *ImageBegin = (char *)Image->ImageStart;
+  size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
+
+  StringRef Buffer = StringRef(ImageBegin, ImageSize);
+  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+                                                  /*InitContent=*/false);
+  if (!ElfOrErr) {
+    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
+    return 1;
+  }
+
+  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
+    auto Header = ELFObj->getELFFile().getHeader();
+    uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
+    DP("ELFABIVERSION Version: %u\n", Version);
+    return Version;
+  }
+  return 0;
+}
+
 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                         int32_t ThreadLimit, uint64_t LoopTripcount) {
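
The new getCodeObjectVersionFromELF() above returns the raw EI_ABIVERSION byte from the ELF header, which the AMDGPU toolchain uses to encode the code object version. A small sketch of that mapping follows; the constants noted in the comment are assumptions based on llvm/BinaryFormat/ELF.h and should be verified against the headers actually in use.

// Sketch: turn an EI_ABIVERSION byte into a human-readable code object
// version number. Assumed constants from llvm/BinaryFormat/ELF.h:
//   ELFABIVERSION_AMDGPU_HSA_V3 = 1, _V4 = 2, _V5 = 3.
#include <cstdint>
#include "llvm/BinaryFormat/ELF.h"

static unsigned codeObjectVersion(uint16_t AbiVersion) {
  switch (AbiVersion) {
  case llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V3:
    return 3;
  case llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V4:
    return 4;
  case llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5:
    return 5;
  default:
    return 0; // unknown or pre-COV3 image
  }
}
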
@@ -1438,6 +1464,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
   }
   uint64_t PacketId = acquireAvailablePacketId(Queue);
 
+  uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
   const uint32_t Mask = Queue->size - 1; // size is a power of 2
   hsa_kernel_dispatch_packet_t *Packet =
       (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -2160,6 +2187,40 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
   return Res;
 }
 
+static void preAllocateHeapMemoryForCov5() {
+  void *DevPtr;
+  for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) {
+    DevPtr = nullptr;
+    size_t PreAllocSize = 131072; // 128KB per device
+
+    hsa_amd_memory_pool_t MemoryPool =
+        DeviceInfo().DeviceCoarseGrainedMemoryPools[I];
+    hsa_status_t Err =
+        hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr);
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("Error allocating preallocated heap device memory: %s\n",
+         get_error_string(Err));
+    }
+
+    Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], NULL,
+                                      DevPtr);
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
+         get_error_string(Err));
+    }
+
+    uint64_t Rounded =
+        sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
+    Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t));
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("Error zero-initializing preallocated heap device memory:%s\n",
+         get_error_string(Err));
+    }
+
+    DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr;
+  }
+}
+
 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
                                                  __tgt_device_image *Image) {
   // This function loads the device image onto gpu[DeviceId] and does other
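
One note on the arithmetic inside preAllocateHeapMemoryForCov5() above: hsa_amd_memory_fill() takes a count of 32-bit words, so the byte size is first rounded up to a multiple of sizeof(uint32_t). With the 128KB default the rounding is a no-op; the worked example below uses only values already present in the patch.

// Worked example of the rounding used before hsa_amd_memory_fill():
//   PreAllocSize = 131072 bytes (already a multiple of 4)
//   Rounded      = 4 * ((131072 + 3) / 4) = 4 * 32768 = 131072 bytes
//   fill count   = Rounded / 4 = 32768 32-bit words
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const size_t PreAllocSize = 131072;
  const uint64_t Rounded =
      sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
  std::printf("Rounded = %llu bytes, fill count = %llu words\n",
              (unsigned long long)Rounded,
              (unsigned long long)(Rounded / sizeof(uint32_t)));
}
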
@@ -2194,6 +2255,12 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
   if (!elfMachineIdIsAmdgcn(Image))
     return NULL;
 
+  DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
+  if (DeviceInfo().CodeObjectVersion >=
+      llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
+    preAllocateHeapMemoryForCov5();
+  }
+
   {
     auto Env =
         DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2517,7 +2584,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
   KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                  CallStackAddr, E->name, KernargSegmentSize,
-                                 DeviceInfo().KernArgPool));
+                                 DeviceInfo().KernArgPool,
+                                 DeviceInfo().CodeObjectVersion));
   __tgt_offload_entry Entry = *E;
   Entry.addr = (void *)&KernelsList.back();
   DeviceInfo().addOffloadEntry(DeviceId, Entry);