@@ -124,10 +124,9 @@ struct KernelArgPool {
   uint32_t KernargSegmentSize;
   void *KernargRegion = nullptr;
   std::queue<int> FreeKernargSegments;
-  uint16_t CodeObjectVersion;
 
   uint32_t kernargSizeIncludingImplicit() {
-    return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
+    return KernargSegmentSize + sizeof(impl_implicit_args_t);
   }
 
   ~KernelArgPool() {
@@ -144,10 +143,8 @@ struct KernelArgPool {
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
-                uint16_t CodeObjectVersion)
-      : KernargSegmentSize(KernargSegmentSize),
-        CodeObjectVersion(CodeObjectVersion) {
+  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
+      : KernargSegmentSize(KernargSegmentSize) {
 
     // impl uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue<int>
@@ -231,16 +228,16 @@ struct KernelTy {
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
            int32_t DeviceId, void *CallStackAddr, const char *Name,
            uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
+           hsa_amd_memory_pool_t &KernArgMemoryPool)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
     std::string N(Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-      KernelArgPoolMap.insert(std::make_pair(
-          N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-                 KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
+      KernelArgPoolMap.insert(
+          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                                KernargSegmentSize, KernArgMemoryPool))));
     }
   }
 };
@@ -477,7 +474,6 @@ class RTLDeviceInfoTy : HSALifetime {
   std::vector<int> WarpSize;
   std::vector<std::string> GPUName;
   std::vector<std::string> TargetID;
-  uint16_t CodeObjectVersion;
 
   // OpenMP properties
   std::vector<int> NumTeams;
@@ -491,7 +487,6 @@ class RTLDeviceInfoTy : HSALifetime {
 
   // Resource pools
   SignalPoolT FreeSignalPool;
-  std::vector<void *> PreallocatedDeviceHeap;
 
   bool HostcallRequired = false;
 
@@ -866,6 +861,7 @@ class RTLDeviceInfoTy : HSALifetime {
            "Unexpected device id!");
     FuncGblEntries[DeviceId].emplace_back();
     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
+    // KernelArgPoolMap.clear();
     E.Entries.clear();
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
@@ -1036,7 +1032,6 @@ class RTLDeviceInfoTy : HSALifetime {
     SymbolInfoTable.resize(NumberOfDevices);
     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
-    PreallocatedDeviceHeap.resize(NumberOfDevices);
 
     Err = setupDevicePools(HSAAgents);
     if (Err != HSA_STATUS_SUCCESS) {
@@ -1366,27 +1361,6 @@ static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
   return PacketId;
 }
 
-const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
-  char *ImageBegin = (char *)Image->ImageStart;
-  size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
-
-  StringRef Buffer = StringRef(ImageBegin, ImageSize);
-  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
-                                                  /*InitContent=*/false);
-  if (!ElfOrErr) {
-    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
-    return 1;
-  }
-
-  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
-    auto Header = ELFObj->getELFFile().getHeader();
-    uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
-    DP("ELFABIVERSION Version: %u\n", Version);
-    return Version;
-  }
-  return 0;
-}
-
 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                         int32_t ThreadLimit, uint64_t LoopTripcount) {
@@ -1464,7 +1438,6 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
   }
   uint64_t PacketId = acquireAvailablePacketId(Queue);
 
-  uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
   const uint32_t Mask = Queue->size - 1; // size is a power of 2
   hsa_kernel_dispatch_packet_t *Packet =
       (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -2187,40 +2160,6 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
   return Res;
 }
 
-static void preAllocateHeapMemoryForCov5() {
-  void *DevPtr;
-  for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) {
-    DevPtr = nullptr;
-    size_t PreAllocSize = 131072; // 128KB per device
-
-    hsa_amd_memory_pool_t MemoryPool =
-        DeviceInfo().DeviceCoarseGrainedMemoryPools[I];
-    hsa_status_t Err =
-        hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr);
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("Error allocating preallocated heap device memory: %s\n",
-         get_error_string(Err));
-    }
-
-    Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], NULL,
-                                      DevPtr);
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
-         get_error_string(Err));
-    }
-
-    uint64_t Rounded =
-        sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
-    Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t));
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("Error zero-initializing preallocated heap device memory:%s\n",
-         get_error_string(Err));
-    }
-
-    DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr;
-  }
-}
-
 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
                                                  __tgt_device_image *Image) {
   // This function loads the device image onto gpu[DeviceId] and does other
@@ -2255,12 +2194,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
   if (!elfMachineIdIsAmdgcn(Image))
     return NULL;
 
-  DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
-  if (DeviceInfo().CodeObjectVersion >=
-      llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
-    preAllocateHeapMemoryForCov5();
-  }
-
   {
     auto Env =
         DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2584,8 +2517,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                    CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo().KernArgPool,
-                                   DeviceInfo().CodeObjectVersion));
+                                   DeviceInfo().KernArgPool));
     __tgt_offload_entry Entry = *E;
     Entry.addr = (void *)&KernelsList.back();
     DeviceInfo().addOffloadEntry(DeviceId, Entry);