@@ -124,10 +124,9 @@ struct KernelArgPool {
   uint32_t KernargSegmentSize;
   void *KernargRegion = nullptr;
   std::queue<int> FreeKernargSegments;
-  uint16_t CodeObjectVersion;
 
   uint32_t kernargSizeIncludingImplicit() {
-    return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
+    return KernargSegmentSize + sizeof(impl_implicit_args_t);
   }
 
   ~KernelArgPool() {
@@ -144,10 +143,8 @@ struct KernelArgPool {
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
-                uint16_t CodeObjectVersion)
-      : KernargSegmentSize(KernargSegmentSize),
-        CodeObjectVersion(CodeObjectVersion) {
+  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
+      : KernargSegmentSize(KernargSegmentSize) {
 
     // impl uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue<int>
@@ -231,16 +228,16 @@ struct KernelTy {
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
            int32_t DeviceId, void *CallStackAddr, const char *Name,
            uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
+           hsa_amd_memory_pool_t &KernArgMemoryPool)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
     std::string N(Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-      KernelArgPoolMap.insert(std::make_pair(
-          N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-                 KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
+      KernelArgPoolMap.insert(
+          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                                KernargSegmentSize, KernArgMemoryPool))));
     }
   }
 };
@@ -477,7 +474,6 @@ class RTLDeviceInfoTy : HSALifetime {
   std::vector<int> WarpSize;
   std::vector<std::string> GPUName;
   std::vector<std::string> TargetID;
-  uint16_t CodeObjectVersion;
 
   // OpenMP properties
   std::vector<int> NumTeams;
@@ -491,7 +487,6 @@ class RTLDeviceInfoTy : HSALifetime {
 
   // Resource pools
   SignalPoolT FreeSignalPool;
-  std::vector<void *> PreallocatedDeviceHeap;
 
   bool HostcallRequired = false;
 
@@ -866,6 +861,7 @@ class RTLDeviceInfoTy : HSALifetime {
            "Unexpected device id!");
     FuncGblEntries[DeviceId].emplace_back();
     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
+    // KernelArgPoolMap.clear();
     E.Entries.clear();
     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
   }
@@ -1036,7 +1032,6 @@ class RTLDeviceInfoTy : HSALifetime {
     SymbolInfoTable.resize(NumberOfDevices);
     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
-    PreallocatedDeviceHeap.resize(NumberOfDevices);
 
     Err = setupDevicePools(HSAAgents);
     if (Err != HSA_STATUS_SUCCESS) {
@@ -1366,27 +1361,6 @@ static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
   return PacketId;
 }
 
-const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
-  char *ImageBegin = (char *)Image->ImageStart;
-  size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
-
-  StringRef Buffer = StringRef(ImageBegin, ImageSize);
-  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
-                                                  /*InitContent=*/false);
-  if (!ElfOrErr) {
-    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
-    return 1;
-  }
-
-  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
-    auto Header = ELFObj->getELFFile().getHeader();
-    uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
-    DP("ELFABIVERSION Version: %u\n", Version);
-    return Version;
-  }
-  return 0;
-}
-
 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                         int32_t ThreadLimit, uint64_t LoopTripcount) {
@@ -1464,7 +1438,6 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
   }
   uint64_t PacketId = acquireAvailablePacketId(Queue);
 
-  uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
   const uint32_t Mask = Queue->size - 1; // size is a power of 2
   hsa_kernel_dispatch_packet_t *Packet =
       (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -2187,40 +2160,6 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
   return Res;
 }
 
-static void preAllocateHeapMemoryForCov5() {
-  void *DevPtr;
-  for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) {
-    DevPtr = nullptr;
-    size_t PreAllocSize = 131072; // 128KB per device
-
-    hsa_amd_memory_pool_t MemoryPool =
-        DeviceInfo().DeviceCoarseGrainedMemoryPools[I];
-    hsa_status_t Err =
-        hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr);
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("Error allocating preallocated heap device memory: %s\n",
-         get_error_string(Err));
-    }
-
-    Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], NULL,
-                                      DevPtr);
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
-         get_error_string(Err));
-    }
-
-    uint64_t Rounded =
-        sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
-    Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t));
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("Error zero-initializing preallocated heap device memory:%s\n",
-         get_error_string(Err));
-    }
-
-    DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr;
-  }
-}
-
 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
                                                  __tgt_device_image *Image) {
   // This function loads the device image onto gpu[DeviceId] and does other
@@ -2255,12 +2194,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
   if (!elfMachineIdIsAmdgcn(Image))
     return NULL;
 
-  DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
-  if (DeviceInfo().CodeObjectVersion >=
-      llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
-    preAllocateHeapMemoryForCov5();
-  }
-
   {
     auto Env =
         DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2584,8 +2517,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
 
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                    CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo().KernArgPool,
-                                   DeviceInfo().CodeObjectVersion));
+                                   DeviceInfo().KernArgPool));
     __tgt_offload_entry Entry = *E;
     Entry.addr = (void *)&KernelsList.back();
     DeviceInfo().addOffloadEntry(DeviceId, Entry);