Skip to content

Commit

Permalink
SWDEV-249870 - Short-Term solution for Pre-Compiled Headers for Onlin…
Browse files Browse the repository at this point in the history
…e Compilation

Change-Id: Ibcb365ce2ff27c4c2379609964078da42e1226b1
  • Loading branch information
agodavar authored and Anusha Godavarthy Surya committed Sep 9, 2020
1 parent bf0d8f3 commit 9e2fa6e
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ set(BUILD_SHARED_LIBS ON CACHE BOOL "Build shared library (.so) or static lib (

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

if(NOT ${BUILD_SHARED_LIBS} AND NOT DEFINED ENABLE_HIP_PCH)
set(ENABLE_HIP_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
endif()

#############################
# Options
#############################
Expand Down
58 changes: 58 additions & 0 deletions bin/hip_embed_pch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/bash

#set -x

ROCM_PATH=${ROCM_PATH:-/opt/rocm}
tmp=/tmp/hip_pch.$$
mkdir -p $tmp

cat >$tmp/hip_macros.h <<EOF
#define __device__ __attribute__((device))
#define __host__ __attribute__((host))
#define __global__ __attribute__((global))
#define __constant__ __attribute__((constant))
#define __shared__ __attribute__((shared))
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
// Macro to replace extern __shared__ declarations
// to local variable definitions
#define HIP_DYNAMIC_SHARED(type, var) \
type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
EOF

cat >$tmp/hip_pch.h <<EOF
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
EOF


cat >$tmp/hip_pch.mcin <<EOF
.type __hip_pch,@object
.section .hip_pch,"aMS",@progbits,1
.data
.globl __hip_pch
.globl __hip_pch_size
.p2align 3
__hip_pch:
.incbin "$tmp/hip.pch"
__hip_pch_size:
.long __hip_pch_size - __hip_pch
EOF

$ROCM_PATH/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui

cat $tmp/hip_macros.h >> $tmp/pch.cui

$ROCM_PATH/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui

$ROCM_PATH/llvm/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj

rm -rf $tmp
36 changes: 36 additions & 0 deletions bin/hip_gen_pch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

#set -x

cat >/tmp/hip_macros.h <<EOF
#define __device__ __attribute__((device))
#define __host__ __attribute__((host))
#define __global__ __attribute__((global))
#define __constant__ __attribute__((constant))
#define __shared__ __attribute__((shared))
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
// Macro to replace extern __shared__ declarations
// to local variable definitions
#define HIP_DYNAMIC_SHARED(type, var) \
type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
EOF

cat >/tmp/hip_pch.h <<EOF
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
EOF

/opt/rocm/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip /tmp/hip_pch.h -E >/tmp/pch.cui

cat /tmp/hip_macros.h >> /tmp/pch.cui

/opt/rocm/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip.pch -x hip-cpp-output - </tmp/pch.cui
6 changes: 6 additions & 0 deletions include/hip/hcc_detail/hip_runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,12 @@ typedef struct hipLaunchParams_t {
hipStream_t stream; ///< Stream identifier
} hipLaunchParams;

// Pre-Compiled header for online compilation
#ifdef ENABLE_HIP_PCH
extern const char* __hip_pch;
extern unsigned __hip_pch_size;
void __hipGetPCH(const char** pch, unsigned int*size);
#endif

// Doxygen end group GlobalDefs
/** @} */
Expand Down
20 changes: 15 additions & 5 deletions rocclr/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,15 @@ if(ROCclr_FOUND)
$<TARGET_PROPERTY:amdrocclr_static,INTERFACE_COMPILE_DEFINITIONS>)
endif()

# Enable profiling API
# Short-Term solution for pre-compiled headers for online compilation
# Enable pre compiled header
if(${ENABLE_HIP_PCH})
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_gen_pch.sh")
execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh")
add_definitions(-DENABLE_HIP_PCH)
endif()

# Enable profiling API
if(USE_PROF_API EQUAL 1)
find_path(PROF_API_HEADER_DIR prof_protocol.h
HINTS
Expand Down Expand Up @@ -205,17 +213,21 @@ target_link_libraries(host INTERFACE hip::amdhip64)

add_library(device INTERFACE)
target_link_libraries(device INTERFACE host)

# Short-Term solution for pre-compiled headers for online compilation
if(${ENABLE_HIP_PCH})
target_link_libraries(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o)
endif()

# TODO: we may create host_static and device_static to let app
# link amdhip64_static

# FIXME: Linux convention is to create static library with same base
# filename.

if(${BUILD_SHARED_LIBS})
target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64)
INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
else()

target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr)
# combine objects of vid and hip into amdhip64_static
add_custom_target(
Expand All @@ -228,9 +240,7 @@ else()
DEPENDS amdhip64 amdrocclr_static # To make sure this is the last step
COMMENT "Combining static libs into amdhip64_static"
)

INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)

endif()

INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR})
Expand Down
7 changes: 7 additions & 0 deletions rocclr/hip_global.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
#include "hip_code_object.hpp"
#include "platform/program.hpp"

#ifdef ENABLE_HIP_PCH
void __hipGetPCH(const char** pch, unsigned int *size) {
*pch = __hip_pch;
*size = __hip_pch_size;
}
#endif

namespace hip {

//Device Vars
Expand Down
1 change: 1 addition & 0 deletions rocclr/hip_hcc.map.in
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ global:
hipMemcpyAtoH;
hipMemcpyHtoA;
hipMemcpyParam2DAsync;
__hipGetPCH;
};
local:
*;
Expand Down

0 comments on commit 9e2fa6e

Please sign in to comment.