diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs
index 647cc0ec752..a1a2aa105b0 100644
--- a/Tools/GNUMake/Make.defs
+++ b/Tools/GNUMake/Make.defs
@@ -267,6 +267,9 @@ ifeq ($(USE_CUDA),TRUE)
   # Limit the maximum number of registers available.
   CUDA_MAXREGCOUNT ?= 255
 
+  # Device link-time optimization (opt-in): TRUE compiles to code=lto_XX and links with -dlto.
+  CUDA_LTO ?= FALSE
+
   # Enable verbosity in the CUDA compilation.
   CUDA_VERBOSE ?= TRUE
 endif
@@ -1176,7 +1179,7 @@ else ifeq ($(USE_CUDA),TRUE)
   endif
 
   ifneq ($(LINK_WITH_FORTRAN_COMPILER),TRUE)
-    LINKFLAGS = $(NVCC_FLAGS) $(CXXFLAGS_FROM_HOST)
+    LINKFLAGS = $(NVCC_FLAGS) $(NVCC_ARCH_LINK_FLAGS) $(CXXFLAGS_FROM_HOST)
     AMREX_LINKER = nvcc
   endif
 
diff --git a/Tools/GNUMake/comps/nvcc.mak b/Tools/GNUMake/comps/nvcc.mak
index bd79969a29a..fe9af60f45c 100644
--- a/Tools/GNUMake/comps/nvcc.mak
+++ b/Tools/GNUMake/comps/nvcc.mak
@@ -90,7 +90,21 @@ else
   CFLAGS_FROM_HOST := $(CXXFLAGS_FROM_HOST)
 endif
 
-NVCC_FLAGS = -Wno-deprecated-gpu-targets -m64 $(foreach arch,$(CUDA_ARCH),--generate-code arch=compute_$(arch),code=sm_$(arch)) -maxrregcount=$(CUDA_MAXREGCOUNT) --expt-relaxed-constexpr --expt-extended-lambda --forward-unknown-to-host-compiler
+# One --generate-code entry per value listed in CUDA_ARCH.
+NVCC_ARCH_FLAGS = $(foreach arch,$(CUDA_ARCH),--generate-code arch=compute_$(arch),code=sm_$(arch))
+
+# The architecture flags are kept out of NVCC_FLAGS because, with CUDA_LTO=TRUE,
+# the compile step (code=lto_XX) and the link step (-dlto plus code=sm_XX) differ.
+ifeq ($(CUDA_LTO),TRUE)
+  # Rewrite only the code= target; a bare "sm" pattern would match any other "sm" substring.
+  NVCC_ARCH_COMPILE_FLAGS = $(subst code=sm_,code=lto_,$(NVCC_ARCH_FLAGS))
+  NVCC_ARCH_LINK_FLAGS = -dlto $(NVCC_ARCH_FLAGS)
+else
+  NVCC_ARCH_COMPILE_FLAGS = $(NVCC_ARCH_FLAGS)
+  NVCC_ARCH_LINK_FLAGS = $(NVCC_ARCH_FLAGS)
+endif
+
+NVCC_FLAGS = -Wno-deprecated-gpu-targets -m64 -maxrregcount=$(CUDA_MAXREGCOUNT) --expt-relaxed-constexpr --expt-extended-lambda --forward-unknown-to-host-compiler
 # This is to work around a bug with nvcc, see: https://github.com/kokkos/kokkos/issues/1473
 NVCC_FLAGS += -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored
 # and another bug related to implicit returns with if constexpr, see: https://stackoverflow.com/questions/64523302/cuda-missing-return-statement-at-end-of-non-void-function-in-constexpr-if-fun
@@ -145,8 +159,8 @@ ifeq ($(nvcc_diag_error),1)
   NVCC_FLAGS += --display-error-number --diag-error 20092
 endif
 
-CXXFLAGS = $(CXXFLAGS_FROM_HOST) $(NVCC_FLAGS) -x cu
-CFLAGS = $(CFLAGS_FROM_HOST) $(NVCC_FLAGS) -x cu
+CXXFLAGS = $(CXXFLAGS_FROM_HOST) $(NVCC_FLAGS) $(NVCC_ARCH_COMPILE_FLAGS) -x cu
+CFLAGS = $(CFLAGS_FROM_HOST) $(NVCC_FLAGS) $(NVCC_ARCH_COMPILE_FLAGS) -x cu
 
 ifeq ($(USE_GPU_RDC),TRUE)
   CXXFLAGS += -dc