-
Notifications
You must be signed in to change notification settings - Fork 13.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[clang][AMDGPU] Enable module splitting by default #128509
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-clang Author: Pierre van Houtryve (Pierre-vh) ChangesThe default number of partitions is the number of cores on the machine with a cap at 16, as going above 16 is unlikely to be useful in the common case. Adds a flto-partitions option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting. Fixes SWDEV-506214 Full diff: https://github.com/llvm/llvm-project/pull/128509.diff 8 Files Affected:
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index e521cbf678d93..6cb2fd1b755c1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1393,6 +1393,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">,
HelpText<"Compile HIP source to relocatable">;
def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
HelpText<"Do not override toolchain to compile HIP source to relocatable">;
+def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>,
+ HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">;
}
// Clang specific/exclusive options for OpenACC.
@@ -3158,7 +3160,7 @@ def modules_reduced_bmi : Flag<["-"], "fmodules-reduced-bmi">,
HelpText<"Generate the reduced BMI">,
MarshallingInfoFlag<FrontendOpts<"GenReducedBMI">>;
-def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">,
+def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">,
Group<f_Group>, Visibility<[ClangOption, CC1Option]>, Alias<modules_reduced_bmi>;
def fmodules_embed_all_files : Joined<["-"], "fmodules-embed-all-files">,
@@ -7417,7 +7419,7 @@ def fuse_register_sized_bitfield_access: Flag<["-"], "fuse-register-sized-bitfie
def relaxed_aliasing : Flag<["-"], "relaxed-aliasing">,
HelpText<"Turn off Type Based Alias Analysis">,
MarshallingInfoFlag<CodeGenOpts<"RelaxedAliasing">>;
-defm pointer_tbaa: BoolOption<"", "pointer-tbaa", CodeGenOpts<"PointerTBAA">,
+defm pointer_tbaa: BoolOption<"", "pointer-tbaa", CodeGenOpts<"PointerTBAA">,
DefaultTrue,
PosFlag<SetTrue, [], [ClangOption], "Enable">,
NegFlag<SetFalse, [], [ClangOption], "Disable">,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 6a35a2feabc9b..820a335a4b384 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -21,6 +21,7 @@
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/TargetParser/Host.h"
#include <optional>
@@ -630,8 +631,11 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
getToolChain().AddFilePathLibArgs(Args, CmdArgs);
AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
if (C.getDriver().isUsingLTO()) {
- addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
- C.getDriver().getLTOMode() == LTOK_Thin);
+ const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin);
+ addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO);
+
+ if (!ThinLTO)
+ addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs);
} else if (Args.hasArg(options::OPT_mcpu_EQ)) {
CmdArgs.push_back(Args.MakeArgString(
"-plugin-opt=mcpu=" +
@@ -708,6 +712,34 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
options::OPT_m_amdgpu_Features_Group);
}
+static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) {
+ const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ);
+ // In the absence of an option, use the number of available threads with a cap
+ // at 16 partitions. More than 16 partitions rarely benefits code splitting
+ // and can lead to more empty/small modules each with their own overhead.
+ if (!A)
+ return std::max(16u, llvm::hardware_concurrency().compute_thread_count());
+ int Value;
+ if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) {
+ D.Diag(diag::err_drv_invalid_int_value)
+ << A->getAsString(Args) << A->getValue();
+ return 1;
+ }
+
+ return Value;
+}
+
+void amdgpu::addFullLTOPartitionOption(const Driver &D,
+ const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs) {
+ // TODO: restrict to gpu-rdc only?
+
+ if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) {
+ CmdArgs.push_back(
+ Args.MakeArgString("--lto-partitions=" + std::to_string(NumParts)));
+ }
+}
+
/// AMDGPU Toolchain
AMDGPUToolChain::AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args)
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
index bc941a40445ad..08bd4fa556f78 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -41,6 +41,8 @@ void getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple,
const llvm::opt::ArgList &Args,
std::vector<StringRef> &Features);
+void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs);
} // end namespace amdgpu
} // end namespace tools
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 271626ed54aed..4f0511b272a98 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -116,6 +116,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
addLinkerCompressDebugSectionsOption(TC, Args, LldArgs);
+ amdgpu::addFullLTOPartitionOption(D, Args, LldArgs);
+
// Given that host and device linking happen in separate processes, the device
// linker doesn't always have the visibility as to which device symbols are
// needed by a program, especially for the device symbol dependencies that are
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index c1c5aa8e90e68..6617108e59fcf 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -19,10 +19,12 @@
// AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all"
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
-// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s
+// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s
+// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
+// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
+
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
// RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s
-// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
@@ -36,3 +38,17 @@
// RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \
// RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s
// STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o"
+
+// Check --flto-partitions
+
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s
+// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42"
+
+// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s
+// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
+
+// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
+// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s
+// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
diff --git a/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip
new file mode 100644
index 0000000000000..e345bd3f5be6b
--- /dev/null
+++ b/clang/test/Driver/hip-toolchain-rdc-flto-partitions.hip
@@ -0,0 +1,35 @@
+// RUN: %clang -### --target=x86_64-linux-gnu \
+// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \
+// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS
+
+// FIXED-PARTS-NOT: "*.llvm-link"
+// FIXED-PARTS-NOT: ".*opt"
+// FIXED-PARTS-NOT: ".*llc"
+// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
+// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803"
+// FIXED-PARTS-SAME: "--lto-partitions=42"
+// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}"
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \
+// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0
+
+// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \
+// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
+// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
+// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
+// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1
+
+// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
index 5276faf31bdc2..6f38a06f7cf31 100644
--- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
+++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip
@@ -49,6 +49,7 @@
// CHECK-NOT: ".*llc"
// CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx803"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]]
// generate image for device side path on gfx900
@@ -77,6 +78,7 @@
// CHECK-NOT: ".*llc"
// CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "--whole-archive"
// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]]
// CHECK-SAME: "--no-whole-archive"
diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip
index 96da423144c1c..9015702e3211a 100644
--- a/clang/test/Driver/hip-toolchain-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-rdc.hip
@@ -147,6 +147,7 @@
// CHECK-NOT: ".*llc"
// CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
+// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
// CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]]
// combine images generated into hip fat binary object
|
e58e1b4
to
10c286a
Compare
void amdgpu::addFullLTOPartitionOption(const Driver &D, | ||
const llvm::opt::ArgList &Args, | ||
llvm::opt::ArgStringList &CmdArgs) { | ||
// TODO: restrict to gpu-rdc only? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we support no-gpu-rdc and full LTO? I thought gpu-rdc turns on LTO.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No I meant restricting to LTO + fgpu-rdc.
Do we/can we use LTO without fgpu-rdc ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
-fno-gpu-rdc
mode doesn't use LTO because the clang -cc1
job just emits an ELF directly. However you could easily make the ELF job emit IR and it would go through the linker just fine.
The default number of partitions is the number of cores on the machine with a cap at 16, as going above 16 is unlikely to be useful in the common case. Adds a flto-partitions option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting. Fixes SWDEV-506214
315617d
to
6db0c7d
Compare
I'm okay with this change, but did you run a PSDB or even a full testing cycle? |
I'll get it tested internally ASAP. I wasn't confident in the driver changes so I was waiting for more feedback before doing it |
@shiltian PSDB passed, is it enough to land? |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/18769 Here is the relevant piece of the build log for the reference
|
This seems to have broken LTO linking / introduced some non-determinism. I noticed that the LLVM libc If we take this and then run it through the LTO pass things go wrong. > ld.lld --lto-emit-llvm start.o -r --lto-partitions=8 -plugin-opt=O3 --lto-CGO3 -o test.o This either results in some definitions being missing or a corrupted LLVM-IR file so it's non-deterministic. > opt -S test.o
opt: test.o: error: can't skip to bit 50048 from 42112 Can I revert this? |
Reverting this patch locally also fixes one of the |
@@ -708,6 +711,33 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D, | |||
options::OPT_m_amdgpu_Features_Group); | |||
} | |||
|
|||
static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) { | |||
const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be getLastArgValue
with a default of "8"
.
Thinking on this, might just be a very poor interaction between |
def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>, | ||
HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we added a top level option for this it probably should've been made to work for non-AMDGPU uses.
I'm thinking this should be default only for HIP (maybe OpenMP later) because the option is HIP only and it confounds attempts to override it by inserting a linker option otherwise. |
Summary: The llvm#128509 patch introduced `--flto-partitions`. This was marked as a HIP only argument, and was also spelled and handled incorrectly for an `-f` option. This patch makes the handling generic for `ld.lld` consumers. This also fixes some issues with emitting the flags being put after the default arguments, preventing users from overriding them. Also, forwards things properly for the new driver so we can test this.
…#133283) Summary: The #128509 patch introduced `--flto-partitions`. This was marked as a HIP only argument, and was also spelled and handled incorrectly for an `-f` option. This patch makes the handling generic for `ld.lld` consumers. This also fixes some issues with emitting the flags being put after the default arguments, preventing users from overriding them. Also, forwards things properly for the new driver so we can test this.
…it properly (#133283) Summary: The llvm/llvm-project#128509 patch introduced `--flto-partitions`. This was marked as a HIP only argument, and was also spelled and handled incorrectly for an `-f` option. This patch makes the handling generic for `ld.lld` consumers. This also fixes some issues with emitting the flags being put after the default arguments, preventing users from overriding them. Also, forwards things properly for the new driver so we can test this.
The default number of partitions is 8.
Adds a flto-partitions option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting.
Fixes SWDEV-506214