-
Notifications
You must be signed in to change notification settings - Fork 14.2k
[AMDGPU] fix amdgpu_max_num_work_groups in templates #141633
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Yaxun (Sam) Liu (yxsamliu) Changes: Clang does not instantiate the amdgpu_max_num_work_groups attribute with one template argument, causing an assertion failure in codegen. Fixes: #139570 Full diff: https://github.com/llvm/llvm-project/pull/141633.diff 2 Files Affected:
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 44700a446dfac..174c8fc59e4fa 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -648,21 +648,30 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
EnterExpressionEvaluationContext Unevaluated(
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
- ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
- if (!ResultX.isUsable())
- return;
- ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
- if (!ResultY.isUsable())
- return;
- ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
- if (!ResultZ.isUsable())
- return;
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getMaxNumWorkGroupsX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getMaxNumWorkGroupsY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
- Expr *XExpr = ResultX.getAs<Expr>();
- Expr *YExpr = ResultY.getAs<Expr>();
- Expr *ZExpr = ResultZ.getAs<Expr>();
+ if (Attr.getMaxNumWorkGroupsZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
- S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
+ if (XExpr)
+ S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
// This doesn't take any template parameters, but we have a custom action that
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index 253ac0898f546..ced0059e69d9b 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -78,6 +78,12 @@ __global__ void template_32_4_a_max_num_work_groups() {}
template __global__ void template_32_4_a_max_num_work_groups<2>();
// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_4_a_max_num_work_groupsILj2EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(a)))
+__global__ void template_a_max_num_work_groups() {}
+template __global__ void template_a_max_num_work_groups<32>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z30template_a_max_num_work_groupsILj32EEvv() [[MAX_NUM_WORK_GROUPS_32_1_1]]
+
// Make sure this is silently accepted on other targets.
// NAMD-NOT: "amdgpu-flat-work-group-size"
// NAMD-NOT: "amdgpu-waves-per-eu"
|
@llvm/pr-subscribers-clang Author: Yaxun (Sam) Liu (yxsamliu) Changes: Clang does not instantiate the amdgpu_max_num_work_groups attribute with one template argument, causing an assertion failure in codegen. Fixes: #139570 Full diff: https://github.com/llvm/llvm-project/pull/141633.diff 2 Files Affected:
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 44700a446dfac..174c8fc59e4fa 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -648,21 +648,30 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
EnterExpressionEvaluationContext Unevaluated(
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
- ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
- if (!ResultX.isUsable())
- return;
- ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
- if (!ResultY.isUsable())
- return;
- ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
- if (!ResultZ.isUsable())
- return;
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getMaxNumWorkGroupsX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getMaxNumWorkGroupsY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
- Expr *XExpr = ResultX.getAs<Expr>();
- Expr *YExpr = ResultY.getAs<Expr>();
- Expr *ZExpr = ResultZ.getAs<Expr>();
+ if (Attr.getMaxNumWorkGroupsZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
- S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
+ if (XExpr)
+ S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
// This doesn't take any template parameters, but we have a custom action that
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index 253ac0898f546..ced0059e69d9b 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -78,6 +78,12 @@ __global__ void template_32_4_a_max_num_work_groups() {}
template __global__ void template_32_4_a_max_num_work_groups<2>();
// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_4_a_max_num_work_groupsILj2EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(a)))
+__global__ void template_a_max_num_work_groups() {}
+template __global__ void template_a_max_num_work_groups<32>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z30template_a_max_num_work_groupsILj32EEvv() [[MAX_NUM_WORK_GROUPS_32_1_1]]
+
// Make sure this is silently accepted on other targets.
// NAMD-NOT: "amdgpu-flat-work-group-size"
// NAMD-NOT: "amdgpu-waves-per-eu"
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Copilot encountered an error and was unable to review this pull request. You can try again by re-requesting a review.
Clang does not instantiate the amdgpu_max_num_work_groups attribute with one template argument, causing an assertion failure in codegen. Fixes: llvm#139570
Clang does not instantiate the amdgpu_max_num_work_groups attribute with one template argument, causing an assertion failure in codegen. Fixes: llvm#139570
Clang does not instantiate the amdgpu_max_num_work_groups attribute with one template argument,
causing an assertion failure in codegen.
Fixes: #139570