-
Notifications
You must be signed in to change notification settings - Fork 13.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP] Sort PHIs by ExtractElements when relevant #131229
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Jeffrey Byrnes (jrbyrnes) ChangesConsidering the PHIs in order of element extracted can lead to better shuffles Full diff: https://github.com/llvm/llvm-project/pull/131229.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a9f61d7a9798a..46677db052897 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22687,8 +22687,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
- if (S && !S.isAltShuffle())
+ if (S && !S.isAltShuffle()) {
+ if (!isa<ExtractElementInst>(I1) || !isa<ExtractElementInst>(I2))
+ continue;
+
+ auto E1 = cast<ExtractElementInst>(I1);
+ auto E2 = cast<ExtractElementInst>(I2);
+ // Sort on ExtractElementInsts primarily by vector operands. Prefer
+ // program order of the vector operands
+ if (E1->getVectorOperand() != E2->getVectorOperand()) {
+ Instruction *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
+ Instruction *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
+ if (!V1 || !V2)
+ continue;
+ if (V1->getParent() != V2->getParent())
+ continue;
+ return V1->comesBefore(V2);
+ }
+ // If we have the same vector operand, try to sort by constant index
+ auto Id1 = E1->getIndexOperand();
+ auto Id2 = E2->getIndexOperand();
+ // Bring constants to the top
+ if (isa<ConstantInt>(Id1) && !isa<ConstantInt>(Id2))
+ return true;
+ if (!isa<ConstantInt>(Id1) && isa<ConstantInt>(Id2))
+ return false;
+ if (isa<ConstantInt>(Id1) && isa<ConstantInt>(Id2)) {
+ auto C1 = cast<ConstantInt>(Id1);
+ auto C2 = cast<ConstantInt>(Id2);
+ // First elements first
+ return C1->getValue().getZExtValue() <
+ C2->getValue().getZExtValue();
+ }
+
continue;
+ }
return I1->getOpcode() < I2->getOpcode();
}
if (I1)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll
new file mode 100644
index 0000000000000..c585a7f08ad0c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=GFX9
+
+define protected amdgpu_kernel void @myfun(i32 %in, ptr addrspace(1) %aptr1, ptr addrspace(1) %bptr1, ptr addrspace(1) %aptr2, ptr addrspace(1) %bptr2) {
+; GFX9-LABEL: define protected amdgpu_kernel void @myfun(
+; GFX9-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[APTR1:%.*]], ptr addrspace(1) [[BPTR1:%.*]], ptr addrspace(1) [[APTR2:%.*]], ptr addrspace(1) [[BPTR2:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX9-NEXT: [[ENTRY:.*]]:
+; GFX9-NEXT: [[VEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR1]], align 16
+; GFX9-NEXT: [[BVEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR1]], align 16
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT: br label %[[DO_BODY:.*]]
+; GFX9: [[DO_BODY]]:
+; GFX9-NEXT: [[ADD:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEWADD:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP31:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP33:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP36:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP37:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP16:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP17:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP18:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP19:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP18]], <8 x i16> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP21:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP20]], <8 x i16> [[TMP21]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; GFX9-NEXT: [[TMP23:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP26:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP27:%.*]] = shufflevector <8 x i16> [[TMP25]], <8 x i16> [[TMP26]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP28:%.*]] = shufflevector <2 x i16> [[TMP15]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP29:%.*]] = shufflevector <8 x i16> [[TMP27]], <8 x i16> [[TMP28]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; GFX9-NEXT: [[RES:%.*]] = add <8 x i16> [[TMP22]], [[TMP29]]
+; GFX9-NEXT: [[VEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR2]], align 16
+; GFX9-NEXT: [[BVEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR2]], align 16
+; GFX9-NEXT: [[NEWADD]] = add i32 [[ADD]], 1
+; GFX9-NEXT: [[COND:%.*]] = icmp sgt i32 [[NEWADD]], [[IN]]
+; GFX9-NEXT: [[TMP30]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP31]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP32]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT: [[TMP33]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT: [[TMP34]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP35]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP36]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT: [[TMP37]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT: br i1 [[COND]], label %[[DO_BODY]], label %[[END:.*]]
+; GFX9: [[END]]:
+; GFX9-NEXT: ret void
+;
+entry:
+ %vec1 = load <8 x i16>, ptr addrspace(1) %aptr1
+ %el0 = extractelement <8 x i16> %vec1, i64 0
+ %el1 = extractelement <8 x i16> %vec1, i64 1
+ %el2 = extractelement <8 x i16> %vec1, i64 2
+ %el3 = extractelement <8 x i16> %vec1, i64 3
+ %el4 = extractelement <8 x i16> %vec1, i64 4
+ %el5 = extractelement <8 x i16> %vec1, i64 5
+ %el6 = extractelement <8 x i16> %vec1, i64 6
+ %el7 = extractelement <8 x i16> %vec1, i64 7
+ %bvec1 = load <8 x i16>, ptr addrspace(1) %bptr1
+ %bel0 = extractelement <8 x i16> %bvec1, i64 0
+ %bel1 = extractelement <8 x i16> %bvec1, i64 1
+ %bel2 = extractelement <8 x i16> %bvec1, i64 2
+ %bel3 = extractelement <8 x i16> %bvec1, i64 3
+ %bel4 = extractelement <8 x i16> %bvec1, i64 4
+ %bel5 = extractelement <8 x i16> %bvec1, i64 5
+ %bel6 = extractelement <8 x i16> %bvec1, i64 6
+ %bel7 = extractelement <8 x i16> %bvec1, i64 7
+ br label %do.body
+
+do.body:
+ %a_thread_buf1 = phi i16 [%el1, %entry], [%newel1, %do.body]
+ %a_thread_buf2 = phi i16 [%el2, %entry], [%newel2, %do.body]
+ %a_thread_buf3 = phi i16 [%el3, %entry], [%newel3, %do.body]
+ %a_thread_buf4 = phi i16 [%el4, %entry], [%newel4, %do.body]
+ %a_thread_buf5 = phi i16 [%el5, %entry], [%newel5, %do.body]
+ %a_thread_buf6 = phi i16 [%el6, %entry], [%newel6, %do.body]
+ %a_thread_buf7 = phi i16 [%el7, %entry], [%newel7, %do.body]
+ %b_thread_buf1 = phi i16 [%bel1, %entry], [%bnewel1, %do.body]
+ %b_thread_buf2 = phi i16 [%bel2, %entry], [%bnewel2, %do.body]
+ %b_thread_buf3 = phi i16 [%bel3, %entry], [%bnewel3, %do.body]
+ %b_thread_buf4 = phi i16 [%bel4, %entry], [%bnewel4, %do.body]
+ %b_thread_buf5 = phi i16 [%bel5, %entry], [%bnewel5, %do.body]
+ %b_thread_buf6 = phi i16 [%bel6, %entry], [%bnewel6, %do.body]
+ %b_thread_buf7 = phi i16 [%bel7, %entry], [%bnewel7, %do.body]
+ %add = phi i32 [0, %entry], [%newadd, %do.body]
+ %a_thread_buf0 = phi i16 [%el0, %entry], [%newel0, %do.body]
+ %b_thread_buf0 = phi i16 [%bel0, %entry], [%bnewel0, %do.body]
+ %a_thread_vec0 = insertelement <8 x i16> poison, i16 %a_thread_buf0, i64 0
+ %a_thread_vec1 = insertelement <8 x i16> %a_thread_vec0, i16 %a_thread_buf1, i64 1
+ %a_thread_vec2 = insertelement <8 x i16> %a_thread_vec1, i16 %a_thread_buf2, i64 2
+ %a_thread_vec3 = insertelement <8 x i16> %a_thread_vec2, i16 %a_thread_buf3, i64 3
+ %a_thread_vec4 = insertelement <8 x i16> %a_thread_vec3, i16 %a_thread_buf4, i64 4
+ %a_thread_vec5 = insertelement <8 x i16> %a_thread_vec4, i16 %a_thread_buf5, i64 5
+ %a_thread_vec6 = insertelement <8 x i16> %a_thread_vec5, i16 %a_thread_buf6, i64 6
+ %a_thread_vec7 = insertelement <8 x i16> %a_thread_vec6, i16 %a_thread_buf7, i64 7
+ %b_thread_vec0 = insertelement <8 x i16> poison, i16 %b_thread_buf0, i64 0
+ %b_thread_vec1 = insertelement <8 x i16> %b_thread_vec0, i16 %b_thread_buf1, i64 1
+ %b_thread_vec2 = insertelement <8 x i16> %b_thread_vec1, i16 %b_thread_buf2, i64 2
+ %b_thread_vec3 = insertelement <8 x i16> %b_thread_vec2, i16 %b_thread_buf3, i64 3
+ %b_thread_vec4 = insertelement <8 x i16> %b_thread_vec3, i16 %b_thread_buf4, i64 4
+ %b_thread_vec5 = insertelement <8 x i16> %b_thread_vec4, i16 %b_thread_buf5, i64 5
+ %b_thread_vec6 = insertelement <8 x i16> %b_thread_vec5, i16 %b_thread_buf6, i64 6
+ %b_thread_vec7 = insertelement <8 x i16> %b_thread_vec6, i16 %b_thread_buf7, i64 7
+ %res = add <8 x i16> %a_thread_vec7, %b_thread_vec7
+ %vec2 = load <8 x i16>, ptr addrspace(1) %aptr2
+ %newel0 = extractelement <8 x i16> %vec2, i64 0
+ %newel1 = extractelement <8 x i16> %vec2, i64 1
+ %newel2 = extractelement <8 x i16> %vec2, i64 2
+ %newel3 = extractelement <8 x i16> %vec2, i64 3
+ %newel4 = extractelement <8 x i16> %vec2, i64 4
+ %newel5 = extractelement <8 x i16> %vec2, i64 5
+ %newel6 = extractelement <8 x i16> %vec2, i64 6
+ %newel7 = extractelement <8 x i16> %vec2, i64 7
+ %bvec2 = load <8 x i16>, ptr addrspace(1) %bptr2
+ %bnewel0 = extractelement <8 x i16> %bvec2, i64 0
+ %bnewel1 = extractelement <8 x i16> %bvec2, i64 1
+ %bnewel2 = extractelement <8 x i16> %bvec2, i64 2
+ %bnewel3 = extractelement <8 x i16> %bvec2, i64 3
+ %bnewel4 = extractelement <8 x i16> %bvec2, i64 4
+ %bnewel5 = extractelement <8 x i16> %bvec2, i64 5
+ %bnewel6 = extractelement <8 x i16> %bvec2, i64 6
+ %bnewel7 = extractelement <8 x i16> %bvec2, i64 7
+ %newadd = add i32 %add, 1
+ %cond = icmp sgt i32 %newadd, %in
+ br i1 %cond, label %do.body, label %end
+
+end:
+ ret void
+}
+
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 3b63c1e35610f..a3a4ab948519f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -49,19 +49,19 @@ bb1:
define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) {
; CHECK-LABEL: @phis_reverse(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
; CHECK: bb0:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
-; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP2]], [[ENTRY]] ], [ [[TMP9]], [[BB0]] ]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x half> [[TMP8]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index dd7a21198ac1f..651f565412830 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,7 +141,7 @@ define ptr @test4() {
; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 0>
+; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
; POWEROF2-NEXT: br label [[TMP8:%.*]]
@@ -156,10 +156,10 @@ define ptr @test4() {
; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]]
; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00
+; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
+; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
|
@llvm/pr-subscribers-backend-amdgpu Author: Jeffrey Byrnes (jrbyrnes) ChangesConsidering the PHIs in order of element extracted can lead to better shuffles Full diff: https://github.com/llvm/llvm-project/pull/131229.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a9f61d7a9798a..46677db052897 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22687,8 +22687,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (NodeI1 != NodeI2)
return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
- if (S && !S.isAltShuffle())
+ if (S && !S.isAltShuffle()) {
+ if (!isa<ExtractElementInst>(I1) || !isa<ExtractElementInst>(I2))
+ continue;
+
+ auto E1 = cast<ExtractElementInst>(I1);
+ auto E2 = cast<ExtractElementInst>(I2);
+ // Sort on ExtractElementInsts primarily by vector operands. Prefer
+ // program order of the vector operands
+ if (E1->getVectorOperand() != E2->getVectorOperand()) {
+ Instruction *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
+ Instruction *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
+ if (!V1 || !V2)
+ continue;
+ if (V1->getParent() != V2->getParent())
+ continue;
+ return V1->comesBefore(V2);
+ }
+ // If we have the same vector operand, try to sort by constant index
+ auto Id1 = E1->getIndexOperand();
+ auto Id2 = E2->getIndexOperand();
+ // Bring constants to the top
+ if (isa<ConstantInt>(Id1) && !isa<ConstantInt>(Id2))
+ return true;
+ if (!isa<ConstantInt>(Id1) && isa<ConstantInt>(Id2))
+ return false;
+ if (isa<ConstantInt>(Id1) && isa<ConstantInt>(Id2)) {
+ auto C1 = cast<ConstantInt>(Id1);
+ auto C2 = cast<ConstantInt>(Id2);
+ // First elements first
+ return C1->getValue().getZExtValue() <
+ C2->getValue().getZExtValue();
+ }
+
continue;
+ }
return I1->getOpcode() < I2->getOpcode();
}
if (I1)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll
new file mode 100644
index 0000000000000..c585a7f08ad0c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=GFX9
+
+define protected amdgpu_kernel void @myfun(i32 %in, ptr addrspace(1) %aptr1, ptr addrspace(1) %bptr1, ptr addrspace(1) %aptr2, ptr addrspace(1) %bptr2) {
+; GFX9-LABEL: define protected amdgpu_kernel void @myfun(
+; GFX9-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[APTR1:%.*]], ptr addrspace(1) [[BPTR1:%.*]], ptr addrspace(1) [[APTR2:%.*]], ptr addrspace(1) [[BPTR2:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX9-NEXT: [[ENTRY:.*]]:
+; GFX9-NEXT: [[VEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR1]], align 16
+; GFX9-NEXT: [[BVEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR1]], align 16
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT: br label %[[DO_BODY:.*]]
+; GFX9: [[DO_BODY]]:
+; GFX9-NEXT: [[ADD:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEWADD:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP31:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP33:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP36:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP15:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP37:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT: [[TMP16:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP17:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP18:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP19:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP18]], <8 x i16> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP21:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP20]], <8 x i16> [[TMP21]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; GFX9-NEXT: [[TMP23:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP26:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP27:%.*]] = shufflevector <8 x i16> [[TMP25]], <8 x i16> [[TMP26]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP28:%.*]] = shufflevector <2 x i16> [[TMP15]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT: [[TMP29:%.*]] = shufflevector <8 x i16> [[TMP27]], <8 x i16> [[TMP28]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; GFX9-NEXT: [[RES:%.*]] = add <8 x i16> [[TMP22]], [[TMP29]]
+; GFX9-NEXT: [[VEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR2]], align 16
+; GFX9-NEXT: [[BVEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR2]], align 16
+; GFX9-NEXT: [[NEWADD]] = add i32 [[ADD]], 1
+; GFX9-NEXT: [[COND:%.*]] = icmp sgt i32 [[NEWADD]], [[IN]]
+; GFX9-NEXT: [[TMP30]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP31]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP32]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT: [[TMP33]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT: [[TMP34]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP35]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP36]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT: [[TMP37]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT: br i1 [[COND]], label %[[DO_BODY]], label %[[END:.*]]
+; GFX9: [[END]]:
+; GFX9-NEXT: ret void
+;
+entry:
+ %vec1 = load <8 x i16>, ptr addrspace(1) %aptr1
+ %el0 = extractelement <8 x i16> %vec1, i64 0
+ %el1 = extractelement <8 x i16> %vec1, i64 1
+ %el2 = extractelement <8 x i16> %vec1, i64 2
+ %el3 = extractelement <8 x i16> %vec1, i64 3
+ %el4 = extractelement <8 x i16> %vec1, i64 4
+ %el5 = extractelement <8 x i16> %vec1, i64 5
+ %el6 = extractelement <8 x i16> %vec1, i64 6
+ %el7 = extractelement <8 x i16> %vec1, i64 7
+ %bvec1 = load <8 x i16>, ptr addrspace(1) %bptr1
+ %bel0 = extractelement <8 x i16> %bvec1, i64 0
+ %bel1 = extractelement <8 x i16> %bvec1, i64 1
+ %bel2 = extractelement <8 x i16> %bvec1, i64 2
+ %bel3 = extractelement <8 x i16> %bvec1, i64 3
+ %bel4 = extractelement <8 x i16> %bvec1, i64 4
+ %bel5 = extractelement <8 x i16> %bvec1, i64 5
+ %bel6 = extractelement <8 x i16> %bvec1, i64 6
+ %bel7 = extractelement <8 x i16> %bvec1, i64 7
+ br label %do.body
+
+do.body:
+ %a_thread_buf1 = phi i16 [%el1, %entry], [%newel1, %do.body]
+ %a_thread_buf2 = phi i16 [%el2, %entry], [%newel2, %do.body]
+ %a_thread_buf3 = phi i16 [%el3, %entry], [%newel3, %do.body]
+ %a_thread_buf4 = phi i16 [%el4, %entry], [%newel4, %do.body]
+ %a_thread_buf5 = phi i16 [%el5, %entry], [%newel5, %do.body]
+ %a_thread_buf6 = phi i16 [%el6, %entry], [%newel6, %do.body]
+ %a_thread_buf7 = phi i16 [%el7, %entry], [%newel7, %do.body]
+ %b_thread_buf1 = phi i16 [%bel1, %entry], [%bnewel1, %do.body]
+ %b_thread_buf2 = phi i16 [%bel2, %entry], [%bnewel2, %do.body]
+ %b_thread_buf3 = phi i16 [%bel3, %entry], [%bnewel3, %do.body]
+ %b_thread_buf4 = phi i16 [%bel4, %entry], [%bnewel4, %do.body]
+ %b_thread_buf5 = phi i16 [%bel5, %entry], [%bnewel5, %do.body]
+ %b_thread_buf6 = phi i16 [%bel6, %entry], [%bnewel6, %do.body]
+ %b_thread_buf7 = phi i16 [%bel7, %entry], [%bnewel7, %do.body]
+ %add = phi i32 [0, %entry], [%newadd, %do.body]
+ %a_thread_buf0 = phi i16 [%el0, %entry], [%newel0, %do.body]
+ %b_thread_buf0 = phi i16 [%bel0, %entry], [%bnewel0, %do.body]
+ %a_thread_vec0 = insertelement <8 x i16> poison, i16 %a_thread_buf0, i64 0
+ %a_thread_vec1 = insertelement <8 x i16> %a_thread_vec0, i16 %a_thread_buf1, i64 1
+ %a_thread_vec2 = insertelement <8 x i16> %a_thread_vec1, i16 %a_thread_buf2, i64 2
+ %a_thread_vec3 = insertelement <8 x i16> %a_thread_vec2, i16 %a_thread_buf3, i64 3
+ %a_thread_vec4 = insertelement <8 x i16> %a_thread_vec3, i16 %a_thread_buf4, i64 4
+ %a_thread_vec5 = insertelement <8 x i16> %a_thread_vec4, i16 %a_thread_buf5, i64 5
+ %a_thread_vec6 = insertelement <8 x i16> %a_thread_vec5, i16 %a_thread_buf6, i64 6
+ %a_thread_vec7 = insertelement <8 x i16> %a_thread_vec6, i16 %a_thread_buf7, i64 7
+ %b_thread_vec0 = insertelement <8 x i16> poison, i16 %b_thread_buf0, i64 0
+ %b_thread_vec1 = insertelement <8 x i16> %b_thread_vec0, i16 %b_thread_buf1, i64 1
+ %b_thread_vec2 = insertelement <8 x i16> %b_thread_vec1, i16 %b_thread_buf2, i64 2
+ %b_thread_vec3 = insertelement <8 x i16> %b_thread_vec2, i16 %b_thread_buf3, i64 3
+ %b_thread_vec4 = insertelement <8 x i16> %b_thread_vec3, i16 %b_thread_buf4, i64 4
+ %b_thread_vec5 = insertelement <8 x i16> %b_thread_vec4, i16 %b_thread_buf5, i64 5
+ %b_thread_vec6 = insertelement <8 x i16> %b_thread_vec5, i16 %b_thread_buf6, i64 6
+ %b_thread_vec7 = insertelement <8 x i16> %b_thread_vec6, i16 %b_thread_buf7, i64 7
+ %res = add <8 x i16> %a_thread_vec7, %b_thread_vec7
+ %vec2 = load <8 x i16>, ptr addrspace(1) %aptr2
+ %newel0 = extractelement <8 x i16> %vec2, i64 0
+ %newel1 = extractelement <8 x i16> %vec2, i64 1
+ %newel2 = extractelement <8 x i16> %vec2, i64 2
+ %newel3 = extractelement <8 x i16> %vec2, i64 3
+ %newel4 = extractelement <8 x i16> %vec2, i64 4
+ %newel5 = extractelement <8 x i16> %vec2, i64 5
+ %newel6 = extractelement <8 x i16> %vec2, i64 6
+ %newel7 = extractelement <8 x i16> %vec2, i64 7
+ %bvec2 = load <8 x i16>, ptr addrspace(1) %bptr2
+ %bnewel0 = extractelement <8 x i16> %bvec2, i64 0
+ %bnewel1 = extractelement <8 x i16> %bvec2, i64 1
+ %bnewel2 = extractelement <8 x i16> %bvec2, i64 2
+ %bnewel3 = extractelement <8 x i16> %bvec2, i64 3
+ %bnewel4 = extractelement <8 x i16> %bvec2, i64 4
+ %bnewel5 = extractelement <8 x i16> %bvec2, i64 5
+ %bnewel6 = extractelement <8 x i16> %bvec2, i64 6
+ %bnewel7 = extractelement <8 x i16> %bvec2, i64 7
+ %newadd = add i32 %add, 1
+ %cond = icmp sgt i32 %newadd, %in
+ br i1 %cond, label %do.body, label %end
+
+end:
+ ret void
+}
+
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 3b63c1e35610f..a3a4ab948519f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -49,19 +49,19 @@ bb1:
define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) {
; CHECK-LABEL: @phis_reverse(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
; CHECK: bb0:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
-; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP2]], [[ENTRY]] ], [ [[TMP9]], [[BB0]] ]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x half> [[TMP8]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index dd7a21198ac1f..651f565412830 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,7 +141,7 @@ define ptr @test4() {
; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 0>
+; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
; POWEROF2-NEXT: br label [[TMP8:%.*]]
@@ -156,10 +156,10 @@ define ptr @test4() {
; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]]
; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00
+; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
+; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
|
Precommit test for llvm/llvm-project#131229
Change-Id: I62668ebc355c7746ec5ef69249573fe87e1343cc
I'm still studying the code, but I'm wondering whether BoUpSLP::getReorderingData should be handling this case? It has its own phi node sorting, and indeed seems to be properly detecting identity order for the missed optimisation case here:
but, the "no need to reorder" case fires, and so the shuffles are left as-is rather than being removed. |
Generally speaking, implementing it in getReorderingData should be enough, but having it here as the first step also should be fine |
Change-Id: I5bb81e7e3177f3cf196e682786a3de966a7a0014
Change-Id: I8b15558af0c8b23ff8c769d1a01323a87068f6ef
Change-Id: I666a35830bc431b763474dcbb2d7ecae58768a45
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
Okay the bot failures are not caused by this patch and just exist in the slightly older version of llvm that this branch uses. Confirmed locally that the bot fails are fixed by 03614b9 . Thanks for review |
Precommit test for llvm#131229
Considering the PHIs in order of element extracted can lead to better shuffles