[SLP] Sort PHIs by ExtractElements when relevant #131229

jrbyrnes · 2025-03-13T22:09:25Z

Considering the PHIs in the order of the elements extracted by their incoming extractelement instructions can lead to better shuffles.

llvmbot · 2025-03-13T22:09:57Z

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Jeffrey Byrnes (jrbyrnes)

Changes

Considering the PHIs in the order of the elements extracted by their incoming extractelement instructions can lead to better shuffles.

Full diff: https://github.com/llvm/llvm-project/pull/131229.diff

4 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+34-1)
(added) llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll (+142)
(modified) llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll (+7-7)
(modified) llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll (+4-4)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a9f61d7a9798a..46677db052897 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22687,8 +22687,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
           if (NodeI1 != NodeI2)
             return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
           InstructionsState S = getSameOpcode({I1, I2}, *TLI);
-          if (S && !S.isAltShuffle())
+          if (S && !S.isAltShuffle()) {
+            if (!isa<ExtractElementInst>(I1) || !isa<ExtractElementInst>(I2))
+              continue;
+
+            auto E1 = cast<ExtractElementInst>(I1);
+            auto E2 = cast<ExtractElementInst>(I2);
+            // Sort on ExtractElementInsts primarily by vector operands. Prefer
+            // program order of the vector operands
+            if (E1->getVectorOperand() != E2->getVectorOperand()) {
+              Instruction *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
+              Instruction *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
+              if (!V1 || !V2)
+                continue;
+              if (V1->getParent() != V2->getParent())
+                continue;
+              return V1->comesBefore(V2);
+            }
+            // If we have the same vector operand, try to sort by constant index
+            auto Id1 = E1->getIndexOperand();
+            auto Id2 = E2->getIndexOperand();
+            // Bring constants to the top
+            if (isa<ConstantInt>(Id1) && !isa<ConstantInt>(Id2))
+              return true;
+            if (!isa<ConstantInt>(Id1) && isa<ConstantInt>(Id2))
+              return false;
+            if (isa<ConstantInt>(Id1) && isa<ConstantInt>(Id2)) {
+              auto C1 = cast<ConstantInt>(Id1);
+              auto C2 = cast<ConstantInt>(Id2);
+              // First elements first
+              return C1->getValue().getZExtValue() <
+                     C2->getValue().getZExtValue();
+            }
+
             continue;
+          }
           return I1->getOpcode() < I2->getOpcode();
         }
         if (I1)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll
new file mode 100644
index 0000000000000..c585a7f08ad0c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=slp-vectorizer -S  | FileCheck %s --check-prefix=GFX9
+
+define protected amdgpu_kernel void @myfun(i32 %in, ptr addrspace(1) %aptr1, ptr addrspace(1) %bptr1, ptr addrspace(1) %aptr2, ptr addrspace(1) %bptr2)  {
+; GFX9-LABEL: define protected amdgpu_kernel void @myfun(
+; GFX9-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[APTR1:%.*]], ptr addrspace(1) [[BPTR1:%.*]], ptr addrspace(1) [[APTR2:%.*]], ptr addrspace(1) [[BPTR2:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX9-NEXT:  [[ENTRY:.*]]:
+; GFX9-NEXT:    [[VEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR1]], align 16
+; GFX9-NEXT:    [[BVEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR1]], align 16
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT:    br label %[[DO_BODY:.*]]
+; GFX9:       [[DO_BODY]]:
+; GFX9-NEXT:    [[ADD:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEWADD:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP31:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP33:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP36:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP15:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP37:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP18]], <8 x i16> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP20]], <8 x i16> [[TMP21]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; GFX9-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i16> [[TMP25]], <8 x i16> [[TMP26]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i16> [[TMP15]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP29:%.*]] = shufflevector <8 x i16> [[TMP27]], <8 x i16> [[TMP28]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; GFX9-NEXT:    [[RES:%.*]] = add <8 x i16> [[TMP22]], [[TMP29]]
+; GFX9-NEXT:    [[VEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR2]], align 16
+; GFX9-NEXT:    [[BVEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR2]], align 16
+; GFX9-NEXT:    [[NEWADD]] = add i32 [[ADD]], 1
+; GFX9-NEXT:    [[COND:%.*]] = icmp sgt i32 [[NEWADD]], [[IN]]
+; GFX9-NEXT:    [[TMP30]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP31]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP32]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT:    [[TMP33]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT:    [[TMP34]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP35]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP36]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT:    [[TMP37]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT:    br i1 [[COND]], label %[[DO_BODY]], label %[[END:.*]]
+; GFX9:       [[END]]:
+; GFX9-NEXT:    ret void
+;
+entry:
+  %vec1 = load <8 x i16>, ptr addrspace(1) %aptr1
+  %el0 = extractelement <8 x i16> %vec1, i64 0
+  %el1 = extractelement <8 x i16> %vec1, i64 1
+  %el2 = extractelement <8 x i16> %vec1, i64 2
+  %el3 = extractelement <8 x i16> %vec1, i64 3
+  %el4 = extractelement <8 x i16> %vec1, i64 4
+  %el5 = extractelement <8 x i16> %vec1, i64 5
+  %el6 = extractelement <8 x i16> %vec1, i64 6
+  %el7 = extractelement <8 x i16> %vec1, i64 7
+  %bvec1 = load <8 x i16>, ptr addrspace(1) %bptr1
+  %bel0 = extractelement <8 x i16> %bvec1, i64 0
+  %bel1 = extractelement <8 x i16> %bvec1, i64 1
+  %bel2 = extractelement <8 x i16> %bvec1, i64 2
+  %bel3 = extractelement <8 x i16> %bvec1, i64 3
+  %bel4 = extractelement <8 x i16> %bvec1, i64 4
+  %bel5 = extractelement <8 x i16> %bvec1, i64 5
+  %bel6 = extractelement <8 x i16> %bvec1, i64 6
+  %bel7 = extractelement <8 x i16> %bvec1, i64 7
+  br label %do.body
+
+do.body:
+  %a_thread_buf1 = phi i16 [%el1, %entry], [%newel1, %do.body]
+  %a_thread_buf2 = phi i16 [%el2, %entry], [%newel2, %do.body]
+  %a_thread_buf3 = phi i16 [%el3, %entry], [%newel3, %do.body]
+  %a_thread_buf4 = phi i16 [%el4, %entry], [%newel4, %do.body]
+  %a_thread_buf5 = phi i16 [%el5, %entry], [%newel5, %do.body]
+  %a_thread_buf6 = phi i16 [%el6, %entry], [%newel6, %do.body]
+  %a_thread_buf7 = phi i16 [%el7, %entry], [%newel7, %do.body]
+  %b_thread_buf1 = phi i16 [%bel1, %entry], [%bnewel1, %do.body]
+  %b_thread_buf2 = phi i16 [%bel2, %entry], [%bnewel2, %do.body]
+  %b_thread_buf3 = phi i16 [%bel3, %entry], [%bnewel3, %do.body]
+  %b_thread_buf4 = phi i16 [%bel4, %entry], [%bnewel4, %do.body]
+  %b_thread_buf5 = phi i16 [%bel5, %entry], [%bnewel5, %do.body]
+  %b_thread_buf6 = phi i16 [%bel6, %entry], [%bnewel6, %do.body]
+  %b_thread_buf7 = phi i16 [%bel7, %entry], [%bnewel7, %do.body]
+  %add = phi i32 [0, %entry], [%newadd, %do.body]
+  %a_thread_buf0 = phi i16 [%el0, %entry], [%newel0, %do.body]
+  %b_thread_buf0 = phi i16 [%bel0, %entry], [%bnewel0, %do.body]
+  %a_thread_vec0 = insertelement <8 x i16> poison, i16 %a_thread_buf0, i64 0
+  %a_thread_vec1 = insertelement <8 x i16> %a_thread_vec0, i16 %a_thread_buf1, i64 1
+  %a_thread_vec2 = insertelement <8 x i16> %a_thread_vec1, i16 %a_thread_buf2, i64 2
+  %a_thread_vec3 = insertelement <8 x i16> %a_thread_vec2, i16 %a_thread_buf3, i64 3
+  %a_thread_vec4 = insertelement <8 x i16> %a_thread_vec3, i16 %a_thread_buf4, i64 4
+  %a_thread_vec5 = insertelement <8 x i16> %a_thread_vec4, i16 %a_thread_buf5, i64 5
+  %a_thread_vec6 = insertelement <8 x i16> %a_thread_vec5, i16 %a_thread_buf6, i64 6
+  %a_thread_vec7 = insertelement <8 x i16> %a_thread_vec6, i16 %a_thread_buf7, i64 7
+  %b_thread_vec0 = insertelement <8 x i16> poison, i16 %b_thread_buf0, i64 0
+  %b_thread_vec1 = insertelement <8 x i16> %b_thread_vec0, i16 %b_thread_buf1, i64 1
+  %b_thread_vec2 = insertelement <8 x i16> %b_thread_vec1, i16 %b_thread_buf2, i64 2
+  %b_thread_vec3 = insertelement <8 x i16> %b_thread_vec2, i16 %b_thread_buf3, i64 3
+  %b_thread_vec4 = insertelement <8 x i16> %b_thread_vec3, i16 %b_thread_buf4, i64 4
+  %b_thread_vec5 = insertelement <8 x i16> %b_thread_vec4, i16 %b_thread_buf5, i64 5
+  %b_thread_vec6 = insertelement <8 x i16> %b_thread_vec5, i16 %b_thread_buf6, i64 6
+  %b_thread_vec7 = insertelement <8 x i16> %b_thread_vec6, i16 %b_thread_buf7, i64 7
+  %res = add <8 x i16> %a_thread_vec7, %b_thread_vec7
+  %vec2 = load <8 x i16>, ptr addrspace(1) %aptr2
+  %newel0 = extractelement <8 x i16> %vec2, i64 0
+  %newel1 = extractelement <8 x i16> %vec2, i64 1
+  %newel2 = extractelement <8 x i16> %vec2, i64 2
+  %newel3 = extractelement <8 x i16> %vec2, i64 3
+  %newel4 = extractelement <8 x i16> %vec2, i64 4
+  %newel5 = extractelement <8 x i16> %vec2, i64 5
+  %newel6 = extractelement <8 x i16> %vec2, i64 6
+  %newel7 = extractelement <8 x i16> %vec2, i64 7
+  %bvec2 = load <8 x i16>, ptr addrspace(1) %bptr2
+  %bnewel0 = extractelement <8 x i16> %bvec2, i64 0
+  %bnewel1 = extractelement <8 x i16> %bvec2, i64 1
+  %bnewel2 = extractelement <8 x i16> %bvec2, i64 2
+  %bnewel3 = extractelement <8 x i16> %bvec2, i64 3
+  %bnewel4 = extractelement <8 x i16> %bvec2, i64 4
+  %bnewel5 = extractelement <8 x i16> %bvec2, i64 5
+  %bnewel6 = extractelement <8 x i16> %bvec2, i64 6
+  %bnewel7 = extractelement <8 x i16> %bvec2, i64 7
+  %newadd = add i32 %add, 1
+  %cond = icmp sgt i32 %newadd, %in
+  br i1 %cond, label %do.body, label %end
+
+end:
+  ret void
+}
+
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 3b63c1e35610f..a3a4ab948519f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -49,19 +49,19 @@ bb1:
 define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2)  {
 ; CHECK-LABEL: @phis_reverse(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x half> [ [[TMP2]], [[ENTRY]] ], [ [[TMP9]], [[BB0]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x half> [[TMP8]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index dd7a21198ac1f..651f565412830 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,7 +141,7 @@ define ptr @test4() {
 ; POWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
 ; POWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; POWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 0>
+; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
 ; POWEROF2-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
 ; POWEROF2-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
 ; POWEROF2-NEXT:    br label [[TMP8:%.*]]
@@ -156,10 +156,10 @@ define ptr @test4() {
 ; POWEROF2-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; POWEROF2-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
 ; POWEROF2-NEXT:    [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]]
 ; POWEROF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00
+; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
+; POWEROF2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
 ; POWEROF2-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
 ; POWEROF2-NEXT:    [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
 ; POWEROF2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0

llvmbot · 2025-03-13T22:09:57Z

@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

Considering the PHIs in the order of the elements extracted by their incoming extractelement instructions can lead to better shuffles.

Full diff: https://github.com/llvm/llvm-project/pull/131229.diff

4 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+34-1)
(added) llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll (+142)
(modified) llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll (+7-7)
(modified) llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll (+4-4)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a9f61d7a9798a..46677db052897 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22687,8 +22687,41 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
           if (NodeI1 != NodeI2)
             return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
           InstructionsState S = getSameOpcode({I1, I2}, *TLI);
-          if (S && !S.isAltShuffle())
+          if (S && !S.isAltShuffle()) {
+            if (!isa<ExtractElementInst>(I1) || !isa<ExtractElementInst>(I2))
+              continue;
+
+            auto E1 = cast<ExtractElementInst>(I1);
+            auto E2 = cast<ExtractElementInst>(I2);
+            // Sort on ExtractElementInsts primarily by vector operands. Prefer
+            // program order of the vector operands
+            if (E1->getVectorOperand() != E2->getVectorOperand()) {
+              Instruction *V1 = dyn_cast<Instruction>(E1->getVectorOperand());
+              Instruction *V2 = dyn_cast<Instruction>(E2->getVectorOperand());
+              if (!V1 || !V2)
+                continue;
+              if (V1->getParent() != V2->getParent())
+                continue;
+              return V1->comesBefore(V2);
+            }
+            // If we have the same vector operand, try to sort by constant index
+            auto Id1 = E1->getIndexOperand();
+            auto Id2 = E2->getIndexOperand();
+            // Bring constants to the top
+            if (isa<ConstantInt>(Id1) && !isa<ConstantInt>(Id2))
+              return true;
+            if (!isa<ConstantInt>(Id1) && isa<ConstantInt>(Id2))
+              return false;
+            if (isa<ConstantInt>(Id1) && isa<ConstantInt>(Id2)) {
+              auto C1 = cast<ConstantInt>(Id1);
+              auto C2 = cast<ConstantInt>(Id2);
+              // First elements first
+              return C1->getValue().getZExtValue() <
+                     C2->getValue().getZExtValue();
+            }
+
             continue;
+          }
           return I1->getOpcode() < I2->getOpcode();
         }
         if (I1)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll
new file mode 100644
index 0000000000000..c585a7f08ad0c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=slp-vectorizer -S  | FileCheck %s --check-prefix=GFX9
+
+define protected amdgpu_kernel void @myfun(i32 %in, ptr addrspace(1) %aptr1, ptr addrspace(1) %bptr1, ptr addrspace(1) %aptr2, ptr addrspace(1) %bptr2)  {
+; GFX9-LABEL: define protected amdgpu_kernel void @myfun(
+; GFX9-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[APTR1:%.*]], ptr addrspace(1) [[BPTR1:%.*]], ptr addrspace(1) [[APTR2:%.*]], ptr addrspace(1) [[BPTR2:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX9-NEXT:  [[ENTRY:.*]]:
+; GFX9-NEXT:    [[VEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR1]], align 16
+; GFX9-NEXT:    [[BVEC1:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR1]], align 16
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[VEC1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[BVEC1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT:    br label %[[DO_BODY:.*]]
+; GFX9:       [[DO_BODY]]:
+; GFX9-NEXT:    [[ADD:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEWADD:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP31:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP33:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP36:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP15:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP37:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP18]], <8 x i16> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP20]], <8 x i16> [[TMP21]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; GFX9-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i16> [[TMP25]], <8 x i16> [[TMP26]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i16> [[TMP15]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP29:%.*]] = shufflevector <8 x i16> [[TMP27]], <8 x i16> [[TMP28]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; GFX9-NEXT:    [[RES:%.*]] = add <8 x i16> [[TMP22]], [[TMP29]]
+; GFX9-NEXT:    [[VEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[APTR2]], align 16
+; GFX9-NEXT:    [[BVEC2:%.*]] = load <8 x i16>, ptr addrspace(1) [[BPTR2]], align 16
+; GFX9-NEXT:    [[NEWADD]] = add i32 [[ADD]], 1
+; GFX9-NEXT:    [[COND:%.*]] = icmp sgt i32 [[NEWADD]], [[IN]]
+; GFX9-NEXT:    [[TMP30]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP31]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP32]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT:    [[TMP33]] = shufflevector <8 x i16> [[VEC2]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT:    [[TMP34]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP35]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP36]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; GFX9-NEXT:    [[TMP37]] = shufflevector <8 x i16> [[BVEC2]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; GFX9-NEXT:    br i1 [[COND]], label %[[DO_BODY]], label %[[END:.*]]
+; GFX9:       [[END]]:
+; GFX9-NEXT:    ret void
+;
+entry:
+  %vec1 = load <8 x i16>, ptr addrspace(1) %aptr1
+  %el0 = extractelement <8 x i16> %vec1, i64 0
+  %el1 = extractelement <8 x i16> %vec1, i64 1
+  %el2 = extractelement <8 x i16> %vec1, i64 2
+  %el3 = extractelement <8 x i16> %vec1, i64 3
+  %el4 = extractelement <8 x i16> %vec1, i64 4
+  %el5 = extractelement <8 x i16> %vec1, i64 5
+  %el6 = extractelement <8 x i16> %vec1, i64 6
+  %el7 = extractelement <8 x i16> %vec1, i64 7
+  %bvec1 = load <8 x i16>, ptr addrspace(1) %bptr1
+  %bel0 = extractelement <8 x i16> %bvec1, i64 0
+  %bel1 = extractelement <8 x i16> %bvec1, i64 1
+  %bel2 = extractelement <8 x i16> %bvec1, i64 2
+  %bel3 = extractelement <8 x i16> %bvec1, i64 3
+  %bel4 = extractelement <8 x i16> %bvec1, i64 4
+  %bel5 = extractelement <8 x i16> %bvec1, i64 5
+  %bel6 = extractelement <8 x i16> %bvec1, i64 6
+  %bel7 = extractelement <8 x i16> %bvec1, i64 7
+  br label %do.body
+
+do.body:
+  %a_thread_buf1 = phi i16 [%el1, %entry], [%newel1, %do.body]
+  %a_thread_buf2 = phi i16 [%el2, %entry], [%newel2, %do.body]
+  %a_thread_buf3 = phi i16 [%el3, %entry], [%newel3, %do.body]
+  %a_thread_buf4 = phi i16 [%el4, %entry], [%newel4, %do.body]
+  %a_thread_buf5 = phi i16 [%el5, %entry], [%newel5, %do.body]
+  %a_thread_buf6 = phi i16 [%el6, %entry], [%newel6, %do.body]
+  %a_thread_buf7 = phi i16 [%el7, %entry], [%newel7, %do.body]
+  %b_thread_buf1 = phi i16 [%bel1, %entry], [%bnewel1, %do.body]
+  %b_thread_buf2 = phi i16 [%bel2, %entry], [%bnewel2, %do.body]
+  %b_thread_buf3 = phi i16 [%bel3, %entry], [%bnewel3, %do.body]
+  %b_thread_buf4 = phi i16 [%bel4, %entry], [%bnewel4, %do.body]
+  %b_thread_buf5 = phi i16 [%bel5, %entry], [%bnewel5, %do.body]
+  %b_thread_buf6 = phi i16 [%bel6, %entry], [%bnewel6, %do.body]
+  %b_thread_buf7 = phi i16 [%bel7, %entry], [%bnewel7, %do.body]
+  %add = phi i32 [0, %entry], [%newadd, %do.body]
+  %a_thread_buf0 = phi i16 [%el0, %entry], [%newel0, %do.body]
+  %b_thread_buf0 = phi i16 [%bel0, %entry], [%bnewel0, %do.body]
+  %a_thread_vec0 = insertelement <8 x i16> poison, i16 %a_thread_buf0, i64 0
+  %a_thread_vec1 = insertelement <8 x i16> %a_thread_vec0, i16 %a_thread_buf1, i64 1
+  %a_thread_vec2 = insertelement <8 x i16> %a_thread_vec1, i16 %a_thread_buf2, i64 2
+  %a_thread_vec3 = insertelement <8 x i16> %a_thread_vec2, i16 %a_thread_buf3, i64 3
+  %a_thread_vec4 = insertelement <8 x i16> %a_thread_vec3, i16 %a_thread_buf4, i64 4
+  %a_thread_vec5 = insertelement <8 x i16> %a_thread_vec4, i16 %a_thread_buf5, i64 5
+  %a_thread_vec6 = insertelement <8 x i16> %a_thread_vec5, i16 %a_thread_buf6, i64 6
+  %a_thread_vec7 = insertelement <8 x i16> %a_thread_vec6, i16 %a_thread_buf7, i64 7
+  %b_thread_vec0 = insertelement <8 x i16> poison, i16 %b_thread_buf0, i64 0
+  %b_thread_vec1 = insertelement <8 x i16> %b_thread_vec0, i16 %b_thread_buf1, i64 1
+  %b_thread_vec2 = insertelement <8 x i16> %b_thread_vec1, i16 %b_thread_buf2, i64 2
+  %b_thread_vec3 = insertelement <8 x i16> %b_thread_vec2, i16 %b_thread_buf3, i64 3
+  %b_thread_vec4 = insertelement <8 x i16> %b_thread_vec3, i16 %b_thread_buf4, i64 4
+  %b_thread_vec5 = insertelement <8 x i16> %b_thread_vec4, i16 %b_thread_buf5, i64 5
+  %b_thread_vec6 = insertelement <8 x i16> %b_thread_vec5, i16 %b_thread_buf6, i64 6
+  %b_thread_vec7 = insertelement <8 x i16> %b_thread_vec6, i16 %b_thread_buf7, i64 7
+  %res = add <8 x i16> %a_thread_vec7, %b_thread_vec7
+  %vec2 = load <8 x i16>, ptr addrspace(1) %aptr2
+  %newel0 = extractelement <8 x i16> %vec2, i64 0
+  %newel1 = extractelement <8 x i16> %vec2, i64 1
+  %newel2 = extractelement <8 x i16> %vec2, i64 2
+  %newel3 = extractelement <8 x i16> %vec2, i64 3
+  %newel4 = extractelement <8 x i16> %vec2, i64 4
+  %newel5 = extractelement <8 x i16> %vec2, i64 5
+  %newel6 = extractelement <8 x i16> %vec2, i64 6
+  %newel7 = extractelement <8 x i16> %vec2, i64 7
+  %bvec2 = load <8 x i16>, ptr addrspace(1) %bptr2
+  %bnewel0 = extractelement <8 x i16> %bvec2, i64 0
+  %bnewel1 = extractelement <8 x i16> %bvec2, i64 1
+  %bnewel2 = extractelement <8 x i16> %bvec2, i64 2
+  %bnewel3 = extractelement <8 x i16> %bvec2, i64 3
+  %bnewel4 = extractelement <8 x i16> %bvec2, i64 4
+  %bnewel5 = extractelement <8 x i16> %bvec2, i64 5
+  %bnewel6 = extractelement <8 x i16> %bvec2, i64 6
+  %bnewel7 = extractelement <8 x i16> %bvec2, i64 7
+  %newadd = add i32 %add, 1
+  %cond = icmp sgt i32 %newadd, %in
+  br i1 %cond, label %do.body, label %end
+
+end:
+  ret void
+}
+
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 3b63c1e35610f..a3a4ab948519f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -49,19 +49,19 @@ bb1:
 define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2)  {
 ; CHECK-LABEL: @phis_reverse(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x half> [ [[TMP2]], [[ENTRY]] ], [ [[TMP9]], [[BB0]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x half> [[TMP8]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index dd7a21198ac1f..651f565412830 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,7 +141,7 @@ define ptr @test4() {
 ; POWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
 ; POWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; POWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 0>
+; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
 ; POWEROF2-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
 ; POWEROF2-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
 ; POWEROF2-NEXT:    br label [[TMP8:%.*]]
@@ -156,10 +156,10 @@ define ptr @test4() {
 ; POWEROF2-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; POWEROF2-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
 ; POWEROF2-NEXT:    [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]]
 ; POWEROF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00
+; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
+; POWEROF2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
+; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
 ; POWEROF2-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
 ; POWEROF2-NEXT:    [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
 ; POWEROF2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0

llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll

Precommit test for #131229

Precommit test for llvm/llvm-project#131229

Change-Id: I62668ebc355c7746ec5ef69249573fe87e1343cc

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

jtb20 · 2025-03-14T10:37:58Z

I'm still studying the code, but I'm wondering whether BoUpSLP::getReorderingData should be handling this case? It has its own phi node sorting, and indeed seems to be properly detecting identity order for the missed optimisation case here:

    if (IsIdentityOrder(ResOrder))
      return std::nullopt; // No need to reorder.

but the "no need to reorder" case fires, and so the shuffles are left as-is rather than being removed.

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

alexey-bataev · 2025-03-14T11:13:04Z

> I'm still studying the code, but I'm wondering whether BoUpSLP::getReorderingData should be handling this case? It has its own phi node sorting, and indeed seems to be properly detecting identity order for the missed optimisation case here:
>
>     if (IsIdentityOrder(ResOrder))
>       return std::nullopt; // No need to reorder.
>
> but, the "no need to reorder" case fires, and so the shuffles are left as-is rather than being removed.

Generally speaking, implementing it in getReorderingData should be enough, but having it here as the first step should also be fine.

Change-Id: I5bb81e7e3177f3cf196e682786a3de966a7a0014

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Change-Id: I8b15558af0c8b23ff8c769d1a01323a87068f6ef

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Change-Id: I666a35830bc431b763474dcbb2d7ecae58768a45

alexey-bataev

LG

jrbyrnes · 2025-03-17T21:19:30Z

Okay, the bot failures are not caused by this patch and just exist in the slightly older version of LLVM that this branch uses. Confirmed locally that the bot failures are fixed by 03614b9.

Thanks for the review.

Precommit test for llvm#131229

jrbyrnes requested review from alexey-bataev and jtb20 March 13, 2025 22:09

llvmbot added backend:AMDGPU vectorizers llvm:transforms labels Mar 13, 2025

alexey-bataev reviewed Mar 13, 2025

View reviewed changes

llvm/test/Transforms/SLPVectorizer/AMDGPU/extract-ordering.ll Outdated Show resolved Hide resolved

jrbyrnes mentioned this pull request Mar 13, 2025

[SLP] Precommit test #131236

Merged

jrbyrnes force-pushed the SLPPhiSort branch from e4f663b to ee90133 Compare March 13, 2025 23:02

jrbyrnes added a commit that referenced this pull request Mar 13, 2025

[SLP] Precommit test (#131236)

fc28f83

Precommit test for #131229

llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Mar 13, 2025

Automerge: [SLP] Precommit test (#131236)

3bd7e5d

Precommit test for llvm/llvm-project#131229

[SLP] Sort PHIs by ExtractElements when relevant

b450a7c

Change-Id: I62668ebc355c7746ec5ef69249573fe87e1343cc

jrbyrnes force-pushed the SLPPhiSort branch from ee90133 to b450a7c Compare March 13, 2025 23:18

arsenm reviewed Mar 14, 2025

View reviewed changes

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

alexey-bataev reviewed Mar 14, 2025

View reviewed changes

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

Review comments

0b5d6c0

Change-Id: I5bb81e7e3177f3cf196e682786a3de966a7a0014

alexey-bataev reviewed Mar 17, 2025

View reviewed changes

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Show resolved Hide resolved

Review comments 2

a8f8f42

Change-Id: I8b15558af0c8b23ff8c769d1a01323a87068f6ef

alexey-bataev reviewed Mar 17, 2025

View reviewed changes

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Outdated Show resolved Hide resolved

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp Show resolved Hide resolved

Review Comments 3

fe2623c

Change-Id: I666a35830bc431b763474dcbb2d7ecae58768a45

alexey-bataev approved these changes Mar 17, 2025

View reviewed changes

jrbyrnes merged commit 4336e5e into llvm:main Mar 17, 2025
9 of 11 checks passed

frederik-h pushed a commit to frederik-h/llvm-project that referenced this pull request Mar 18, 2025

[SLP] Precommit test (llvm#131236)

304e96e

Precommit test for llvm#131229

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[SLP] Sort PHIs by ExtractElements when relevant #131229

[SLP] Sort PHIs by ExtractElements when relevant #131229

jrbyrnes commented Mar 13, 2025

llvmbot commented Mar 13, 2025 •

edited

Loading

llvmbot commented Mar 13, 2025

jtb20 commented Mar 14, 2025

alexey-bataev commented Mar 14, 2025

alexey-bataev left a comment

jrbyrnes commented Mar 17, 2025

[SLP] Sort PHIs by ExtractElements when relevant #131229

[SLP] Sort PHIs by ExtractElements when relevant #131229

Conversation

jrbyrnes commented Mar 13, 2025

llvmbot commented Mar 13, 2025 • edited Loading

llvmbot commented Mar 13, 2025

jtb20 commented Mar 14, 2025

alexey-bataev commented Mar 14, 2025

alexey-bataev left a comment

Choose a reason for hiding this comment

jrbyrnes commented Mar 17, 2025

llvmbot commented Mar 13, 2025 •

edited

Loading