Commit 344f594

[X86] combineTargetShuffle - fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y) (#141579)
Move VZEXT_MOVL nodes up through shift nodes. We should be trying harder to move VZEXT_MOVL towards any associated SCALAR_TO_VECTOR nodes to make use of MOVD/Q implicit zeroing of upper elements. Fixes #141475
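The fold is safe because VZEXT_MOVL keeps only element 0 and zeroes the upper elements, while a uniform per-lane shift maps zero lanes back to zero lanes, so both operation orders yield the same vector. Below is a minimal standalone C++ sketch of that identity for the VSHLI case; vzmovl() and vshli() are illustrative models of the DAG nodes, not LLVM APIs:

// A toy model of 4 x i32 lanes to check the rewrite
// (vzmovl (shift x, y)) -> (shift (vzmovl x), y).
#include <array>
#include <cassert>
#include <cstdint>

using Vec4 = std::array<uint32_t, 4>;

// Model of X86ISD::VZEXT_MOVL: keep element 0, zero the upper elements.
static Vec4 vzmovl(const Vec4 &V) { return {V[0], 0, 0, 0}; }

// Model of X86ISD::VSHLI: shift every lane left by the same immediate.
static Vec4 vshli(Vec4 V, unsigned Amt) {
  for (uint32_t &Lane : V)
    Lane <<= Amt;
  return V;
}

int main() {
  Vec4 X = {0x12345678u, 0xDEADBEEFu, 0xCAFEBABEu, 0x0BADF00Du};
  // Shifting a zero lane leaves it zero, so hoisting the vzmovl above
  // the shift never changes the result.
  for (unsigned Amt = 0; Amt < 32; ++Amt)
    assert(vzmovl(vshli(X, Amt)) == vshli(vzmovl(X), Amt));
}

The same argument covers VSHL/VSRL/VSRLI/VSRA/VSRAI: logical and arithmetic shifts of an all-zero lane are still zero.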
1 parent 3e18216 commit 344f594

File tree

5 files changed: +39 −66 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 0 deletions
@@ -42368,6 +42368,20 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
   case X86ISD::VZEXT_MOVL: {
     SDValue N0 = N.getOperand(0);
 
+    // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
+    // Zeroing out the upper elements means we're just shifting a zero value.
+    // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
+    // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
+    if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
+        N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
+        N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
+      if (N0.hasOneUse())
+        return DAG.getNode(
+            N0.getOpcode(), DL, VT,
+            DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
+            N0.getOperand(1));
+    }
+
     // If this a vzmovl of a full vector load, replace it with a vzload, unless
     // the load is volatile.
     if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {

llvm/test/CodeGen/X86/codegen-no-uselist-constantdata.ll

Lines changed: 2 additions & 4 deletions
@@ -36,10 +36,8 @@ define <16 x i8> @load_null_offset() {
 ; CHECK-LABEL: load_null_offset:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movzbl 11, %eax
-; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: pslld $8, %xmm1
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: pslld $8, %xmm0
 ; CHECK-NEXT: retq
 %gep.null = getelementptr i8, ptr null, i64 11
 %load = load i8, ptr %gep.null, align 1

llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll

Lines changed: 9 additions & 10 deletions
@@ -147,15 +147,14 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
-; SSE2-NEXT: pslld $10, %xmm0
-; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
-; SSE2-NEXT: orps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT: pslld $10, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax

@@ -175,9 +174,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; SSE41-NEXT: pand %xmm1, %xmm2
 ; SSE41-NEXT: psrld $1, %xmm2
 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: pslld $10, %xmm0
 ; SSE41-NEXT: pxor %xmm3, %xmm3
 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; SSE41-NEXT: pslld $10, %xmm3
 ; SSE41-NEXT: por %xmm2, %xmm3
 ; SSE41-NEXT: pand %xmm1, %xmm3
 ; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3

@@ -200,9 +199,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7]
-; AVX1-NEXT: vpslld $10, %xmm0, %xmm0
 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX1-NEXT: vpslld $10, %xmm0, %xmm0
 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0

llvm/test/CodeGen/X86/vec_insert-5.ll

Lines changed: 2 additions & 2 deletions
@@ -10,16 +10,16 @@ define void @t1(i32 %a, ptr %P) nounwind {
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; X86-NEXT: pslld $12, %xmm0
-; X86-NEXT: psllq $32, %xmm0
 ; X86-NEXT: movq %xmm0, (%eax)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: t1:
 ; X64: # %bb.0:
 ; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: pslld $12, %xmm0
 ; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: pslld $12, %xmm0
 ; X64-NEXT: movq %xmm0, (%rsi)
 ; X64-NEXT: retq
 %tmp12 = shl i32 %a, 12

llvm/test/CodeGen/X86/vector-shuffle-combining.ll

Lines changed: 12 additions & 50 deletions
@@ -3547,57 +3547,19 @@ define <16 x i8> @PR107289(<16 x i8> %0) {
 }
 
 define <8 x i16> @PR141475(i32 %in) {
-; SSE2-LABEL: PR141475:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pslld $1, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR141475:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pslld $1, %xmm0
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR141475:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pslld $1, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: PR141475:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: PR141475:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovd %edi, %xmm0
-; AVX2-SLOW-NEXT: vpslld $1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: retq
+; SSE-LABEL: PR141475:
+; SSE: # %bb.0:
+; SSE-NEXT: movd %edi, %xmm0
+; SSE-NEXT: pslld $1, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: retq
 ;
-; AVX2-FAST-LABEL: PR141475:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovd %edi, %xmm0
-; AVX2-FAST-NEXT: vpslld $1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: PR141475:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX-NEXT: retq
 %mul = shl i32 %in, 1
 %vecinit = insertelement <4 x i32> zeroinitializer, i32 %mul, i64 0
 %cast = bitcast <4 x i32> %vecinit to <8 x i16>
