Commit 754d258

[CGP] Update MemIntrinsic alignment if possible
Previously, the MemIntrinsic alignment update only ran when shouldAlignPointerArgs() returned true, which is currently the case only for ARM targets. Updating the argument alignment attributes of memcpy/memset intrinsics when the underlying object has larger alignment can be beneficial even when CGP did not increase that alignment itself (as the test changes show), so invert the loop and the if condition.

Differential Revision: https://reviews.llvm.org/D134281
1 parent 89a86ed commit 754d258
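
In outline, the inversion looks like this (a simplified sketch of the control flow in CodeGenPrepare::optimizeCallInst, with the loop and update bodies elided; the full hunk is below):

// Before: the intrinsic update was nested inside the target guard,
// so it only ran when shouldAlignPointerArgs() returned true (ARM).
if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
  for (auto &Arg : CI->args()) {
    // ... raise alignment of underlying allocas/globals ...
  }
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
    // ... raise the intrinsic's dest/source align attributes ...
  }
}

// After: the intrinsic update runs for every target, picking up
// whatever alignment is already known for the underlying objects.
if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
  for (auto &Arg : CI->args()) {
    // ... raise alignment of underlying allocas/globals ...
  }
}
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
  // ... raise the intrinsic's dest/source align attributes ...
}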

5 files changed: 74 additions, 69 deletions


llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 13 additions & 13 deletions
@@ -2252,19 +2252,19 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
           DL->getTypeAllocSize(GV->getValueType()) >= MinSize + Offset2)
         GV->setAlignment(PrefAlign);
     }
-    // If this is a memcpy (or similar) then we may be able to improve the
-    // alignment
-    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
-      Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
-      MaybeAlign MIDestAlign = MI->getDestAlign();
-      if (!MIDestAlign || DestAlign > *MIDestAlign)
-        MI->setDestAlignment(DestAlign);
-      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
-        MaybeAlign MTISrcAlign = MTI->getSourceAlign();
-        Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
-        if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
-          MTI->setSourceAlignment(SrcAlign);
-      }
+  }
+  // If this is a memcpy (or similar) then we may be able to improve the
+  // alignment.
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+    Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
+    MaybeAlign MIDestAlign = MI->getDestAlign();
+    if (!MIDestAlign || DestAlign > *MIDestAlign)
+      MI->setDestAlignment(DestAlign);
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+      MaybeAlign MTISrcAlign = MTI->getSourceAlign();
+      Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
+      if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
+        MTI->setSourceAlignment(SrcAlign);
     }
   }
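
Taken on its own, the relocated logic is equivalent to the following standalone helper (a compilable sketch against the LLVM C++ API; the helper name is illustrative, and the in-tree pass inlines this directly in optimizeCallInst rather than calling a helper):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"        // MemIntrinsic, MemTransferInst
#include "llvm/Transforms/Utils/Local.h"  // getKnownAlignment()

using namespace llvm;

// Raise a mem intrinsic's align attributes to whatever alignment
// getKnownAlignment() can prove for the underlying objects. Attributes
// are only ever increased, never decreased.
static void upgradeMemIntrinsicAlignment(MemIntrinsic *MI,
                                         const DataLayout &DL) {
  Align DestAlign = getKnownAlignment(MI->getDest(), DL);
  MaybeAlign MIDestAlign = MI->getDestAlign();
  if (!MIDestAlign || DestAlign > *MIDestAlign)
    MI->setDestAlignment(DestAlign);
  // memcpy/memmove also carry a source alignment; plain memset does not.
  if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
    MaybeAlign MTISrcAlign = MTI->getSourceAlign();
    Align SrcAlign = getKnownAlignment(MTI->getSource(), DL);
    if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
      MTI->setSourceAlignment(SrcAlign);
  }
}

This also explains the shape of the test updates below: destinations like ptr null or ptr undef let getKnownAlignment() prove very large alignment, so the tests now take a real pointer argument to keep exercising the intended alignment paths.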

llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll

Lines changed: 6 additions & 6 deletions
@@ -19,16 +19,16 @@ define protected amdgpu_kernel void @test(i8 addrspace(1)* nocapture %ptr.coerce
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    ds_read_u8 v1, v0 offset:1
+; GCN-NEXT:    v_mov_b32_e32 v1, 2
+; GCN-NEXT:    ds_write_b8 v0, v1
 ; GCN-NEXT:    ds_read_u8 v2, v0 offset:2
+; GCN-NEXT:    ds_read_u16 v3, v0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v3, 2
-; GCN-NEXT:    ds_write_b8 v0, v3
-; GCN-NEXT:    ds_write_b8 v0, v3 offset:4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ds_write_b8 v0, v1 offset:5
 ; GCN-NEXT:    ds_write_b8 v0, v2 offset:6
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    ds_write_b16 v0, v3 offset:4
+; GCN-NEXT:    v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; GCN-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ; CHECK-LABEL: @test(

llvm/test/CodeGen/X86/mcu-abi.ll

Lines changed: 6 additions & 5 deletions
@@ -64,13 +64,14 @@ entry:
 define void @ret_large_struct(ptr noalias nocapture sret(%struct.st12_t) %agg.result, ptr byval(%struct.st12_t) nocapture readonly align 4 %r) #0 {
 ; CHECK-LABEL: ret_large_struct:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl $48, %ecx
-; CHECK-NEXT:    calll memcpy
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl $12, %ecx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    rep;movsl (%esi), %es:(%edi)
 ; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 entry:
   call void @llvm.memcpy.p0.p0.i32(ptr %agg.result, ptr %r, i32 48, i1 false)

llvm/test/CodeGen/X86/memset-2.ll

Lines changed: 10 additions & 8 deletions
@@ -1,31 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s
 
-define fastcc void @t1() nounwind {
+define fastcc void @t1(ptr nocapture %s) nounwind {
 ; CHECK-LABEL: t1:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    pushl $188
 ; CHECK-NEXT:    pushl $0
-; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr null, i8 0, i32 188, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 0, i32 188, i1 false)
   unreachable
 }
 
-define fastcc void @t2(i8 signext %c) nounwind {
+define fastcc void @t2(ptr nocapture %s, i8 signext %c) nounwind {
 ; CHECK-LABEL: t2:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    subl $12, %esp
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $76, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    pushl $76
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
+; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr undef, i8 %c, i32 76, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 %c, i32 76, i1 false)
   unreachable
 }

llvm/test/CodeGen/X86/memset64-on-x86-32.ll

Lines changed: 39 additions & 37 deletions
@@ -3,55 +3,57 @@
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_64
 
-define void @bork() nounwind {
+define void @bork(ptr nocapture align 4 %dst) nounwind {
 ; FAST-LABEL: bork:
 ; FAST:       # %bb.0:
+; FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FAST-NEXT:    xorps %xmm0, %xmm0
-; FAST-NEXT:    movups %xmm0, 64
-; FAST-NEXT:    movups %xmm0, 48
-; FAST-NEXT:    movups %xmm0, 32
-; FAST-NEXT:    movups %xmm0, 16
-; FAST-NEXT:    movups %xmm0, 0
+; FAST-NEXT:    movups %xmm0, 64(%eax)
+; FAST-NEXT:    movups %xmm0, 48(%eax)
+; FAST-NEXT:    movups %xmm0, 32(%eax)
+; FAST-NEXT:    movups %xmm0, 16(%eax)
+; FAST-NEXT:    movups %xmm0, (%eax)
 ; FAST-NEXT:    retl
 ;
 ; SLOW_32-LABEL: bork:
 ; SLOW_32:       # %bb.0:
-; SLOW_32-NEXT:    movl $0, 4
-; SLOW_32-NEXT:    movl $0, 0
-; SLOW_32-NEXT:    movl $0, 12
-; SLOW_32-NEXT:    movl $0, 8
-; SLOW_32-NEXT:    movl $0, 20
-; SLOW_32-NEXT:    movl $0, 16
-; SLOW_32-NEXT:    movl $0, 28
-; SLOW_32-NEXT:    movl $0, 24
-; SLOW_32-NEXT:    movl $0, 36
-; SLOW_32-NEXT:    movl $0, 32
-; SLOW_32-NEXT:    movl $0, 44
-; SLOW_32-NEXT:    movl $0, 40
-; SLOW_32-NEXT:    movl $0, 52
-; SLOW_32-NEXT:    movl $0, 48
-; SLOW_32-NEXT:    movl $0, 60
-; SLOW_32-NEXT:    movl $0, 56
-; SLOW_32-NEXT:    movl $0, 68
-; SLOW_32-NEXT:    movl $0, 64
-; SLOW_32-NEXT:    movl $0, 76
-; SLOW_32-NEXT:    movl $0, 72
+; SLOW_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SLOW_32-NEXT:    movl $0, 4(%eax)
+; SLOW_32-NEXT:    movl $0, (%eax)
+; SLOW_32-NEXT:    movl $0, 12(%eax)
+; SLOW_32-NEXT:    movl $0, 8(%eax)
+; SLOW_32-NEXT:    movl $0, 20(%eax)
+; SLOW_32-NEXT:    movl $0, 16(%eax)
+; SLOW_32-NEXT:    movl $0, 28(%eax)
+; SLOW_32-NEXT:    movl $0, 24(%eax)
+; SLOW_32-NEXT:    movl $0, 36(%eax)
+; SLOW_32-NEXT:    movl $0, 32(%eax)
+; SLOW_32-NEXT:    movl $0, 44(%eax)
+; SLOW_32-NEXT:    movl $0, 40(%eax)
+; SLOW_32-NEXT:    movl $0, 52(%eax)
+; SLOW_32-NEXT:    movl $0, 48(%eax)
+; SLOW_32-NEXT:    movl $0, 60(%eax)
+; SLOW_32-NEXT:    movl $0, 56(%eax)
+; SLOW_32-NEXT:    movl $0, 68(%eax)
+; SLOW_32-NEXT:    movl $0, 64(%eax)
+; SLOW_32-NEXT:    movl $0, 76(%eax)
+; SLOW_32-NEXT:    movl $0, 72(%eax)
 ; SLOW_32-NEXT:    retl
 ;
 ; SLOW_64-LABEL: bork:
 ; SLOW_64:       # %bb.0:
-; SLOW_64-NEXT:    movq $0, 72
-; SLOW_64-NEXT:    movq $0, 64
-; SLOW_64-NEXT:    movq $0, 56
-; SLOW_64-NEXT:    movq $0, 48
-; SLOW_64-NEXT:    movq $0, 40
-; SLOW_64-NEXT:    movq $0, 32
-; SLOW_64-NEXT:    movq $0, 24
-; SLOW_64-NEXT:    movq $0, 16
-; SLOW_64-NEXT:    movq $0, 8
-; SLOW_64-NEXT:    movq $0, 0
+; SLOW_64-NEXT:    movq $0, 72(%rdi)
+; SLOW_64-NEXT:    movq $0, 64(%rdi)
+; SLOW_64-NEXT:    movq $0, 56(%rdi)
+; SLOW_64-NEXT:    movq $0, 48(%rdi)
+; SLOW_64-NEXT:    movq $0, 40(%rdi)
+; SLOW_64-NEXT:    movq $0, 32(%rdi)
+; SLOW_64-NEXT:    movq $0, 24(%rdi)
+; SLOW_64-NEXT:    movq $0, 16(%rdi)
+; SLOW_64-NEXT:    movq $0, 8(%rdi)
+; SLOW_64-NEXT:    movq $0, (%rdi)
 ; SLOW_64-NEXT:    retq
-  call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 80, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 4 %dst, i8 0, i64 80, i1 false)
   ret void
 }
