Skip to content

Commit 2f11b3a

Browse files
committed
[MachineCombiner] Don't compute the latency of transient instructions
If an MI will not generate a target instruction, we should not compute its latency. Then we can compute more precise instruction sequence cost, and get better result. Differential Revision: https://reviews.llvm.org/D129615
1 parent f48ce52 commit 2f11b3a

File tree

6 files changed

+333
-242
lines changed

6 files changed

+333
-242
lines changed

llvm/lib/CodeGen/MachineCombiner.cpp

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ class MachineCombiner : public MachineFunctionPass {
9292
bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
9393
bool combineInstructions(MachineBasicBlock *);
9494
MachineInstr *getOperandDef(const MachineOperand &MO);
95+
bool isTransientMI(const MachineInstr *MI);
9596
unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
9697
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
9798
MachineTraceMetrics::Trace BlockTrace);
@@ -158,6 +159,43 @@ MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
158159
return DefInstr;
159160
}
160161

162+
/// Return true if MI is unlikely to generate an actual target instruction.
163+
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
164+
if (!MI->isCopy())
165+
return MI->isTransient();
166+
167+
// If MI is a COPY, check if its src and dst registers can be coalesced.
168+
Register Dst = MI->getOperand(0).getReg();
169+
Register Src = MI->getOperand(1).getReg();
170+
171+
if (!MI->isFullCopy()) {
172+
// If src RC contains super registers of dst RC, it can also be coalesced.
173+
if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
174+
return false;
175+
176+
auto SrcSub = MI->getOperand(1).getSubReg();
177+
auto SrcRC = MRI->getRegClass(Src);
178+
auto DstRC = MRI->getRegClass(Dst);
179+
return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
180+
}
181+
182+
if (Src.isPhysical() && Dst.isPhysical())
183+
return Src == Dst;
184+
185+
if (Src.isVirtual() && Dst.isVirtual()) {
186+
auto SrcRC = MRI->getRegClass(Src);
187+
auto DstRC = MRI->getRegClass(Dst);
188+
return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
189+
}
190+
191+
if (Src.isVirtual())
192+
std::swap(Src, Dst);
193+
194+
// Now Src is physical register, Dst is virtual register.
195+
auto DstRC = MRI->getRegClass(Dst);
196+
return DstRC->contains(Src);
197+
}
198+
161199
/// Computes depth of instructions in vector \InsInstr.
162200
///
163201
/// \param InsInstrs is a vector of machine instructions
@@ -204,9 +242,10 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
204242
MachineInstr *DefInstr = getOperandDef(MO);
205243
if (DefInstr) {
206244
DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
207-
LatencyOp = TSchedModel.computeOperandLatency(
208-
DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
209-
InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
245+
if (!isTransientMI(DefInstr))
246+
LatencyOp = TSchedModel.computeOperandLatency(
247+
DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
248+
InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
210249
}
211250
}
212251
IDepth = std::max(IDepth, DepthOp + LatencyOp);

llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ body: |
2323
2424
...
2525
# UNPROFITABLE-LABEL: name: f1_2s
26-
# UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1
27-
# UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
26+
# UNPROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
27+
# UNPROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1
2828
#
2929
# PROFITABLE-LABEL: name: f1_2s
3030
# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
@@ -49,8 +49,8 @@ body: |
4949
5050
...
5151
# UNPROFITABLE-LABEL: name: f1_4s
52-
# UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1
53-
# UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
52+
# UNPROFITABLE: [[R1:%[0-9]+]]:fpr128 = FMULv4f32 %0, %1
53+
# UNPROFITABLE-NEXT: FSUBv4f32 killed [[R1]], %2
5454
#
5555
# PROFITABLE-LABEL: name: f1_4s
5656
# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
3+
4+
; When lowering the LLVM IR to MIR, transient COPY instructions are generated
5+
; to copy parameters from physical registers to virtual registers. Wrong
6+
; handling of transient instructions in MachineCombiner can fail the following
7+
; transformation.
8+
; A - (B + C) ==> (A - B) - C
9+
10+
; 32 bit version.
11+
define i32 @test1(i32 %a, i32 %b, i32 %c) {
12+
; CHECK-LABEL: test1:
13+
; CHECK: // %bb.0: // %entry
14+
; CHECK-NEXT: sub w8, w2, w0
15+
; CHECK-NEXT: eor w9, w1, w0, lsl #8
16+
; CHECK-NEXT: sub w8, w8, w9
17+
; CHECK-NEXT: eor w0, w8, w9, asr #13
18+
; CHECK-NEXT: ret
19+
entry:
20+
%shl = shl i32 %a, 8
21+
%xor = xor i32 %shl, %b
22+
%add = add i32 %xor, %a
23+
%sub = sub i32 %c, %add
24+
%shr = ashr i32 %xor, 13
25+
%xor2 = xor i32 %sub, %shr
26+
ret i32 %xor2
27+
}
28+
29+
; 64 bit version.
30+
define i64 @test2(i64 %a, i64 %b, i64 %c) {
31+
; CHECK-LABEL: test2:
32+
; CHECK: // %bb.0: // %entry
33+
; CHECK-NEXT: sub x8, x2, x0
34+
; CHECK-NEXT: eor x9, x1, x0, lsl #8
35+
; CHECK-NEXT: sub x8, x8, x9
36+
; CHECK-NEXT: eor x0, x8, x9, asr #13
37+
; CHECK-NEXT: ret
38+
entry:
39+
%shl = shl i64 %a, 8
40+
%xor = xor i64 %shl, %b
41+
%add = add i64 %xor, %a
42+
%sub = sub i64 %c, %add
43+
%shr = ashr i64 %xor, 13
44+
%xor2 = xor i64 %sub, %shr
45+
ret i64 %xor2
46+
}

llvm/test/CodeGen/AArch64/neon-mla-mls.ll

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,9 @@ define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
138138
define <8 x i8> @mls2v8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
139139
; CHECK-LABEL: mls2v8xi8:
140140
; CHECK: // %bb.0:
141-
; CHECK-NEXT: mul v0.8b, v0.8b, v1.8b
142-
; CHECK-NEXT: sub v0.8b, v0.8b, v2.8b
141+
; CHECK-NEXT: neg v2.8b, v2.8b
142+
; CHECK-NEXT: mla v2.8b, v0.8b, v1.8b
143+
; CHECK-NEXT: fmov d0, d2
143144
; CHECK-NEXT: ret
144145
%tmp1 = mul <8 x i8> %A, %B;
145146
%tmp2 = sub <8 x i8> %tmp1, %C;
@@ -149,8 +150,9 @@ define <8 x i8> @mls2v8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
149150
define <16 x i8> @mls2v16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
150151
; CHECK-LABEL: mls2v16xi8:
151152
; CHECK: // %bb.0:
152-
; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b
153-
; CHECK-NEXT: sub v0.16b, v0.16b, v2.16b
153+
; CHECK-NEXT: neg v2.16b, v2.16b
154+
; CHECK-NEXT: mla v2.16b, v0.16b, v1.16b
155+
; CHECK-NEXT: mov v0.16b, v2.16b
154156
; CHECK-NEXT: ret
155157
%tmp1 = mul <16 x i8> %A, %B;
156158
%tmp2 = sub <16 x i8> %tmp1, %C;
@@ -160,8 +162,9 @@ define <16 x i8> @mls2v16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
160162
define <4 x i16> @mls2v4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
161163
; CHECK-LABEL: mls2v4xi16:
162164
; CHECK: // %bb.0:
163-
; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
164-
; CHECK-NEXT: sub v0.4h, v0.4h, v2.4h
165+
; CHECK-NEXT: neg v2.4h, v2.4h
166+
; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h
167+
; CHECK-NEXT: fmov d0, d2
165168
; CHECK-NEXT: ret
166169
%tmp1 = mul <4 x i16> %A, %B;
167170
%tmp2 = sub <4 x i16> %tmp1, %C;
@@ -171,8 +174,9 @@ define <4 x i16> @mls2v4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
171174
define <8 x i16> @mls2v8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
172175
; CHECK-LABEL: mls2v8xi16:
173176
; CHECK: // %bb.0:
174-
; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
175-
; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h
177+
; CHECK-NEXT: neg v2.8h, v2.8h
178+
; CHECK-NEXT: mla v2.8h, v0.8h, v1.8h
179+
; CHECK-NEXT: mov v0.16b, v2.16b
176180
; CHECK-NEXT: ret
177181
%tmp1 = mul <8 x i16> %A, %B;
178182
%tmp2 = sub <8 x i16> %tmp1, %C;
@@ -182,8 +186,9 @@ define <8 x i16> @mls2v8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
182186
define <2 x i32> @mls2v2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
183187
; CHECK-LABEL: mls2v2xi32:
184188
; CHECK: // %bb.0:
185-
; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
186-
; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
189+
; CHECK-NEXT: neg v2.2s, v2.2s
190+
; CHECK-NEXT: mla v2.2s, v0.2s, v1.2s
191+
; CHECK-NEXT: fmov d0, d2
187192
; CHECK-NEXT: ret
188193
%tmp1 = mul <2 x i32> %A, %B;
189194
%tmp2 = sub <2 x i32> %tmp1, %C;
@@ -193,8 +198,9 @@ define <2 x i32> @mls2v2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
193198
define <4 x i32> @mls2v4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
194199
; CHECK-LABEL: mls2v4xi32:
195200
; CHECK: // %bb.0:
196-
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
197-
; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
201+
; CHECK-NEXT: neg v2.4s, v2.4s
202+
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
203+
; CHECK-NEXT: mov v0.16b, v2.16b
198204
; CHECK-NEXT: ret
199205
%tmp1 = mul <4 x i32> %A, %B;
200206
%tmp2 = sub <4 x i32> %tmp1, %C;

0 commit comments

Comments
 (0)