Skip to content

Commit 2f11b3a

Browse files
committed
[MachineCombiner] Don't compute the latency of transient instructions
If an MI will not generate a target instruction, we should not compute its latency. Then we can compute more precise instruction sequence cost, and get better result. Differential Revision: https://reviews.llvm.org/D129615
1 parent f48ce52 commit 2f11b3a

File tree

6 files changed

+333
-242
lines changed

6 files changed

+333
-242
lines changed

llvm/lib/CodeGen/MachineCombiner.cpp

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ class MachineCombiner : public MachineFunctionPass {
9292
bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
9393
bool combineInstructions(MachineBasicBlock *);
9494
MachineInstr *getOperandDef(const MachineOperand &MO);
95+
bool isTransientMI(const MachineInstr *MI);
9596
unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
9697
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
9798
MachineTraceMetrics::Trace BlockTrace);
@@ -158,6 +159,43 @@ MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
158159
return DefInstr;
159160
}
160161

162+
/// Return true if MI is unlikely to generate an actual target instruction.
163+
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
164+
if (!MI->isCopy())
165+
return MI->isTransient();
166+
167+
// If MI is a COPY, check if its src and dst registers can be coalesced.
168+
Register Dst = MI->getOperand(0).getReg();
169+
Register Src = MI->getOperand(1).getReg();
170+
171+
if (!MI->isFullCopy()) {
172+
// If src RC contains super registers of dst RC, it can also be coalesced.
173+
if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
174+
return false;
175+
176+
auto SrcSub = MI->getOperand(1).getSubReg();
177+
auto SrcRC = MRI->getRegClass(Src);
178+
auto DstRC = MRI->getRegClass(Dst);
179+
return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
180+
}
181+
182+
if (Src.isPhysical() && Dst.isPhysical())
183+
return Src == Dst;
184+
185+
if (Src.isVirtual() && Dst.isVirtual()) {
186+
auto SrcRC = MRI->getRegClass(Src);
187+
auto DstRC = MRI->getRegClass(Dst);
188+
return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
189+
}
190+
191+
if (Src.isVirtual())
192+
std::swap(Src, Dst);
193+
194+
// Now Src is physical register, Dst is virtual register.
195+
auto DstRC = MRI->getRegClass(Dst);
196+
return DstRC->contains(Src);
197+
}
198+
161199
/// Computes depth of instructions in vector \InsInstr.
162200
///
163201
/// \param InsInstrs is a vector of machine instructions
@@ -204,9 +242,10 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
204242
MachineInstr *DefInstr = getOperandDef(MO);
205243
if (DefInstr) {
206244
DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
207-
LatencyOp = TSchedModel.computeOperandLatency(
208-
DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
209-
InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
245+
if (!isTransientMI(DefInstr))
246+
LatencyOp = TSchedModel.computeOperandLatency(
247+
DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
248+
InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
210249
}
211250
}
212251
IDepth = std::max(IDepth, DepthOp + LatencyOp);

llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ body: |
2323
2424
...
2525
# UNPROFITABLE-LABEL: name: f1_2s
26-
# UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1
27-
# UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
26+
# UNPROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
27+
# UNPROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1
2828
#
2929
# PROFITABLE-LABEL: name: f1_2s
3030
# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
@@ -49,8 +49,8 @@ body: |
4949
5050
...
5151
# UNPROFITABLE-LABEL: name: f1_4s
52-
# UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1
53-
# UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
52+
# UNPROFITABLE: [[R1:%[0-9]+]]:fpr128 = FMULv4f32 %0, %1
53+
# UNPROFITABLE-NEXT: FSUBv4f32 killed [[R1]], %2
5454
#
5555
# PROFITABLE-LABEL: name: f1_4s
5656
# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
3+
4+
; When lowering the LLVM IR to MIR, transient COPY instructions are generated
5+
; to copy parameters from physical registers to virtual registers. Wrong
6+
; handling of transient instructions in MachineCombiner can fail the following
7+
; transformation.
8+
; A - (B + C) ==> (A - B) - C
9+
10+
; 32 bit version.
11+
define i32 @test1(i32 %a, i32 %b, i32 %c) {
12+
; CHECK-LABEL: test1:
13+
; CHECK: // %bb.0: // %entry
14+
; CHECK-NEXT: sub w8, w2, w0
15+
; CHECK-NEXT: eor w9, w1, w0, lsl #8
16+
; CHECK-NEXT: sub w8, w8, w9
17+
; CHECK-NEXT: eor w0, w8, w9, asr #13
18+
; CHECK-NEXT: ret
19+
entry:
20+
%shl = shl i32 %a, 8
21+
%xor = xor i32 %shl, %b
22+
%add = add i32 %xor, %a
23+
%sub = sub i32 %c, %add
24+
%shr = ashr i32 %xor, 13
25+
%xor2 = xor i32 %sub, %shr
26+
ret i32 %xor2
27+
}
28+
29+
; 64 bit version.
30+
define i64 @test2(i64 %a, i64 %b, i64 %c) {
31+
; CHECK-LABEL: test2:
32+
; CHECK: // %bb.0: // %entry
33+
; CHECK-NEXT: sub x8, x2, x0
34+
; CHECK-NEXT: eor x9, x1, x0, lsl #8
35+
; CHECK-NEXT: sub x8, x8, x9
36+
; CHECK-NEXT: eor x0, x8, x9, asr #13
37+
; CHECK-NEXT: ret
38+
entry:
39+
%shl = shl i64 %a, 8
40+
%xor = xor i64 %shl, %b
41+
%add = add i64 %xor, %a
42+
%sub = sub i64 %c, %add
43+
%shr = ashr i64 %xor, 13
44+
%xor2 = xor i64 %sub, %shr
45+
ret i64 %xor2
46+
}

llvm/test/CodeGen/AArch64/neon-mla-mls.ll

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,9 @@ define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
138138
define <8 x i8> @mls2v8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
139139
; CHECK-LABEL: mls2v8xi8:
140140
; CHECK: // %bb.0:
141-
; CHECK-NEXT: mul v0.8b, v0.8b, v1.8b
142-
; CHECK-NEXT: sub v0.8b, v0.8b, v2.8b
141+
; CHECK-NEXT: neg v2.8b, v2.8b
142+
; CHECK-NEXT: mla v2.8b, v0.8b, v1.8b
143+
; CHECK-NEXT: fmov d0, d2
143144
; CHECK-NEXT: ret
144145
%tmp1 = mul <8 x i8> %A, %B;
145146
%tmp2 = sub <8 x i8> %tmp1, %C;
@@ -149,8 +150,9 @@ define <8 x i8> @mls2v8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
149150
define <16 x i8> @mls2v16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
150151
; CHECK-LABEL: mls2v16xi8:
151152
; CHECK: // %bb.0:
152-
; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b
153-
; CHECK-NEXT: sub v0.16b, v0.16b, v2.16b
153+
; CHECK-NEXT: neg v2.16b, v2.16b
154+
; CHECK-NEXT: mla v2.16b, v0.16b, v1.16b
155+
; CHECK-NEXT: mov v0.16b, v2.16b
154156
; CHECK-NEXT: ret
155157
%tmp1 = mul <16 x i8> %A, %B;
156158
%tmp2 = sub <16 x i8> %tmp1, %C;
@@ -160,8 +162,9 @@ define <16 x i8> @mls2v16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
160162
define <4 x i16> @mls2v4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
161163
; CHECK-LABEL: mls2v4xi16:
162164
; CHECK: // %bb.0:
163-
; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
164-
; CHECK-NEXT: sub v0.4h, v0.4h, v2.4h
165+
; CHECK-NEXT: neg v2.4h, v2.4h
166+
; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h
167+
; CHECK-NEXT: fmov d0, d2
165168
; CHECK-NEXT: ret
166169
%tmp1 = mul <4 x i16> %A, %B;
167170
%tmp2 = sub <4 x i16> %tmp1, %C;
@@ -171,8 +174,9 @@ define <4 x i16> @mls2v4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
171174
define <8 x i16> @mls2v8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
172175
; CHECK-LABEL: mls2v8xi16:
173176
; CHECK: // %bb.0:
174-
; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
175-
; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h
177+
; CHECK-NEXT: neg v2.8h, v2.8h
178+
; CHECK-NEXT: mla v2.8h, v0.8h, v1.8h
179+
; CHECK-NEXT: mov v0.16b, v2.16b
176180
; CHECK-NEXT: ret
177181
%tmp1 = mul <8 x i16> %A, %B;
178182
%tmp2 = sub <8 x i16> %tmp1, %C;
@@ -182,8 +186,9 @@ define <8 x i16> @mls2v8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
182186
define <2 x i32> @mls2v2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
183187
; CHECK-LABEL: mls2v2xi32:
184188
; CHECK: // %bb.0:
185-
; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
186-
; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
189+
; CHECK-NEXT: neg v2.2s, v2.2s
190+
; CHECK-NEXT: mla v2.2s, v0.2s, v1.2s
191+
; CHECK-NEXT: fmov d0, d2
187192
; CHECK-NEXT: ret
188193
%tmp1 = mul <2 x i32> %A, %B;
189194
%tmp2 = sub <2 x i32> %tmp1, %C;
@@ -193,8 +198,9 @@ define <2 x i32> @mls2v2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
193198
define <4 x i32> @mls2v4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
194199
; CHECK-LABEL: mls2v4xi32:
195200
; CHECK: // %bb.0:
196-
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
197-
; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
201+
; CHECK-NEXT: neg v2.4s, v2.4s
202+
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
203+
; CHECK-NEXT: mov v0.16b, v2.16b
198204
; CHECK-NEXT: ret
199205
%tmp1 = mul <4 x i32> %A, %B;
200206
%tmp2 = sub <4 x i32> %tmp1, %C;

0 commit comments

Comments
 (0)