Skip to content

Commit d768d35

Browse files
committed
[MC][X86] Correctly model additional operand latency caused by transfer delays from the integer to the floating point unit.
This patch adds a new ReadAdvance definition named ReadInt2Fpu. ReadInt2Fpu allows x86 scheduling models to accurately describe delays caused by data transfers from the integer unit to the floating point unit. ReadInt2Fpu currently defaults to a delay of zero cycles (i.e. no delay) for all x86 models excluding BtVer2. That means this patch is a functional change for the Jaguar cpu model only. Tablegen definitions for instructions (V)PINSR* have been updated to account for the new ReadInt2Fpu. That read is mapped to the GPR input operand. On Jaguar, int-to-fpu transfers are modeled as a +6cy delay. Before this patch, that extra delay was added to the opcode latency. In practice, the insert opcode only executes for 1cy. Most of the actual latency is contributed by the so-called operand-latency. According to the AMD SOG for family 16h, (V)PINSR* latency is defined by expression f+1, where f is defined as a forwarding delay from the integer unit to the fpu. When printing instruction latency from MCA (see InstructionInfoView.cpp) and LLC (only when flag -print-schedule is specified), we now need to account for any extra forwarding delays. We do this by checking if scheduling classes declare any negative ReadAdvance entries. Quoting a code comment in TargetSchedule.td: "A negative advance effectively increases latency, which may be used for cross-domain stalls". When computing the instruction latency for the purpose of our scheduling tests, we now add any extra delay to the formula. This avoids regressing existing codegen and mca schedule tests. It comes at the cost of an extra (but very simple) hook in MCSchedModel. Differential Revision: https://reviews.llvm.org/D57056 llvm-svn: 351965
1 parent 21ed868 commit d768d35

24 files changed

+123
-40
lines changed

llvm/include/llvm/MC/MCSchedule.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#ifndef LLVM_MC_MCSCHEDULE_H
1515
#define LLVM_MC_MCSCHEDULE_H
1616

17+
#include "llvm/ADT/ArrayRef.h"
1718
#include "llvm/ADT/Optional.h"
1819
#include "llvm/Config/llvm-config.h"
1920
#include "llvm/Support/DataTypes.h"
@@ -369,6 +370,11 @@ struct MCSchedModel {
369370
getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
370371
const MCInst &Inst) const;
371372

373+
/// Returns the maximum forwarding delay for register reads dependent on
374+
/// writes of scheduling class WriteResourceIdx.
375+
static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
376+
unsigned WriteResourceIdx = 0);
377+
372378
/// Returns the default initialized model.
373379
static const MCSchedModel &GetDefaultSchedModel() { return Default; }
374380
static const MCSchedModel Default;

llvm/include/llvm/MC/MCSubtargetInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,16 @@ class MCSubtargetInfo {
152152
return 0;
153153
}
154154

155+
/// Return the set of ReadAdvance entries declared by the scheduling class
156+
/// descriptor in input.
157+
ArrayRef<MCReadAdvanceEntry>
158+
getReadAdvanceEntries(const MCSchedClassDesc &SC) const {
159+
if (!SC.NumReadAdvanceEntries)
160+
return ArrayRef<MCReadAdvanceEntry>();
161+
return ArrayRef<MCReadAdvanceEntry>(&ReadAdvanceTable[SC.ReadAdvanceIdx],
162+
SC.NumReadAdvanceEntries);
163+
}
164+
155165
/// Get scheduling itinerary of a CPU.
156166
InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const;
157167

llvm/include/llvm/MCA/Instruction.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,10 @@ struct InstrDesc {
332332
unsigned MaxLatency;
333333
// Number of MicroOps for this instruction.
334334
unsigned NumMicroOps;
335+
// SchedClassID used to construct this InstrDesc.
336+
// This information is currently used by views to do fast queries on the
337+
// subtarget when computing the reciprocal throughput.
338+
unsigned SchedClassID;
335339

336340
bool MayLoad;
337341
bool MayStore;

llvm/lib/CodeGen/TargetSubtargetInfo.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,12 @@ std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const {
8888
TargetSchedModel TSchedModel;
8989
TSchedModel.init(this);
9090
unsigned Latency = TSchedModel.computeInstrLatency(&MI);
91+
92+
// Add extra latency due to forwarding delays.
93+
const MCSchedClassDesc &SCDesc = *TSchedModel.resolveSchedClass(&MI);
94+
Latency +=
95+
MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
96+
9197
double RThroughput = TSchedModel.computeReciprocalThroughput(&MI);
9298
return createSchedInfoStr(Latency, RThroughput);
9399
}
@@ -99,9 +105,17 @@ std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const {
99105
TargetSchedModel TSchedModel;
100106
TSchedModel.init(this);
101107
unsigned Latency;
102-
if (TSchedModel.hasInstrSchedModel())
108+
if (TSchedModel.hasInstrSchedModel()) {
103109
Latency = TSchedModel.computeInstrLatency(MCI);
104-
else if (TSchedModel.hasInstrItineraries()) {
110+
// Add extra latency due to forwarding delays.
111+
const MCSchedModel &SM = *TSchedModel.getMCSchedModel();
112+
unsigned SClassID = getInstrInfo()->get(MCI.getOpcode()).getSchedClass();
113+
while (SM.getSchedClassDesc(SClassID)->isVariant())
114+
SClassID = resolveVariantSchedClass(SClassID, &MCI, SM.ProcID);
115+
const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SClassID);
116+
Latency +=
117+
MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
118+
} else if (TSchedModel.hasInstrItineraries()) {
105119
auto *ItinData = TSchedModel.getInstrItineraries();
106120
Latency = ItinData->getStageLatency(
107121
getInstrInfo()->get(MCI.getOpcode()).getSchedClass());

llvm/lib/MC/MCSchedule.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,19 @@ MCSchedModel::getReciprocalThroughput(unsigned SchedClass,
149149
// that it can execute at the maximum default issue width.
150150
return 1.0 / DefaultIssueWidth;
151151
}
152+
153+
unsigned
154+
MCSchedModel::getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
155+
unsigned WriteResourceID) {
156+
if (Entries.empty())
157+
return 0;
158+
159+
int DelayCycles = 0;
160+
for (const MCReadAdvanceEntry &E : Entries) {
161+
if (E.WriteResourceID != WriteResourceID)
162+
continue;
163+
DelayCycles = std::min(DelayCycles, E.Cycles);
164+
}
165+
166+
return std::abs(DelayCycles);
167+
}

llvm/lib/MCA/InstrBuilder.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
532532
// Create a new empty descriptor.
533533
std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
534534
ID->NumMicroOps = SCDesc.NumMicroOps;
535+
ID->SchedClassID = SchedClassID;
535536

536537
if (MCDesc.isCall() && FirstCallInst) {
537538
// We don't correctly model calls.

llvm/lib/Target/X86/X86InstrMMX.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in {
543543
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
544544
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
545545
GR32orGR64:$src2, imm:$src3))]>,
546-
Sched<[WriteVecInsert]>;
546+
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
547547

548548
def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
549549
(outs VR64:$dst),

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4122,7 +4122,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
41224122
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
41234123
[(set VR128:$dst,
41244124
(X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
4125-
Sched<[WriteVecInsert]>;
4125+
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
41264126
def rm : Ii8<0xC4, MRMSrcMem,
41274127
(outs VR128:$dst), (ins VR128:$src1,
41284128
i16mem:$src2, u8imm:$src3),
@@ -5577,7 +5577,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
55775577
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
55785578
[(set VR128:$dst,
55795579
(X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5580-
Sched<[WriteVecInsert]>;
5580+
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
55815581
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
55825582
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
55835583
!if(Is2Addr,
@@ -5603,7 +5603,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
56035603
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
56045604
[(set VR128:$dst,
56055605
(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5606-
Sched<[WriteVecInsert]>;
5606+
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
56075607
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
56085608
(ins VR128:$src1, i32mem:$src2, u8imm:$src3),
56095609
!if(Is2Addr,
@@ -5629,7 +5629,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
56295629
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
56305630
[(set VR128:$dst,
56315631
(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5632-
Sched<[WriteVecInsert]>;
5632+
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
56335633
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
56345634
(ins VR128:$src1, i64mem:$src2, u8imm:$src3),
56355635
!if(Is2Addr,

llvm/lib/Target/X86/X86SchedBroadwell.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
8181
def : ReadAdvance<ReadAfterVecXLd, 5>;
8282
def : ReadAdvance<ReadAfterVecYLd, 6>;
8383

84+
def : ReadAdvance<ReadInt2Fpu, 0>;
85+
8486
// Many SchedWrites are defined in pairs with and without a folded load.
8587
// Instructions with folded loads are usually micro-fused, so they only appear
8688
// as two micro-ops when queued in the reservation station.

llvm/lib/Target/X86/X86SchedHaswell.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
8686
def : ReadAdvance<ReadAfterVecXLd, 6>;
8787
def : ReadAdvance<ReadAfterVecYLd, 7>;
8888

89+
def : ReadAdvance<ReadInt2Fpu, 0>;
90+
8991
// Many SchedWrites are defined in pairs with and without a folded load.
9092
// Instructions with folded loads are usually micro-fused, so they only appear
9193
// as two micro-ops when queued in the reservation station.

llvm/lib/Target/X86/X86SchedSandyBridge.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
7676
def : ReadAdvance<ReadAfterVecXLd, 6>;
7777
def : ReadAdvance<ReadAfterVecYLd, 7>;
7878

79+
def : ReadAdvance<ReadInt2Fpu, 0>;
80+
7981
// Many SchedWrites are defined in pairs with and without a folded load.
8082
// Instructions with folded loads are usually micro-fused, so they only appear
8183
// as two micro-ops when queued in the reservation station.

llvm/lib/Target/X86/X86SchedSkylakeClient.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
8080
def : ReadAdvance<ReadAfterVecXLd, 6>;
8181
def : ReadAdvance<ReadAfterVecYLd, 7>;
8282

83+
def : ReadAdvance<ReadInt2Fpu, 0>;
84+
8385
// Many SchedWrites are defined in pairs with and without a folded load.
8486
// Instructions with folded loads are usually micro-fused, so they only appear
8587
// as two micro-ops when queued in the reservation station.

llvm/lib/Target/X86/X86SchedSkylakeServer.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
8080
def : ReadAdvance<ReadAfterVecXLd, 6>;
8181
def : ReadAdvance<ReadAfterVecYLd, 7>;
8282

83+
def : ReadAdvance<ReadInt2Fpu, 0>;
84+
8385
// Many SchedWrites are defined in pairs with and without a folded load.
8486
// Instructions with folded loads are usually micro-fused, so they only appear
8587
// as two micro-ops when queued in the reservation station.

llvm/lib/Target/X86/X86Schedule.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ def ReadAfterVecLd : SchedRead;
1717
def ReadAfterVecXLd : SchedRead;
1818
def ReadAfterVecYLd : SchedRead;
1919

20+
// Instructions that move data between general purpose registers and vector
21+
// registers may be subject to extra latency due to data bypass delays.
22+
// This SchedRead describes a bypass delay caused by data being moved from the
23+
// integer unit to the floating point unit.
24+
def ReadInt2Fpu : SchedRead;
25+
2026
// Instructions with both a load and a store folded are modeled as a folded
2127
// load + WriteRMW.
2228
def WriteRMW : SchedWrite;

llvm/lib/Target/X86/X86ScheduleAtom.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
4646
def : ReadAdvance<ReadAfterVecXLd, 3>;
4747
def : ReadAdvance<ReadAfterVecYLd, 3>;
4848

49+
def : ReadAdvance<ReadInt2Fpu, 0>;
50+
4951
// Many SchedWrites are defined in pairs with and without a folded load.
5052
// Instructions with folded loads are usually micro-fused, so they only appear
5153
// as two micro-ops when dispatched by the schedulers.

llvm/lib/Target/X86/X86ScheduleBdVer2.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,8 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
250250
def : ReadAdvance<ReadAfterVecXLd, 5>;
251251
def : ReadAdvance<ReadAfterVecYLd, 5>;
252252

253+
def : ReadAdvance<ReadInt2Fpu, 0>;
254+
253255
// A folded store needs a cycle on the PdStore for the store data.
254256
def : WriteRes<WriteRMW, [PdStore]>;
255257

llvm/lib/Target/X86/X86ScheduleBtVer2.td

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,11 @@ def : ReadAdvance<ReadAfterVecLd, 5>;
108108
def : ReadAdvance<ReadAfterVecXLd, 5>;
109109
def : ReadAdvance<ReadAfterVecYLd, 5>;
110110

111+
/// "Additional 6 cycle transfer operation which moves a floating point
112+
/// operation input value from the integer unit to the floating point unit."
113+
/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
114+
def : ReadAdvance<ReadInt2Fpu, -6>;
115+
111116
// Many SchedWrites are defined in pairs with and without a folded load.
112117
// Instructions with folded loads are usually micro-fused, so they only appear
113118
// as two micro-ops when dispatched by the schedulers.
@@ -540,7 +545,7 @@ defm : X86WriteResPairUnsupported<WriteVarShuffle256>;
540545
// Vector insert/extract operations.
541546
////////////////////////////////////////////////////////////////////////////////
542547

543-
defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
548+
defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
544549
defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
545550
defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
546551
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;

llvm/lib/Target/X86/X86ScheduleSLM.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def : ReadAdvance<ReadAfterVecLd, 3>;
5252
def : ReadAdvance<ReadAfterVecXLd, 3>;
5353
def : ReadAdvance<ReadAfterVecYLd, 3>;
5454

55+
def : ReadAdvance<ReadInt2Fpu, 0>;
56+
5557
// Many SchedWrites are defined in pairs with and without a folded load.
5658
// Instructions with folded loads are usually micro-fused, so they only appear
5759
// as two micro-ops when queued in the reservation station.

llvm/lib/Target/X86/X86ScheduleZnver1.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ def : ReadAdvance<ReadAfterVecLd, 8>;
9494
def : ReadAdvance<ReadAfterVecXLd, 8>;
9595
def : ReadAdvance<ReadAfterVecYLd, 8>;
9696

97+
def : ReadAdvance<ReadInt2Fpu, 0>;
98+
9799
// The Integer PRF for Zen is 168 entries, and it holds the architectural and
98100
// speculative version of the 64-bit integer registers.
99101
// Reference: "Software Optimization Guide for AMD Family 17h Processors"

llvm/test/CodeGen/X86/mmx-schedule.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3887,8 +3887,8 @@ define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize {
38873887
;
38883888
; BTVER2-LABEL: test_pinsrw:
38893889
; BTVER2: # %bb.0:
3890-
; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50]
38913890
; BTVER2-NEXT: movswl (%rsi), %eax # sched: [4:1.00]
3891+
; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50]
38923892
; BTVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [7:0.50]
38933893
; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00]
38943894
; BTVER2-NEXT: retq # sched: [4:1.00]

llvm/test/CodeGen/X86/sse41-schedule.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2679,15 +2679,15 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
26792679
;
26802680
; BTVER2-SSE-LABEL: test_pinsrq:
26812681
; BTVER2-SSE: # %bb.0:
2682-
; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50]
26832682
; BTVER2-SSE-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00]
2683+
; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50]
26842684
; BTVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
26852685
; BTVER2-SSE-NEXT: retq # sched: [4:1.00]
26862686
;
26872687
; BTVER2-LABEL: test_pinsrq:
26882688
; BTVER2: # %bb.0:
2689-
; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50]
26902689
; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [4:1.00]
2690+
; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50]
26912691
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
26922692
; BTVER2-NEXT: retq # sched: [4:1.00]
26932693
;

llvm/test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0
2727

2828
# CHECK: Iterations: 500
2929
# CHECK-NEXT: Instructions: 1000
30-
# CHECK-NEXT: Total Cycles: 7003
30+
# CHECK-NEXT: Total Cycles: 1003
3131
# CHECK-NEXT: Total uOps: 2000
3232

3333
# CHECK: Dispatch Width: 2
34-
# CHECK-NEXT: uOps Per Cycle: 0.29
35-
# CHECK-NEXT: IPC: 0.14
34+
# CHECK-NEXT: uOps Per Cycle: 1.99
35+
# CHECK-NEXT: IPC: 1.00
3636
# CHECK-NEXT: Block RThroughput: 2.0
3737

3838
# CHECK: Instruction Info:
@@ -76,12 +76,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0
7676

7777
# CHECK: Iterations: 500
7878
# CHECK-NEXT: Instructions: 1000
79-
# CHECK-NEXT: Total Cycles: 7003
79+
# CHECK-NEXT: Total Cycles: 1003
8080
# CHECK-NEXT: Total uOps: 2000
8181

8282
# CHECK: Dispatch Width: 2
83-
# CHECK-NEXT: uOps Per Cycle: 0.29
84-
# CHECK-NEXT: IPC: 0.14
83+
# CHECK-NEXT: uOps Per Cycle: 1.99
84+
# CHECK-NEXT: IPC: 1.00
8585
# CHECK-NEXT: Block RThroughput: 2.0
8686

8787
# CHECK: Instruction Info:
@@ -125,12 +125,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0
125125

126126
# CHECK: Iterations: 500
127127
# CHECK-NEXT: Instructions: 1000
128-
# CHECK-NEXT: Total Cycles: 7003
128+
# CHECK-NEXT: Total Cycles: 1003
129129
# CHECK-NEXT: Total uOps: 2000
130130

131131
# CHECK: Dispatch Width: 2
132-
# CHECK-NEXT: uOps Per Cycle: 0.29
133-
# CHECK-NEXT: IPC: 0.14
132+
# CHECK-NEXT: uOps Per Cycle: 1.99
133+
# CHECK-NEXT: IPC: 1.00
134134
# CHECK-NEXT: Block RThroughput: 2.0
135135

136136
# CHECK: Instruction Info:
@@ -174,12 +174,12 @@ vpinsrq $1, %rax, %xmm0, %xmm0
174174

175175
# CHECK: Iterations: 500
176176
# CHECK-NEXT: Instructions: 1000
177-
# CHECK-NEXT: Total Cycles: 7003
177+
# CHECK-NEXT: Total Cycles: 1003
178178
# CHECK-NEXT: Total uOps: 2000
179179

180180
# CHECK: Dispatch Width: 2
181-
# CHECK-NEXT: uOps Per Cycle: 0.29
182-
# CHECK-NEXT: IPC: 0.14
181+
# CHECK-NEXT: uOps Per Cycle: 1.99
182+
# CHECK-NEXT: IPC: 1.00
183183
# CHECK-NEXT: Block RThroughput: 2.0
184184

185185
# CHECK: Instruction Info:

0 commit comments

Comments
 (0)