Skip to content

Commit 1184c27

Browse files
committed
[AMDGPU] Support mov dpp with 64 bit operands
We define mov/update dpp intrinsics as overloaded but do not support i64, which is a practically useful type. Fix the selection and lowering. Differential Revision: https://reviews.llvm.org/D68673 llvm-svn: 374910
1 parent 034badb commit 1184c27

File tree

7 files changed

+226
-6
lines changed

7 files changed

+226
-6
lines changed

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,13 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
557557
if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
558558
Changed = true;
559559
++NumDPPMovsCombined;
560+
} else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
561+
auto Split = TII->expandMovDPP64(MI);
562+
for (auto M : { Split.first, Split.second }) {
563+
if (combineDPPMov(*M))
564+
++NumDPPMovsCombined;
565+
}
566+
Changed = true;
560567
}
561568
}
562569
}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,6 +1454,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
14541454
MI.eraseFromParent();
14551455
break;
14561456
}
1457+
case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
1458+
expandMovDPP64(MI);
1459+
break;
1460+
}
14571461
case AMDGPU::V_SET_INACTIVE_B32: {
14581462
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
14591463
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -1567,6 +1571,64 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
15671571
return true;
15681572
}
15691573

1574+
// Expands MI, which must be a V_MOV_B64_DPP_PSEUDO, into two V_MOV_B32_dpp
// instructions built at MI's position, one per 32-bit half (sub0, then sub1),
// and erases MI. Returns the pair {sub0 half, sub1 half}.
//
// If the destination is a physical register, each half writes the matching
// sub-register directly. Otherwise each half defines a fresh VGPR_32 virtual
// register (MRI is asserted to be in SSA form) and a REG_SEQUENCE defining the
// original destination from the two halves is emitted as well.
std::pair<MachineInstr*, MachineInstr*>
1575+
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
1576+
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
1577+
1578+
MachineBasicBlock &MBB = *MI.getParent();
1579+
DebugLoc DL = MBB.findDebugLoc(MI);
1580+
MachineFunction *MF = MBB.getParent();
1581+
MachineRegisterInfo &MRI = MF->getRegInfo();
1582+
Register Dst = MI.getOperand(0).getReg();
1583+
unsigned Part = 0; // 0 while building the sub0 half, 1 for the sub1 half.
1584+
MachineInstr *Split[2]; // The two generated moves, indexed by Part.
1585+
1586+
1587+
for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
1588+
auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
1589+
if (Dst.isPhysical()) {
1590+
MovDPP.addDef(RI.getSubReg(Dst, Sub));
1591+
} else {
1592+
assert(MRI.isSSA());
1593+
auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1594+
MovDPP.addDef(Tmp);
1595+
}
1596+
1597+
for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
1598+
const MachineOperand &SrcOp = MI.getOperand(I);
1599+
assert(!SrcOp.isFPImm());
1600+
if (SrcOp.isImm()) {
1601+
APInt Imm(64, SrcOp.getImm());
1602+
Imm.ashrInPlace(Part * 32); // Bring the half selected by Part into the low 32 bits.
1603+
MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
1604+
} else {
1605+
assert(SrcOp.isReg());
1606+
Register Src = SrcOp.getReg();
1607+
if (Src.isPhysical())
1608+
MovDPP.addReg(RI.getSubReg(Src, Sub));
1609+
else
1610+
MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); // Keep the undef flag; select the half via sub-register index.
1611+
}
1612+
}
1613+
1614+
for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) // Remaining explicit operands are copied as immediates to both halves.
1615+
MovDPP.addImm(MI.getOperand(I).getImm());
1616+
1617+
Split[Part] = MovDPP;
1618+
++Part;
1619+
}
1620+
1621+
if (Dst.isVirtual()) // Recombine the two temporaries into the original 64-bit register.
1622+
BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
1623+
.addReg(Split[0]->getOperand(0).getReg())
1624+
.addImm(AMDGPU::sub0)
1625+
.addReg(Split[1]->getOperand(0).getReg())
1626+
.addImm(AMDGPU::sub1);
1627+
1628+
MI.eraseFromParent();
1629+
return std::make_pair(Split[0], Split[1]);
1630+
}
1631+
15701632
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
15711633
MachineOperand &Src0,
15721634
unsigned Src0OpName,

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
229229

230230
bool expandPostRAPseudo(MachineInstr &MI) const override;
231231

232+
// Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
233+
// instructions. Returns a pair of generated instructions.
234+
// Can split either post-RA with physical registers or pre-RA with
235+
// virtual registers. In latter case IR needs to be in SSA form and
236+
// a REG_SEQUENCE is produced to define original register.
237+
std::pair<MachineInstr*, MachineInstr*>
238+
expandMovDPP64(MachineInstr &MI) const;
239+
232240
// Returns an opcode that can be used to move a value to a \p DstRC
233241
// register. If there is no hardware instruction that can store to \p
234242
// DstRC, then AMDGPU::COPY is returned.

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@ def ATOMIC_FENCE : SPseudoInstSI<
9292
let maybeAtomic = 1;
9393
}
9494

95+
def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
96+
let HasExt = 1;
97+
let HasExtDPP = 1;
98+
}
99+
95100
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
96101

97102
// For use in patterns
@@ -107,6 +112,11 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
107112
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
108113
(ins VSrc_b64:$src0)>;
109114

115+
// 64-bit vector move with dpp. Expanded post-RA, or earlier by GCNDPPCombine.
116+
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
117+
let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
118+
}
119+
110120
// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
111121
// WQM pass processes it.
112122
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
@@ -1855,6 +1865,22 @@ def : GCNPat <
18551865
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
18561866
>;
18571867

1868+
def : GCNPat <
1869+
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
1870+
timm:$bound_ctrl)),
1871+
(V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
1872+
(as_i32imm $row_mask), (as_i32imm $bank_mask),
1873+
(as_i1imm $bound_ctrl))
1874+
>;
1875+
1876+
def : GCNPat <
1877+
(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
1878+
timm:$bank_mask, timm:$bound_ctrl)),
1879+
(V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
1880+
(as_i32imm $row_mask), (as_i32imm $bank_mask),
1881+
(as_i1imm $bound_ctrl))
1882+
>;
1883+
18581884
//===----------------------------------------------------------------------===//
18591885
// Fract Patterns
18601886
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/AMDGPU/dpp_combine.mir

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,3 +718,43 @@ body: |
718718
%6:vgpr_32 = V_ADD_I32_e32 %5.sub0, %8, implicit-def $vcc, implicit $exec
719719
%7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %8, implicit-def $vcc, implicit $vcc, implicit $exec
720720
...
721+
722+
# GCN-LABEL: name: dpp64_add64_impdef
723+
# GCN: %3:vgpr_32 = V_ADD_I32_dpp %1.sub0, %0.sub0, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $exec
724+
# GCN: %5:vgpr_32 = V_ADDC_U32_dpp %1.sub1, %0.sub1, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
725+
name: dpp64_add64_impdef
726+
tracksRegLiveness: true
727+
body: |
728+
bb.0:
729+
%0:vreg_64 = IMPLICIT_DEF
730+
%1:vreg_64 = IMPLICIT_DEF
731+
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 1, 15, 15, 1, implicit $exec
732+
%5:vgpr_32 = V_ADD_I32_e32 %2.sub0, undef %4:vgpr_32, implicit-def $vcc, implicit $exec
733+
%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
734+
...
735+
736+
# GCN-LABEL: name: dpp64_add64_undef
737+
# GCN: %3:vgpr_32 = V_ADD_I32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $exec
738+
# GCN: %5:vgpr_32 = V_ADDC_U32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec
739+
name: dpp64_add64_undef
740+
tracksRegLiveness: true
741+
body: |
742+
bb.0:
743+
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
744+
%5:vgpr_32 = V_ADD_I32_e32 %2.sub0, undef %4:vgpr_32, implicit-def $vcc, implicit $exec
745+
%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
746+
...
747+
748+
# GCN-LABEL: name: dpp64_add64_first_combined
749+
# GCN: %8:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, 1, 15, 15, 1, implicit $exec
750+
# GCN: %0:vreg_64 = REG_SEQUENCE undef %7:vgpr_32, %subreg.sub0, %8, %subreg.sub1
751+
# GCN: %3:vgpr_32 = V_ADD_I32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $exec
752+
# GCN: %5:vgpr_32, dead %6:sreg_64_xexec = V_ADDC_U32_e64 1, %0.sub1, undef $vcc, 0, implicit $exec
753+
name: dpp64_add64_first_combined
754+
tracksRegLiveness: true
755+
body: |
756+
bb.0:
757+
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
758+
%4:vgpr_32 = V_ADD_I32_e32 %2.sub0, undef %3:vgpr_32, implicit-def $vcc, implicit $exec
759+
%5:vgpr_32, dead %6:sreg_64_xexec = V_ADDC_U32_e64 1, %2.sub1, undef $vcc, 0, implicit $exec
760+
...

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,31 @@ endif:
7777
ret void
7878
}
7979

80+
; VI-LABEL: {{^}}mov_dpp64_test:
81+
; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
82+
; VI: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
83+
define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) {
84+
%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 0) #0
85+
store i64 %tmp0, i64 addrspace(1)* %out
86+
ret void
87+
}
88+
89+
; VI-LABEL: {{^}}mov_dpp64_imm_test:
90+
; VI-OPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
91+
; VI-OPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
92+
; VI-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], s[[SOLD_LO]]
93+
; VI-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], s[[SOLD_HI]]
94+
; VI-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
95+
; VI-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
96+
; VI-NOOPT-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
97+
define amdgpu_kernel void @mov_dpp64_imm_test(i64 addrspace(1)* %out) {
98+
%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
99+
store i64 %tmp0, i64 addrspace(1)* %out
100+
ret void
101+
}
102+
80103
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0
104+
declare i64 @llvm.amdgcn.mov.dpp.i64(i64, i32, i32, i32, i1) #0
81105

82106
attributes #0 = { nounwind readnone convergent }
83107

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1-
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8 %s
2-
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
1+
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
2+
; RUN: llc -march=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
3+
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
34

45
; GCN-LABEL: {{^}}dpp_test:
56
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
67
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
7-
; GFX8: s_nop 1
8+
; GFX8-OPT: s_nop 1
9+
; GFX8-NOOPT: s_nop 0
10+
; GFX8-NOOPT-NEXT: s_nop 0
811
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
912
define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
1013
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0
@@ -15,7 +18,9 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
1518
; GCN-LABEL: {{^}}dpp_test_bc:
1619
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
1720
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
18-
; GFX8: s_nop 1
21+
; GFX8-OPT: s_nop 1
22+
; GFX8-NOOPT: s_nop 0
23+
; GFX8-NOOPT-NEXT: s_nop 0
1924
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0{{$}}
2025
define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
2126
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0
@@ -24,14 +29,14 @@ define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in
2429
}
2530

2631

27-
; VI-LABEL: {{^}}dpp_test1:
32+
; GCN-LABEL: {{^}}dpp_test1:
2833
; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
2934
; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
3035
; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
3136
; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
3237
; GFX8: s_nop 0
3338
; GFX8-NEXT: s_nop 0
34-
; GFX8-OPT-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
39+
; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
3540
@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
3641
define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
3742
bb:
@@ -50,8 +55,56 @@ bb:
5055
ret void
5156
}
5257

58+
; GCN-LABEL: {{^}}update_dpp64_test:
59+
; GCN: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
60+
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
61+
; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
62+
define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) {
63+
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
64+
%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
65+
%load = load i64, i64 addrspace(1)* %gep
66+
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
67+
store i64 %tmp0, i64 addrspace(1)* %gep
68+
ret void
69+
}
70+
71+
; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
72+
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
73+
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
74+
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
75+
; GFX8-NOOPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
76+
; GCN-DAG: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
77+
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
78+
; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
79+
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
80+
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
81+
define amdgpu_kernel void @update_dpp64_imm_old_test(i64 addrspace(1)* %arg, i64 %in2) {
82+
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
83+
%gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
84+
%load = load i64, i64 addrspace(1)* %gep
85+
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
86+
store i64 %tmp0, i64 addrspace(1)* %gep
87+
ret void
88+
}
89+
90+
; GCN-LABEL: {{^}}update_dpp64_imm_src_test:
91+
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
92+
; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
93+
; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
94+
; GFX8-NOOPT-DAG: s_movk_i32 s[[SOLD_HI:[0-9]+]], 0x7047
95+
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
96+
; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
97+
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
98+
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
99+
define amdgpu_kernel void @update_dpp64_imm_src_test(i64 addrspace(1)* %out, i64 %in1) {
100+
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
101+
store i64 %tmp0, i64 addrspace(1)* %out
102+
ret void
103+
}
104+
53105
declare i32 @llvm.amdgcn.workitem.id.x()
54106
declare void @llvm.amdgcn.s.barrier()
55107
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
108+
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
56109

57110
attributes #0 = { nounwind readnone convergent }

0 commit comments

Comments
 (0)