Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PowerPC] Add dense math bfloat16 floating-point outer-product accumulate to DMR instructions #133109

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

maryammo
Copy link
Contributor

This patch adds the following Dense Math Facility bfloat16 floating-point calculation instructions: dmxvbf16gerx2, dmxvbf16gerx2pp, dmxvbf16gerx2pn, dmxvbf16gerx2np, dmxvbf16gerx2nn, pmdmxvbf16gerx2, pmdmxvbf16gerx2pp, pmdmxvbf16gerx2pn, pmdmxvbf16gerx2np, pmdmxvbf16gerx2nn, along with their corresponding intrinsics and tests.

@maryammo maryammo self-assigned this Mar 26, 2025
@maryammo maryammo requested a review from RolandF77 March 26, 2025 15:45
@llvmbot llvmbot added mc Machine (object) code llvm:ir labels Mar 26, 2025
@llvmbot
Copy link
Member

llvmbot commented Mar 26, 2025

@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-powerpc

Author: Maryam Moghadas (maryammo)

Changes

This patch adds the following Dense Math Facility bfloat16 floating-point calculation instructions: dmxvbf16gerx2, dmxvbf16gerx2pp, dmxvbf16gerx2pn, dmxvbf16gerx2np, dmxvbf16gerx2nn, pmdmxvbf16gerx2, pmdmxvbf16gerx2pp, pmdmxvbf16gerx2pn, pmdmxvbf16gerx2np, pmdmxvbf16gerx2nn, along with their corresponding intrinsics and tests.


Patch is 36.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133109.diff

6 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsPowerPC.td (+23)
  • (modified) llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td (+161-1)
  • (modified) llvm/test/CodeGen/PowerPC/dmf-outer-product.ll (+484)
  • (modified) llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt (+31-1)
  • (modified) llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt (+30)
  • (modified) llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s (+50)
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index e4d39134a4a25..b57102ef68f09 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -280,6 +280,22 @@ multiclass PowerPC_MMA_ACC_PP_Intrinsic<list<LLVMType> args> {
                                  [IntrNoMem]>;
 }
 
+multiclass PowerPC_MMA_DMR_Intrinsic<list<LLVMType> args> {
+  def NAME: DefaultAttrsIntrinsic<[llvm_v1024i1_ty], args, [IntrNoMem]>;
+  def pp : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+                                 !listconcat([llvm_v1024i1_ty], args),
+                                 [IntrNoMem]>;
+  def pn : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+                                 !listconcat([llvm_v1024i1_ty], args),
+                                 [IntrNoMem]>;
+  def np : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+                                 !listconcat([llvm_v1024i1_ty], args),
+                                 [IntrNoMem]>;
+  def nn : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+                                 !listconcat([llvm_v1024i1_ty], args),
+                                 [IntrNoMem]>;
+}
+
 multiclass PowerPC_MMA_DMR_PP_Intrinsic<list<LLVMType> args> {
   def NAME: DefaultAttrsIntrinsic<[llvm_v1024i1_ty], args, [IntrNoMem]>;
   def pp : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
@@ -1732,6 +1748,13 @@ let TargetPrefix = "ppc" in {
                             [llvm_v1024i1_ty, llvm_v256i1_ty, llvm_v16i8_ty,
                              llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
                             [IntrNoMem]>;
+
+  // MMA+ Reduced-Precision: bfloat16 Outer Product Intrinsic Definitions.
+  defm int_ppc_mma_dmxvbf16gerx2 :
+       PowerPC_MMA_DMR_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty]>;
+  defm int_ppc_mma_pmdmxvbf16gerx2 :
+       PowerPC_MMA_DMR_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty, llvm_i32_ty,
+                                     llvm_i32_ty, llvm_i32_ty]>;
 }
 
 // XL Compat intrinsics.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
index d4f0e222b457c..8ea0924f09b43 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
@@ -95,7 +95,7 @@ class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
                                  list<dag> pattern>
   : PI<1, opcode, OOL, IOL, asmstr, itin> {
   bits<3> AT;
-  bits<6> XAp;
+  bits<5> XAp;
   bits<6> XB;
   bits<8> XMSK;
   bits<4> YMSK;
@@ -123,6 +123,40 @@ class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
   let Inst{63} = 0;
 }
 
+class MMIRR_XX3Form_X8Y4P2_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+                                 string asmstr, InstrItinClass itin,
+                                 list<dag> pattern>
+  : PI<1, opcode, OOL, IOL, asmstr, itin> {
+  bits<3> AT;
+  bits<5> XAp;
+  bits<6> XB;
+  bits<8> XMSK;
+  bits<4> YMSK;
+  bits<2> PMSK;
+
+  let Pattern = pattern;
+
+  // The prefix.
+  let Inst{6-7} = 3;
+  let Inst{8-11} = 9;
+  let Inst{12-15} = 0;
+  let Inst{16-17} = PMSK;
+  let Inst{18-19} = 0;
+  let Inst{20-27} = XMSK;
+  let Inst{28-31} = YMSK;
+
+  // The instruction.
+  let Inst{38-40} = AT;
+  let Inst{41-42} = 0;
+  let Inst{43-46} = XAp{3-0};
+  let Inst{47} = 0;
+  let Inst{48-52} = XB{4-0};
+  let Inst{53-60} = xo;
+  let Inst{61} = XAp{4};
+  let Inst{62} = XB{5};
+  let Inst{63} = 0;
+}
+
 multiclass DMR_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
                        string asmstr> {
   let Predicates = [MMA, IsISAFuture] in {
@@ -159,6 +193,83 @@ multiclass DMR_UM_M448_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
   }
 }
 
+multiclass DMR_BF16_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                       string asmstr> {
+  let Predicates = [MMA, IsISAFuture] in {
+  def NAME :
+    XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x11), (outs dmr:$AT), IOL,
+                     !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+    RegConstraint<"@earlyclobber $AT">;
+  def PP :
+    XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+                     !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+multiclass DMR_UM_M284_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+                            string asmstr> {
+  defm NAME : DMR_BF16_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
+  def PM#NAME :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, !or(xo, 0x11), (outs dmr:$AT),
+      !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)),
+      !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"@earlyclobber $AT">;
+  def PM#NAME#PP :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, xo, (outs dmr:$AT),
+      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+      !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
+multiclass DMR_NEG_UM_M284_XOXORf939a0<bits<6> opcode, bits<8> xo, dag IOL,
+                                  string asmbase, string asmstr> {
+  defm NAME : DMR_UM_M284_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+  let Predicates = [MMA, IsISAFuture] in {
+  def PN : XX3Form_AT3_XAp5B6<
+             opcode, !xor(xo, 0xF9), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+             !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
+           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def NP : XX3Form_AT3_XAp5B6<
+             opcode, !xor(xo, 0x39), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+             !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
+           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def NN : XX3Form_AT3_XAp5B6<
+             opcode, !xor(xo, 0xA0), (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+             !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
+           RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+  let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
+   def PM#NAME#PN :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, !xor(xo, 0xF9), (outs dmr:$AT),
+      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+      !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def PM#NAME#NP :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, !xor(xo, 0x39), (outs dmr:$AT),
+      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+      !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  def PM#NAME#NN :
+    MMIRR_XX3Form_X8Y4P2_XAp5B6<
+      opcode, !xor(xo, 0xA0), (outs dmr:$AT),
+      !con((ins dmr:$ATi), !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))),
+      !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"),
+      IIC_VecFP, []>,
+    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+  }
+}
+
 let Predicates = [IsISAFuture] in {
   def DMXXEXTFDMR512 : XX3Form_AT3_XABp5_P1<60, 226,
                                             (outs vsrprc:$XAp, vsrprc:$XBp),
@@ -231,6 +342,11 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
     RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
 }
 
+// DMXVBF16GERX2, DMXVBF16GERX2PP, DMXVBF16GERX2PN, DMXVBF16GERX2NP, DMXVBF16GERX2NN
+// PMDMXVBF16GERX2, PMDMXVBF16GERX2PP, PMDMXVBF16GERX2PN, PMDMXVBF16GERX2NP, PMDMXVBF16GERX2NN
+defm DMXVBF16GERX2 : DMR_NEG_UM_M284_XOXORf939a0<59, 74, (ins vsrprc:$XAp, vsrc:$XB),
+                                         "dmxvbf16gerx2", "$AT, $XAp, $XB">;
+
 // MMA+ Intrinsics
 let Predicates = [MMA, IsISAFuture] in {
   def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4 v256i1:$XAp, v16i8:$XB)),
@@ -240,6 +356,21 @@ let Predicates = [MMA, IsISAFuture] in {
 
   def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
             (DMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2 v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2 $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_dmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+            (DMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>;
 }
 
 let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
@@ -259,4 +390,33 @@ let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
                                                Msk4Imm:$PMSK)),
             (PMDMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
                            Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
+                                            Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                        Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                                              Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2pn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                                               Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2PN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2np v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+                                              Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                                              Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2NP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
+
+  def : Pat<(v1024i1 (int_ppc_mma_pmdmxvbf16gerx2nn v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+                                               Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+                                               Msk2Imm:$PMSK)),
+            (PMDMXVBF16GERX2NN $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
 }
diff --git a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
index cba52567c900d..e3b43062f417c 100644
--- a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
+++ b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
@@ -285,3 +285,487 @@ entry:
   store <1024 x i1> %call, ptr %resp, align 64
   ret void
 }
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxv v2, 16(r3)
+; CHECK-NEXT:    lxv vs0, 0(r4)
+; CHECK-NEXT:    lxv v3, 0(r3)
+; CHECK-NEXT:    dmxvbf16gerx2 dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r5)
+; CHECK-NEXT:    stxvp vsp36, 64(r5)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r5)
+; CHECK-NEXT:    stxvp vsp36, 0(r5)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxv v2, 0(r3)
+; CHECK-BE-NEXT:    lxv vs0, 0(r4)
+; CHECK-BE-NEXT:    lxv v3, 16(r3)
+; CHECK-BE-NEXT:    dmxvbf16gerx2 dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2pp:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxvbf16gerx2pp dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2pp:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    dmxvbf16gerx2pp dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2pn:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxvbf16gerx2pn dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2pn:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    dmxvbf16gerx2pn dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2np:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxvbf16gerx2np dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2np:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT:    lxv v2, 0(r4)
+; CHECK-BE-NEXT:    lxv vs0, 0(r5)
+; CHECK-BE-NEXT:    lxv v3, 16(r4)
+; CHECK-BE-NEXT:    dmxvbf16gerx2np dmr0, vsp34, vs0
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT:    stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT:    stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT:    stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT:    blr
+entry:
+  %v.dmr = load <1024 x i1>, ptr %vop, align 64
+  %v1 = load <256 x i1>, ptr %vpp, align 32
+  %v2 = load <16 x i8>, ptr %vcp, align 32
+  %call = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1> %v.dmr, <256 x i1> %v1, <16 x i8> %v2)
+  store <1024 x i1> %call, ptr %resp, align 64
+  ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2nn(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvbf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvbf16gerx2nn:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvp vsp34, 0(r3)
+; CHECK-NEXT:    lxvp vsp36, 32(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT:    lxvp vsp34, 64(r3)
+; CHECK-NEXT:    lxvp vsp36, 96(r3)
+; CHECK-NEXT:    dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT:    lxv v2, 16(r4)
+; CHECK-NEXT:    lxv vs0, 0(r5)
+; CHECK-NEXT:    lxv v3, 0(r4)
+; CHECK-NEXT:    dmxvbf16gerx2nn dmr0, vsp34, vs0
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT:    stxvp vsp34, 96(r6)
+; CHECK-NEXT:    stxvp vsp36, 64(r6)
+; CHECK-NEXT:    dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT:    stxvp vsp34, 32(r6)
+; CHECK-NEXT:    stxvp vsp36, 0(r6)
+; CHECK-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_dmxvbf16gerx2nn:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT:    dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT:    lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT:    lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT:...
[truncated]

@maryammo maryammo requested review from lei137 and mandlebug March 26, 2025 15:45
…late to DMR instructions

This patch adds the following Dense Math Facility bfloat16 floating-point
calculation instructions: dmxvbf16gerx2, dmxvbf16gerx2pp, dmxvbf16gerx2pn,
dmxvbf16gerx2np, dmxvbf16gerx2nn, pmdmxvbf16gerx2, pmdmxvbf16gerx2pp,
pmdmxvbf16gerx2pn, pmdmxvbf16gerx2np, pmdmxvbf16gerx2nn, along with their
corresponding intrinsics and tests.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:PowerPC llvm:ir mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants