i#3044 AArch64 SVE codec: Add ADR instructions (#5866)

This patch adds the appropriate macros, tests and codec entries to encode the following variants: ADR <Zd>.D, [<Zn>.D, <Zm>.D, SXTW <amount>] ADR <Zd>.D, [<Zn>.D, <Zm>.D, UXTW <amount>] ADR <Zd>.<Ts>, [<Zn>.<Ts>, <Zm>.<Ts>, <extend> <amount>] and the required changes to support the use of vector registers in base+disp address operands. This required two main changes: 1) Adding element size to base disp operand ADR uses Z vector registers for the base and index register so we need to be able to specify the element size in the operand. This adds an element size field to base+disp operands (AArch64 only). The following sizes are supported: OPSZ_4: Single OPSZ_8: Double For example, the memory operand for: adr z0.d, [z1.d, z2.d, lsl 2] could be created with the call: opnd_create_vector_base_disp_aarch64(DR_REG_Z1, DR_REG_Z2, OPSZ_8, // Element size DR_EXTEND_UXTX, // LSL true, 0, 0, OPSZ_0, // Transfers 0 bytes 2); // Shift amount This will also be needed for SVE scatter/gather instructions. 2) Move DR_REG_Z* < 256 opnd_t only stores the first 8 bits of the reg_id_t values for the base and index, so in order to use a Z register in an address operand we need to make sure the DR_REG_Z* constants are < 256. While I was there I also added the Z and P registers and one system register to the dr_reg_fixer array as they were previously missing. The B, H, S, D, Q registers have been changed to map to the Z registers because they overlap with the lower 128 bits of the Z registers. Issues: #3044
DynamoRIO · Feb 23, 2023 · de22aaf · de22aaf
1 parent 218327a
commit de22aaf
Show file tree

Hide file tree

Showing 16 changed files with 850 additions and 87 deletions.
diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml
@@ -90,7 +90,7 @@ jobs:
       # We only use a non-zero build # when making multiple manual builds in one day.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=9.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=9.91.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi

diff --git a/.github/workflows/ci-package.yml b/.github/workflows/ci-package.yml
@@ -102,7 +102,7 @@ jobs:
       # We only use a non-zero build # when making multiple manual builds in one day.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=9.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=9.91.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -194,7 +194,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=9.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=9.91.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -282,7 +282,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=9.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=9.91.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -370,7 +370,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=9.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=9.91.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -450,7 +450,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=9.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=9.91.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -535,7 +535,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER="9.90.$((`git log -n 1 --format=%ct` / (60*60*24)))"
+          export VERSION_NUMBER="9.91.$((`git log -n 1 --format=%ct` / (60*60*24)))"
           export PREFIX="cronbuild-"
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -567,7 +567,7 @@ endif (EXISTS "${PROJECT_SOURCE_DIR}/.svn")
 
 # N.B.: When updating this, update all the default versions in ci-package.yml
 # and ci-docs.yml.  We should find a way to share (xref i#1565).
-set(VERSION_NUMBER_DEFAULT "9.90.${VERSION_NUMBER_PATCHLEVEL}")
+set(VERSION_NUMBER_DEFAULT "9.91.${VERSION_NUMBER_PATCHLEVEL}")
 # do not store the default VERSION_NUMBER in the cache to prevent a stale one
 # from preventing future version updates in a pre-existing build dir
 set(VERSION_NUMBER "" CACHE STRING "Version number: leave empty for default")

diff --git a/api/docs/release.dox b/api/docs/release.dox
@@ -142,6 +142,10 @@ changes:
  - Reduced the value of #DR_NOTE_FIRST_RESERVED.  This is not expected to cause
    problems unless clients are directly choosing high note values without using
    drmgr_reserve_note_range().
+ - Changed the values of the AArch64 DR_REG_Z* constants so that Z registers can be
+   used in base+disp operands in SVE scatter/gather instructions. This breaks binary
+   compatibility for clients built against an older version of opnd_api.h, but source
+   code compatibility is unchanged.
 
 Further non-compatibility-affecting changes include:
  - Added AArchXX support for attaching to a running process.
@@ -203,6 +207,9 @@ Further non-compatibility-affecting changes include:
  - Added opnd_create_increment_reg() to create a register from an existing
    register whose register number is incremented by some amount, wrapping
    at the max register number for that register.
+ - Added opnd_create_vector_base_disp_aarch64() and reg_is_z() for creating
+   memory address operands that use SVE Z registers with a specified element
+   size.
 
 **************************************************
 <hr>

diff --git a/core/arch/arch.c b/core/arch/arch.c
@@ -801,6 +801,10 @@ d_r_arch_init(void)
         }
 #endif
     }
+
+    /* Ensure addressing registers fit into base+disp operand base and index fields. */
+    IF_AARCHXX(ASSERT_BITFIELD_TRUNCATE(REG_SPECIFIER_BITS, DR_REG_MAX_ADDRESSING_REG));
+
     mangle_init();
 }
 

diff --git a/core/ir/aarch64/codec.c b/core/ir/aarch64/codec.c
@@ -5252,6 +5252,35 @@ encode_opnd_hs_fsz(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_ou
     return false;
 }
 
+/* z_sz_sd  # sve vector reg, element size depending on sz. */
+
+/* Encodes a Z register operand whose element size selects the sz bit (bit 22).
+ * Returns false if the operand is not a register, is not one of Z0..Z31, or has
+ * an element size other than OPSZ_4 or OPSZ_8.
+ */
+static inline bool
+encode_opnd_z_sz_sd(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out)
+{
+    IF_RETURN_FALSE(!opnd_is_reg(opnd))
+
+    /* The unsigned cast makes any register below DR_REG_Z0 wrap to a large value,
+     * so the single < 32 test below rejects both ends of the range.
+     */
+    const uint reg_number = (uint)(opnd_get_reg(opnd) - DR_REG_Z0);
+    IF_RETURN_FALSE(!(reg_number < 32))
+
+    /* sz: 0 for 4-byte elements, 1 for 8-byte elements. */
+    uint sz = 0;
+    switch (opnd_get_vector_element_size(opnd)) {
+    case OPSZ_4: sz = 0; break;
+    case OPSZ_8: sz = 1; break;
+    default: RETURN_FALSE;
+    }
+
+    /* sz goes in bit 22, the register number in the low bits starting at bit 0. */
+    *enc_out |= (sz << 22) | (reg_number << 0);
+
+    return true;
+}
+
+/* Decodes a Z register operand whose element size is chosen by bit 22 of enc:
+ * DOUBLE_REG when the bit is set, SINGLE_REG otherwise. The register range
+ * (DR_REG_Z0..DR_REG_Z31) and the field position/width arguments (0, 5) are
+ * handed to decode_single_sized(), whose result is returned unchanged.
+ */
+static inline bool
+decode_opnd_z_sz_sd(uint enc, int opcode, byte *pc, OUT opnd_t *opnd)
+{
+    const aarch64_reg_offset element_size = TEST(1u << 22, enc) ? DOUBLE_REG : SINGLE_REG;
+    return decode_single_sized(DR_REG_Z0, DR_REG_Z31, 0, 5, element_size, 0, enc, opnd);
+}
+
 /* dq5_sz: D/Q register at bit position 5; bit 22 selects Q reg */
 
 static inline bool
@@ -5627,6 +5656,85 @@ encode_opnd_wx_size_16_zr(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint
     return encode_wx_size_reg(false, 16, opnd, enc_out);
 }
 
+/* svemem_vec_vec_idx: SVE memory address [<Zn>.<T>, <Zm>.<T>{, <mod> <amount>}] */
+
+/* Maps the opc field of a vector+vector address form to the element size and
+ * index extend type it selects:
+ *   0b00 -> OPSZ_8, DR_EXTEND_SXTW
+ *   0b01 -> OPSZ_8, DR_EXTEND_UXTW
+ *   0b10 -> OPSZ_4, DR_EXTEND_UXTX
+ *   0b11 -> OPSZ_8, DR_EXTEND_UXTX
+ * Returns true and writes both outputs for those values; returns false, leaving
+ * the outputs untouched, for any other value of opc.
+ */
+static inline bool
+decode_svemem_vec_vec_opc(uint opc, OUT opnd_size_t *element_size,
+                          OUT dr_extend_type_t *extend_type)
+{
+    switch (opc) {
+    case 0b00:
+        *element_size = OPSZ_8;
+        *extend_type = DR_EXTEND_SXTW;
+        return true;
+    case 0b01:
+        *element_size = OPSZ_8;
+        *extend_type = DR_EXTEND_UXTW;
+        return true;
+    /* DR_EXTEND_UXTX is an alias for LSL. LSL preferred in disassembly. */
+    case 0b10:
+        *element_size = OPSZ_4;
+        *extend_type = DR_EXTEND_UXTX;
+        return true;
+    case 0b11:
+        *element_size = OPSZ_8;
+        *extend_type = DR_EXTEND_UXTX;
+        return true;
+    }
+    return false;
+}
+
+/* Decodes the vector+vector address operand [<Zn>.<T>, <Zm>.<T>{, <mod> <amount>}].
+ * Bits 23:22 of enc choose the element size and extend type, the 2-bit field at
+ * bit 10 is the shift amount, and the 5-bit fields at bits 5 and 16 are the base
+ * (Zn) and index (Zm) register numbers. Returns false only if the opc lookup
+ * fails.
+ */
+static inline bool
+decode_opnd_svemem_vec_vec_idx(uint enc, int opcode, byte *pc, OUT opnd_t *opnd)
+{
+    opnd_size_t element_size;
+    dr_extend_type_t extend_type;
+    if (!decode_svemem_vec_vec_opc(BITS(enc, 23, 22), &element_size, &extend_type))
+        return false;
+
+    /* Shift amount applied to the index register. */
+    const uint msz = extract_uint(enc, 10, 2);
+
+    const reg_id_t zn = DR_REG_Z0 + extract_uint(enc, 5, 5);
+    const reg_id_t zm = DR_REG_Z0 + extract_uint(enc, 16, 5);
+
+    /* This operand is used for SVE ADR instructions which don't transfer any memory.
+     * If this operand ends up being used for other instructions in the future we will
+     * need to calculate the appropriate transfer amount here.
+     */
+    ASSERT(opcode == OP_adr);
+    const opnd_size_t mem_transfer_size = OPSZ_0;
+
+    /* The operand is marked as scaled only when the shift amount is non-zero. */
+    *opnd = opnd_create_vector_base_disp_aarch64(zn, zm, element_size, extend_type,
+                                                 /*scaled=*/msz != 0,
+                                                 /*disp=*/0,
+                                                 /*flags=*/0, mem_transfer_size, msz);
+    return true;
+}
+
+/* Encodes the vector+vector address operand [<Zn>.<T>, <Zm>.<T>{, <mod> <amount>}].
+ *
+ * The element size and extend type are not written by this function: they are
+ * implied by bits 23:22 already present in enc, so the operand must agree with
+ * them. Only Zm (bits 20:16), the shift amount (bits 11:10) and Zn (bits 9:5)
+ * are OR-ed into *enc_out.
+ *
+ * Returns false if the operand is not a base+disp operand, if either register
+ * is outside Z0..Z31, if the element size or extend type does not match the
+ * instruction variant selected by enc, or if the shift amount does not fit in
+ * the 2-bit field.
+ */
+static inline bool
+encode_opnd_svemem_vec_vec_idx(uint enc, int opcode, byte *pc, opnd_t opnd,
+                               OUT uint *enc_out)
+{
+    if (!opnd_is_base_disp(opnd))
+        return false;
+
+    /* The unsigned cast makes registers below DR_REG_Z0 wrap to large values, so
+     * the < 32 tests below reject both ends of the range.
+     */
+    const uint zn = (uint)(opnd_get_base(opnd) - DR_REG_Z0);
+    const uint zm = (uint)(opnd_get_index(opnd) - DR_REG_Z0);
+
+    opnd_size_t element_size;
+    dr_extend_type_t extend_type;
+    uint msz;
+    /* msz is only read after opnd_get_index_extend() has written it: the range
+     * check is the last term of the short-circuit chain. The shift field is two
+     * bits wide, so a larger amount would spill into the fixed opcode bits above
+     * it and must be rejected rather than encoded.
+     */
+    if (!((zn < 32) && (zm < 32)) ||
+        !decode_svemem_vec_vec_opc(BITS(enc, 23, 22), &element_size, &extend_type) ||
+        element_size != opnd_get_vector_element_size(opnd) ||
+        extend_type != opnd_get_index_extend(opnd, NULL, &msz) || msz > 3)
+        return false;
+
+    *enc_out |= (zm << 16) | (msz << 10) | (zn << 5);
+
+    return true;
+}
+
 /* fpimm13: floating-point immediate for scalar fmov */
 
 static inline bool

diff --git a/core/ir/aarch64/codec_sve.txt b/core/ir/aarch64/codec_sve.txt
@@ -46,6 +46,9 @@
 00100101xx10000011xxxxxxxxxxxxxx  n   9    SVE      add  z_size_bhsd_0 : z_size_bhsd_0 imm8_5 lsl shift1
 00000100011xxxxx01010xxxxxxxxxxx  n   934  SVE    addpl           x0sp : x16sp simm6_5
 00000100001xxxxx01010xxxxxxxxxxx  n   935  SVE    addvl           x0sp : x16sp simm6_5
+00000100001xxxxx1010xxxxxxxxxxxx  n   15   SVE      adr          z_d_0 : svemem_vec_vec_idx
+00000100011xxxxx1010xxxxxxxxxxxx  n   15   SVE      adr          z_d_0 : svemem_vec_vec_idx
+000001001x1xxxxx1010xxxxxxxxxxxx  n   15   SVE      adr        z_sz_sd : svemem_vec_vec_idx
 00000100xx011010000xxxxxxxxxxxxx  n   21   SVE      and             z0 : p10_lo z0 z5 bhsd_sz
 00000101100000xxxxxxxxxxxxxxxxxx  n   21   SVE      and  z_imm13_bhsd_0 : z_imm13_bhsd_0 imm13_const
 001001010000xxxx01xxxx0xxxx0xxxx  n   21   SVE      and          p_b_0 : p10_zer p_b_5 p_b_16

diff --git a/core/ir/aarch64/encode.c b/core/ir/aarch64/encode.c
@@ -57,6 +57,11 @@ const char *const reg_names[] = {
     "w20", "w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28", "w29",
     "w30", "wsp", "wzr",
 
+    "z0",  "z1",  "z2",  "z3",  "z4",  "z5",  "z6",  "z7",  "z8",  "z9",
+    "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19",
+    "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29",
+    "z30", "z31",
+
     "q0",  "q1",  "q2",  "q3",  "q4",  "q5",  "q6",  "q7",  "q8",  "q9",
     "q10", "q11", "q12", "q13", "q14", "q15", "q16", "q17", "q18", "q19",
     "q20", "q21", "q22", "q23", "q24", "q25", "q26", "q27", "q28", "q29",
@@ -109,11 +114,6 @@ const char *const reg_names[] = {
     "pmevtyper28_el0", "pmevtyper29_el0", "pmevtyper30_el0", "pmccfiltr_el0",
     "spsr_irq", "spsr_abt", "spsr_und", "spsr_fiq", "tpidr_el0", "tpidrro_el0",
 
-    "z0",  "z1",  "z2",  "z3",  "z4",  "z5",  "z6",  "z7",  "z8",  "z9",
-    "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19",
-    "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29",
-    "z30", "z31",
-
     "p0",  "p1",  "p2",  "p3",  "p4",  "p5",  "p6",  "p7",  "p8",  "p9",
     "p10", "p11", "p12", "p13", "p14", "p15",
 
@@ -137,18 +137,19 @@ const reg_id_t dr_reg_fixer[] = { REG_NULL,
                                       XREGS /* W0-WSP */
 #undef XREGS
 
-#define QREGS                                                                            \
-    DR_REG_Q0, DR_REG_Q1, DR_REG_Q2, DR_REG_Q3, DR_REG_Q4, DR_REG_Q5, DR_REG_Q6,         \
-        DR_REG_Q7, DR_REG_Q8, DR_REG_Q9, DR_REG_Q10, DR_REG_Q11, DR_REG_Q12, DR_REG_Q13, \
-        DR_REG_Q14, DR_REG_Q15, DR_REG_Q16, DR_REG_Q17, DR_REG_Q18, DR_REG_Q19,          \
-        DR_REG_Q20, DR_REG_Q21, DR_REG_Q22, DR_REG_Q23, DR_REG_Q24, DR_REG_Q25,          \
-        DR_REG_Q26, DR_REG_Q27, DR_REG_Q28, DR_REG_Q29, DR_REG_Q30, DR_REG_Q31,
-                                          QREGS                 /* Q0-Q31*/
-                                              QREGS             /* D0-D31 */
-                                                  QREGS         /* S0-S31 */
-                                                      QREGS     /* H0-H31 */
-                                                          QREGS /* B0-B31 */
-#undef QREGS
+#define ZREGS                                                                            \
+    DR_REG_Z0, DR_REG_Z1, DR_REG_Z2, DR_REG_Z3, DR_REG_Z4, DR_REG_Z5, DR_REG_Z6,         \
+        DR_REG_Z7, DR_REG_Z8, DR_REG_Z9, DR_REG_Z10, DR_REG_Z11, DR_REG_Z12, DR_REG_Z13, \
+        DR_REG_Z14, DR_REG_Z15, DR_REG_Z16, DR_REG_Z17, DR_REG_Z18, DR_REG_Z19,          \
+        DR_REG_Z20, DR_REG_Z21, DR_REG_Z22, DR_REG_Z23, DR_REG_Z24, DR_REG_Z25,          \
+        DR_REG_Z26, DR_REG_Z27, DR_REG_Z28, DR_REG_Z29, DR_REG_Z30, DR_REG_Z31,
+                                          ZREGS                     /* Z0-Z31 */
+                                              ZREGS                 /* Q0-Q31*/
+                                                  ZREGS             /* D0-D31 */
+                                                      ZREGS         /* S0-S31 */
+                                                          ZREGS     /* H0-H31 */
+                                                              ZREGS /* B0-B31 */
+#undef ZREGS
 
     DR_REG_NZCV, DR_REG_FPCR, DR_REG_FPSR,
     DR_REG_MDCCSR_EL0, DR_REG_DBGDTR_EL0, DR_REG_DBGDTRRX_EL0, DR_REG_SP_EL0,
@@ -185,7 +186,13 @@ const reg_id_t dr_reg_fixer[] = { REG_NULL,
     DR_REG_PMEVTYPER26_EL0, DR_REG_PMEVTYPER27_EL0, DR_REG_PMEVTYPER28_EL0,
     DR_REG_PMEVTYPER29_EL0, DR_REG_PMEVTYPER30_EL0, DR_REG_PMCCFILTR_EL0,
     DR_REG_SPSR_IRQ, DR_REG_SPSR_ABT, DR_REG_SPSR_UND, DR_REG_SPSR_FIQ,
-    DR_REG_TPIDR_EL0, DR_REG_TPIDRRO_EL0
+    DR_REG_TPIDR_EL0, DR_REG_TPIDRRO_EL0,
+
+    DR_REG_P0, DR_REG_P1, DR_REG_P2, DR_REG_P3, DR_REG_P4, DR_REG_P5,
+    DR_REG_P6, DR_REG_P7, DR_REG_P8, DR_REG_P9, DR_REG_P10, DR_REG_P11,
+    DR_REG_P12, DR_REG_P13, DR_REG_P14, DR_REG_P15,
+
+    DR_REG_CNTVCT_EL0,
 };
 /* clang-format on */
 

diff --git a/core/ir/aarch64/instr.c b/core/ir/aarch64/instr.c
@@ -441,6 +441,12 @@ reg_is_fp(reg_id_t reg)
     return false;
 }
 
+/* Returns true if reg lies in the inclusive range DR_REG_Z0..DR_REG_Z31. */
+bool
+reg_is_z(reg_id_t reg)
+{
+    return DR_REG_Z0 <= reg && reg <= DR_REG_Z31;
+}
+
 bool
 instr_is_nop(instr_t *instr)
 {

diff --git a/core/ir/aarch64/instr_create_api.h b/core/ir/aarch64/instr_create_api.h
@@ -11217,6 +11217,28 @@
 #define INSTR_CREATE_prfw_sve_pred(dc, prfop, Pg, Rn) \
     instr_create_0dst_3src(dc, OP_prfw, prfop, Pg, Rn)
 
+/*
+ * Creates an ADR instruction.
+ *
+ * This macro is used to encode the forms:
+ * \verbatim
+ *    ADR     <Zd>.D, [<Zn>.D, <Zm>.D, SXTW <amount>]
+ *    ADR     <Zd>.D, [<Zn>.D, <Zm>.D, UXTW <amount>]
+ *    ADR     <Zd>.<Ts>, [<Zn>.<Ts>, <Zm>.<Ts>, <extend> <amount>]
+ * \endverbatim
+ * \param dc   The void * dcontext used to allocate memory for the #instr_t.
+ * \param Zd   The destination vector register, Z (Scalable).
+ * \param Zn   The first source vector base register with a register offset,
+ *             constructed with one of:
+ *             opnd_create_vector_base_disp_aarch64(Zn, Zm, OPSZ_8, DR_EXTEND_SXTW,
+ *                                                  0, 0, 0, OPSZ_0, shift_amount)
+ *             opnd_create_vector_base_disp_aarch64(Zn, Zm, OPSZ_8, DR_EXTEND_UXTW,
+ *                                                  0, 0, 0, OPSZ_0, shift_amount)
+ *             opnd_create_vector_base_disp_aarch64(Zn, Zm, elsz, DR_EXTEND_UXTX,
+ *                                                  0, 0, 0, OPSZ_0, shift_amount)
+ *             NOTE(review): the calls above pass 0 for the scaled argument, whereas
+ *             the decoder creates this operand with scaled set to
+ *             (shift_amount != 0); confirm which value a non-zero shift_amount
+ *             requires.
+ */
+#define INSTR_CREATE_adr_sve(dc, Zd, Zn) instr_create_1dst_1src(dc, OP_adr, Zd, Zn)
+
 /*
  * Creates a LD2B instruction.
  *

diff --git a/core/ir/aarch64/opnd_defs.txt b/core/ir/aarch64/opnd_defs.txt
@@ -242,6 +242,7 @@
                                              # elements, depending on bit 22 (sz)
 ---------x----------------------  sd_sz      # element width of FP vector reg for single or double
 ---------x----------------------  hs_fsz      # element width of FP vector reg for half or single
+---------x-----------------xxxxx  z_sz_sd    # SVE vector reg, elsz depending on sz
 ---------x------------xxxxx-----  dq5_sz     # as dqx, but depending on the sz bit rather than the Q bit
 ---------x------------xxxxx-----  wx_sz_5    # W/X register (or WZR/XZR) with size indicated in bit 22
 ---------x-xx-------------------  i3_index_19 # Index value from 22, 20:19
@@ -259,6 +260,7 @@
 --------??-??--------------xxxxx  z_tszl19_bhsd_0 # z element register mediated by the tszl and tszh fields
 --------??-??---------xxxxx-----  z_tszl19_bhsd_5 # z element register mediated by the tszl and tszh fields
 --------??-xxxxx----------------  wx_size_16_zr # GPR scalar register, register size, W or X depending on size bits
+--------??-xxxxx----xxxxxxx-----  svemem_vec_vec_idx # SVE memory address [<Zn>.<T>, <Zm>.<T>{, <mod> <amount>}]
 --------??-xxxxxxxx-------------  fpimm8_13  # floating-point immediate for scalar fmov
 --------xx----------------------  b_sz       # element width of a vector (8<<b_sz)
 --------xx----------------------  hs_sz      # element width of a vector (8<<hs_sz)