Skip to content

Commit

Permalink
i#5365: Add AArch64 SVE support to the core (part 2)
Browse files Browse the repository at this point in the history
This patch adds SVE support for signals in the core. It is the follow
on patch from the SVE core work part 1, in PR #5835 (f646a63) and
includes vector address computation for SVE scatter/gather, enabling
first-fault load handling.

Issue: #5365, #5036

Co-authored-by: Jack Gallagher <jack.gallagher@arm.com>
  • Loading branch information
AssadHashmi and jackgallagher-arm committed Mar 26, 2024
1 parent 781f15e commit 026b04e
Show file tree
Hide file tree
Showing 17 changed files with 1,138 additions and 102 deletions.
3 changes: 3 additions & 0 deletions core/arch/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ mixed_mode_enabled(void)
# define SCRATCH_REG4_OFFS R4_OFFSET
# define SCRATCH_REG5_OFFS R5_OFFSET
# define REG_OFFSET(reg) (R0_OFFSET + ((reg)-DR_REG_R0) * sizeof(reg_t))
# define Z_REG_OFFSET(reg) \
((MC_OFFS) + \
(offsetof(priv_mcontext_t, simd) + ((reg)-DR_REG_Z0) * sizeof(dr_simd_t)))
# define CALL_SCRATCH_REG DR_REG_R11
# define MC_IBL_REG r2
# define MC_RETVAL_REG r0
Expand Down
15 changes: 0 additions & 15 deletions core/ir/aarch64/codec.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,6 @@ encode_common(byte *pc, instr_t *i, decode_info_t *di);
#define BITS(_enc, bitmax, bitmin) \
((((uint32)(_enc)) >> (bitmin)) & (uint32)MASK((bitmax) - (bitmin) + 1))

#if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) && !defined(BUILD_TESTS)
# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes())
# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)
#else
/* SVE vector length for off-line decoder set using -vl option with drdisas,
* e.g.
* $ drdisas -vl 256 e58057a1 85865e6b
* e58057a1 str %z1 -> +0x05(%x29)[32byte]
* 85865e6b ldr +0x37(%x19)[32byte] -> %z11
* $
*/
# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8)
# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8)
#endif

#define RETURN_FALSE \
do { \
CLIENT_ASSERT(false, "Unexpected state in AArch64 codec"); \
Expand Down
124 changes: 122 additions & 2 deletions core/ir/aarch64/instr.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* **********************************************************
* Copyright (c) 2017-2023 Google, Inc. All rights reserved.
* Copyright (c) 2016 ARM Limited. All rights reserved.
* Copyright (c) 2016-2024 ARM Limited. All rights reserved.
* **********************************************************/

/*
Expand Down Expand Up @@ -37,6 +37,8 @@
#include "encode_api.h"
#include "opcode_names.h"

#include <stddef.h>

/* XXX i#6690: currently only A64 is supported for instruction encoding.
* We want to add support for A64 decoding and synthetic ISA encoding as well.
* XXX i#1684: move this function to core/ir/instr_shared.c once we can support
Expand Down Expand Up @@ -447,7 +449,7 @@ reg_is_gpr(reg_id_t reg)
bool
reg_is_simd(reg_id_t reg)
{
    /* On AArch64, SIMD covers the scalable Z vector registers as well as the
     * fixed-width Q/D/S/H/B views of the vector register file.
     */
    if (reg_is_z(reg))
        return true;
    return DR_REG_Q0 <= reg && reg <= DR_REG_B31;
}

bool
Expand Down Expand Up @@ -737,3 +739,121 @@ instr_invert_predicate(dr_pred_type_t pred)
default: CLIENT_ASSERT(false, "Incorrect predicate value"); return DR_PRED_NONE;
}
}

/* Applies the memory operand's index-register extend (UXTW/SXTW/UXTX/SXTX)
 * and optional scaling to index_val, returning the value to add to the base
 * when forming the effective address.
 */
ptr_int_t
compute_scaled_index_aarch64(opnd_t opnd, reg_t index_val)
{
    bool scaled = false;
    uint shift = 0;
    dr_extend_type_t extend = opnd_get_index_extend(opnd, &scaled, &shift);
    reg_t value = 0;
    switch (extend) {
    case DR_EXTEND_UXTW:
        /* Zero-extend the low 32 bits. */
        value = index_val & 0xffffffffull;
        break;
    case DR_EXTEND_SXTW:
        /* Sign-extend the low 32 bits into the full register width. */
        value = index_val & 0xffffffffull;
        if ((value >> 31u) != 0)
            value |= (~0ull) << 32u;
        break;
    case DR_EXTEND_UXTX:
    case DR_EXTEND_SXTX:
        /* Full-width index: no extension required. */
        value = index_val;
        break;
    default: CLIENT_ASSERT(false, "Unsupported extend type"); return 0;
    }
    if (!scaled)
        return (ptr_int_t)value;
    return (ptr_int_t)(value << shift);
}

/* Returns whether the element at index 'element' is marked active in the
 * predicate register value 'mask'. SVE predicate registers hold one bit per
 * byte of vector data, so each element's flag is the lowest bit of its
 * element_size_bytes-wide bit group.
 */
static bool
is_active_in_mask(size_t element, uint64 mask, size_t element_size_bytes)
{
    const uint64 bit = 1ull << (element * element_size_bytes);
    return (mask & bit) == bit;
}

/* Computes the memory address accessed by the addr_index-th *active* element
 * (0-based) of an SVE scatter/gather/predicated-contiguous memory operand.
 * On success sets *addr to the effective address and *have_addr to true;
 * *write is set to whether the instruction is a scatter (store).
 * Returns false when there is no further active element to report.
 */
bool
instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size,
                             dr_mcontext_flags_t mc_flags, opnd_t curop, uint addr_index,
                             DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr,
                             DR_PARAM_OUT bool *write)
{
    CLIENT_ASSERT(have_addr != NULL && addr != NULL && mc != NULL,
                  "SVE address computation: invalid args");
    CLIENT_ASSERT(TEST(DR_MC_MULTIMEDIA, mc_flags),
                  "dr_mcontext_t.flags must include DR_MC_MULTIMEDIA");
    /* The client's mcontext must be new enough to carry the SVE predicate
     * register state we read below.
     */
    CLIENT_ASSERT(mc_size >= offsetof(dr_mcontext_t, svep) + sizeof(mc->svep),
                  "Incompatible client, invalid dr_mcontext_t.size.");

    ASSERT(write != NULL);
    *write = instr_is_scatter(instr);
    ASSERT(*write || instr_is_gather(instr));

    const size_t vl_bytes = opnd_size_in_bytes(OPSZ_SVE_VL_BYTES);
    /* DynamoRIO currently supports up to 512-bit vector registers so a predicate
     * register value should be <= 64-bits.
     * If DynamoRIO is extended in the future to support larger vector lengths this
     * function will need to be updated to cope with larger predicate mask values.
     */
    /* i#5365 fix: use <=, not <. A 512-bit VL gives an 8-byte predicate which
     * exactly fills a uint64; the strict comparison wrongly rejected the
     * maximum supported vector length.
     */
    ASSERT(vl_bytes / 8 <= sizeof(uint64));

    /* The governing predicate is by convention source operand 1. */
    const reg_t governing_pred = opnd_get_reg(instr_get_src(instr, 1));
    ASSERT(governing_pred >= DR_REG_START_P && governing_pred <= DR_REG_STOP_P);
    uint64 mask = mc->svep[governing_pred - DR_REG_START_P].d;

    /* All elements inactive: the instruction accesses no memory at all. */
    if (mask == 0) {
        return false;
    }

    const size_t element_size_bytes =
        opnd_size_in_bytes(opnd_get_vector_element_size(curop));
    const size_t num_elements = vl_bytes / element_size_bytes;

    /* Walk the elements in order, counting only the active ones, until we
     * reach the addr_index-th active element.
     */
    size_t active_elements_found = 0;
    for (size_t element = 0; element < num_elements; element++) {
        if (is_active_in_mask(element, mask, element_size_bytes)) {
            active_elements_found++;
            if (active_elements_found == addr_index + 1) {
                const reg_t base_reg = opnd_get_base(curop);
                if (reg_is_z(base_reg)) {
                    /* Vector base (vector+immediate or vector+scalar form):
                     * the base address comes from this element's lane.
                     */
                    size_t base_reg_num = base_reg - DR_REG_START_Z;
                    if (element_size_bytes == 4) {
                        *addr = (app_pc)(reg_t)mc->simd[base_reg_num].u32[element];
                    } else {
                        ASSERT(element_size_bytes == 8);
                        *addr = (app_pc)mc->simd[base_reg_num].u64[element];
                    }
                } else {
                    /* Scalar base register. */
                    *addr = (app_pc)reg_get_value_priv(base_reg, mc);
                }

                const reg_t index_reg = opnd_get_index(curop);
                reg_t unscaled_index_val = 0;
                if (reg_is_z(index_reg)) {
                    /* Vector index, extract the current element */
                    size_t index_reg_num = index_reg - DR_REG_START_Z;
                    if (element_size_bytes == 4) {
                        unscaled_index_val = mc->simd[index_reg_num].u32[element];
                    } else {
                        ASSERT(element_size_bytes == 8);
                        unscaled_index_val = mc->simd[index_reg_num].u64[element];
                    }
                } else {
                    /* scalar index or no index */
                    unscaled_index_val = reg_get_value_priv(index_reg, mc);
                }

                *have_addr = true;
                /* Apply the extend/scale of the index plus any displacement. */
                *addr += compute_scaled_index_aarch64(curop, unscaled_index_val);
                *addr += opnd_get_disp(curop);

                /* True while further addresses may remain to be queried. */
                return addr_index < num_elements;
            }
        }
    }

    /* Fewer than addr_index + 1 active elements. */
    return false;
}
2 changes: 2 additions & 0 deletions core/ir/aarchxx/opnd.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ opnd_get_reg_dcontext_offs(reg_id_t reg)
return R0_OFFSET + (R1_OFFSET - R0_OFFSET) * (reg - DR_REG_W0);
if (reg == DR_REG_XSP || reg == DR_REG_WSP)
return XSP_OFFSET;
if (DR_REG_Z0 <= reg && reg <= DR_REG_Z31)
return Z_REG_OFFSET(reg);
CLIENT_ASSERT(false, "opnd_get_reg_dcontext_offs: invalid reg");
return -1;
#else
Expand Down
12 changes: 8 additions & 4 deletions core/ir/instr.h
Original file line number Diff line number Diff line change
Expand Up @@ -676,11 +676,15 @@ int
instr_length_arch(dcontext_t *dcontext, instr_t *instr);
bool
opc_is_not_a_real_memory_load(int opc);

#if defined(X86) || defined(AARCH64)
bool
instr_compute_address_VSIB(instr_t *instr, priv_mcontext_t *mc, size_t mc_size,
dr_mcontext_flags_t mc_flags, opnd_t curop, uint index,
DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr,
DR_PARAM_OUT bool *write);
instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size,
dr_mcontext_flags_t mc_flags, opnd_t curop, uint index,
DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr,
DR_PARAM_OUT bool *write);
#endif

uint
instr_branch_type(instr_t *cti_instr);
#ifdef AARCH64
Expand Down
31 changes: 17 additions & 14 deletions core/ir/instr_shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -2644,21 +2644,22 @@ instr_compute_address_helper(instr_t *instr, priv_mcontext_t *mc, size_t mc_size
for (i = 0; i < instr_num_dsts(instr); i++) {
curop = instr_get_dst(instr, i);
if (opnd_is_memory_reference(curop)) {
if (opnd_is_vsib(curop)) {
#ifdef X86
if (instr_compute_address_VSIB(instr, mc, mc_size, mc_flags, curop, index,
&have_addr, addr, &write)) {
CLIENT_ASSERT(
write,
"VSIB found in destination but instruction is not a scatter");
#if defined(X86) || defined(AARCH64)
if (opnd_is_vector_base_disp(curop)) {
if (instr_compute_vector_address(instr, mc, mc_size, mc_flags, curop,
index, &have_addr, addr, &write)) {
CLIENT_ASSERT(write,
"Vector address found in destination but instruction "
"is not a scatter");
break;
} else {
return false;
}
}
#else
CLIENT_ASSERT(false, "VSIB should be x86-only");
CLIENT_ASSERT(
false, "Vector address computation implemented for AArch64 and x86 only");
#endif
}
memcount++;
if (memcount == (int)index) {
write = true;
Expand All @@ -2672,15 +2673,17 @@ instr_compute_address_helper(instr_t *instr, priv_mcontext_t *mc, size_t mc_size
for (i = 0; i < instr_num_srcs(instr); i++) {
curop = instr_get_src(instr, i);
if (opnd_is_memory_reference(curop)) {
if (opnd_is_vsib(curop)) {
#ifdef X86
if (instr_compute_address_VSIB(instr, mc, mc_size, mc_flags, curop,
index, &have_addr, addr, &write))
#if defined(X86) || defined(AARCH64)
if (opnd_is_vector_base_disp(curop)) {
if (instr_compute_vector_address(instr, mc, mc_size, mc_flags, curop,
index, &have_addr, addr, &write))
break;
else
return false;
#else
CLIENT_ASSERT(false, "VSIB should be x86-only");
CLIENT_ASSERT(
false,
"Vector address computation implemented for AArch64 and x86 only");
#endif
}
memcount++;
Expand Down
24 changes: 24 additions & 0 deletions core/ir/opnd.h
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,12 @@ opnd_compute_address_helper(opnd_t opnd, priv_mcontext_t *mc, ptr_int_t scaled_i
bool
opnd_is_abs_base_disp(opnd_t opnd);

#if defined(AARCH64)
/* Internal function shared with vector address calculation */
ptr_int_t
compute_scaled_index_aarch64(opnd_t opnd, reg_t index_val);
#endif

#ifndef STANDALONE_DECODER
opnd_t
opnd_create_dcontext_field(dcontext_t *dcontext, int offs);
Expand Down Expand Up @@ -339,4 +345,22 @@ extern reg_id_t dr_reg_stolen;
extern reg_id_t dr_reg_stolen;
#endif

#ifdef AARCH64
#if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) && !defined(BUILD_TESTS)
# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes())
# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)
#else
/* SVE vector length for off-line decoder set using -vl option with drdisas,
* e.g.
* $ drdisas -vl 256 e58057a1 85865e6b
* e58057a1 str %z1 -> +0x05(%x29)[32byte]
* 85865e6b ldr +0x37(%x19)[32byte] -> %z11
* $
*/
# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8)
# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8)
#endif
#endif /*AARCH64*/


#endif /* _OPND_H_ */
15 changes: 14 additions & 1 deletion core/ir/opnd_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1119,6 +1119,10 @@ enum {
DR_REG_STOP_32 = DR_REG_WSP, /**< End of 32-bit general register enum values */
DR_REG_START_GPR = DR_REG_X0, /**< Start of full-size general-purpose registers */
DR_REG_STOP_GPR = DR_REG_XSP, /**< End of full-size general-purpose registers */
DR_REG_START_Z = DR_REG_Z0, /**< Start of Z scalable vector registers */
DR_REG_STOP_Z = DR_REG_Z31, /**< End of Z scalable vector registers */
DR_REG_START_P = DR_REG_P0, /**< Start of P scalable predicate registers */
DR_REG_STOP_P = DR_REG_P15, /**< End of P scalable predicate registers */
# else
DR_REG_START_32 = DR_REG_R0, /**< Start of 32-bit general register enum values */
DR_REG_STOP_32 = DR_REG_R15, /**< End of 32-bit general register enum values */
Expand All @@ -1128,7 +1132,8 @@ enum {

DR_NUM_GPR_REGS = DR_REG_STOP_GPR - DR_REG_START_GPR + 1, /**< Count of GPR regs. */
# ifdef AARCH64
DR_NUM_SIMD_VECTOR_REGS = DR_REG_Z31 - DR_REG_Z0 + 1, /**< Count of SIMD regs. */
DR_NUM_SIMD_VECTOR_REGS =
DR_REG_STOP_Z - DR_REG_START_Z + 1, /**< Count of SIMD regs. */
# else
/* XXX: maybe we want more distinct names that provide counts for 64-bit D or 32-bit
* S registers.
Expand Down Expand Up @@ -2604,6 +2609,14 @@ DR_API
bool
opnd_is_vsib(opnd_t opnd);

DR_API
/**
* Returns true iff \p opnd is a base+disp memory reference operand which uses vector
* registers.
*/
bool
opnd_is_vector_base_disp(opnd_t opnd);

DR_API
/**
* Returns true iff \p opnd is a (near or far) absolute address operand.
Expand Down
Loading

0 comments on commit 026b04e

Please sign in to comment.