From 026b04e8a1a67f5990c10c865bd9ce1c81a00e31 Mon Sep 17 00:00:00 2001 From: Assad Hashmi Date: Tue, 26 Mar 2024 11:36:34 +0000 Subject: [PATCH] i#5365: Add AArch64 SVE support to the core (part 2) This patch adds SVE support for signals in the core. It is the follow on patch from the SVE core work part 1, in PR #5835 (f646a632d) and includes vector address computation for SVE scatter/gather, enabling first-fault load handling. Issue: #5365, #5036 Co-authored-by: Jack Gallagher --- core/arch/arch.h | 3 + core/ir/aarch64/codec.h | 15 - core/ir/aarch64/instr.c | 124 +++++++- core/ir/aarchxx/opnd.c | 2 + core/ir/instr.h | 12 +- core/ir/instr_shared.c | 31 +- core/ir/opnd.h | 24 ++ core/ir/opnd_api.h | 15 +- core/ir/opnd_shared.c | 60 ++-- core/ir/x86/instr.c | 8 +- core/lib/globals_api.h | 15 +- core/unix/include/sigcontext.h | 167 ++++++++++- core/unix/signal_linux_aarch64.c | 199 ++++++++++++- suite/runsuite_wrapper.pl | 1 + suite/tests/api/opnd-a64.c | 470 ++++++++++++++++++++++++++++++- suite/tests/tools.c | 80 ++++++ suite/tests/tools.h | 14 + 17 files changed, 1138 insertions(+), 102 deletions(-) diff --git a/core/arch/arch.h b/core/arch/arch.h index 2de3d2915c9..35f2855a65e 100644 --- a/core/arch/arch.h +++ b/core/arch/arch.h @@ -156,6 +156,9 @@ mixed_mode_enabled(void) # define SCRATCH_REG4_OFFS R4_OFFSET # define SCRATCH_REG5_OFFS R5_OFFSET # define REG_OFFSET(reg) (R0_OFFSET + ((reg)-DR_REG_R0) * sizeof(reg_t)) +# define Z_REG_OFFSET(reg) \ + ((MC_OFFS) + \ + (offsetof(priv_mcontext_t, simd) + ((reg)-DR_REG_Z0) * sizeof(dr_simd_t))) # define CALL_SCRATCH_REG DR_REG_R11 # define MC_IBL_REG r2 # define MC_RETVAL_REG r0 diff --git a/core/ir/aarch64/codec.h b/core/ir/aarch64/codec.h index 81de59b069f..b5d47b86236 100644 --- a/core/ir/aarch64/codec.h +++ b/core/ir/aarch64/codec.h @@ -57,21 +57,6 @@ encode_common(byte *pc, instr_t *i, decode_info_t *di); #define BITS(_enc, bitmax, bitmin) \ ((((uint32)(_enc)) >> (bitmin)) & (uint32)MASK((bitmax) - (bitmin) + 1)) -#if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) && !defined(BUILD_TESTS) -# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes()) -# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes() / 8) -#else -/* SVE vector length for off-line decoder set using -vl option with drdisas, - * e.g. - * $ drdisas -vl 256 e58057a1 85865e6b - * e58057a1 str %z1 -> +0x05(%x29)[32byte] - * 85865e6b ldr +0x37(%x19)[32byte] -> %z11 - * $ - */ -# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8) -# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8) -#endif - #define RETURN_FALSE \ do { \ CLIENT_ASSERT(false, "Unexpected state in AArch64 codec"); \ diff --git a/core/ir/aarch64/instr.c b/core/ir/aarch64/instr.c index 63a0ec57f2c..21393f04c2e 100644 --- a/core/ir/aarch64/instr.c +++ b/core/ir/aarch64/instr.c @@ -1,6 +1,6 @@ /* ********************************************************** * Copyright (c) 2017-2023 Google, Inc. All rights reserved. - * Copyright (c) 2016 ARM Limited. All rights reserved. + * Copyright (c) 2016-2024 ARM Limited. All rights reserved. * **********************************************************/ /* @@ -37,6 +37,8 @@ #include "encode_api.h" #include "opcode_names.h" +#include + /* XXX i#6690: currently only A64 is supported for instruction encoding. * We want to add support for A64 decoding and synthetic ISA encoding as well. 
* XXX i#1684: move this function to core/ir/instr_shared.c once we can support @@ -447,7 +449,7 @@ reg_is_gpr(reg_id_t reg) bool reg_is_simd(reg_id_t reg) { - return (DR_REG_Q0 <= reg && reg <= DR_REG_B31); + return reg_is_z(reg) || (DR_REG_Q0 <= reg && reg <= DR_REG_B31); } bool @@ -737,3 +739,121 @@ instr_invert_predicate(dr_pred_type_t pred) default: CLIENT_ASSERT(false, "Incorrect predicate value"); return DR_PRED_NONE; } } + +ptr_int_t +compute_scaled_index_aarch64(opnd_t opnd, reg_t index_val) +{ + bool scaled = false; + uint amount = 0; + dr_extend_type_t type = opnd_get_index_extend(opnd, &scaled, &amount); + reg_t extended = 0; + uint msb = 0; + switch (type) { + default: CLIENT_ASSERT(false, "Unsupported extend type"); return 0; + case DR_EXTEND_UXTW: extended = (index_val << (63u - 31u)) >> (63u - 31u); break; + case DR_EXTEND_SXTW: + extended = (index_val << (63u - 31u)) >> (63u - 31u); + msb = extended >> 31u; + if (msb == 1) { + extended = ((~0ull) << 32u) | extended; + } + break; + case DR_EXTEND_UXTX: + case DR_EXTEND_SXTX: extended = index_val; break; + } + if (scaled) { + return extended << amount; + } else { + return extended; + } +} + +static bool +is_active_in_mask(size_t element, uint64 mask, size_t element_size_bytes) +{ + const uint64 element_flag = 1ull << (element_size_bytes * element); + return TESTALL(element_flag, mask); +} + +bool +instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, + dr_mcontext_flags_t mc_flags, opnd_t curop, uint addr_index, + DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, + DR_PARAM_OUT bool *write) +{ + CLIENT_ASSERT(have_addr != NULL && addr != NULL && mc != NULL, + "SVE address computation: invalid args"); + CLIENT_ASSERT(TEST(DR_MC_MULTIMEDIA, mc_flags), + "dr_mcontext_t.flags must include DR_MC_MULTIMEDIA"); + CLIENT_ASSERT(mc_size >= offsetof(dr_mcontext_t, svep) + sizeof(mc->svep), + "Incompatible client, invalid dr_mcontext_t.size."); + + ASSERT(write != NULL); + *write = instr_is_scatter(instr); + ASSERT(*write || instr_is_gather(instr)); + + const size_t vl_bytes = opnd_size_in_bytes(OPSZ_SVE_VL_BYTES); + /* DynamoRIO currently supports up to 512-bit vector registers so a predicate register + * value should be <= 64-bits. + * If DynamoRIO is extended in the future to support large vector lengths this + * function will need to be updated to cope with larger predicate mask values. 
+ */ + ASSERT(vl_bytes / 8 < sizeof(uint64)); + + const reg_t governing_pred = opnd_get_reg(instr_get_src(instr, 1)); + ASSERT(governing_pred >= DR_REG_START_P && governing_pred <= DR_REG_STOP_P); + uint64 mask = mc->svep[governing_pred - DR_REG_START_P].d; + + if (mask == 0) { + return false; + } + + const size_t element_size_bytes = + opnd_size_in_bytes(opnd_get_vector_element_size(curop)); + const size_t num_elements = vl_bytes / element_size_bytes; + + size_t active_elements_found = 0; + for (size_t element = 0; element < num_elements; element++) { + if (is_active_in_mask(element, mask, element_size_bytes)) { + active_elements_found++; + if (active_elements_found == addr_index + 1) { + const reg_t base_reg = opnd_get_base(curop); + if (reg_is_z(base_reg)) { + size_t base_reg_num = base_reg - DR_REG_START_Z; + if (element_size_bytes == 4) { + *addr = (app_pc)(reg_t)mc->simd[base_reg_num].u32[element]; + } else { + ASSERT(element_size_bytes == 8); + *addr = (app_pc)mc->simd[base_reg_num].u64[element]; + } + } else { + *addr = (app_pc)reg_get_value_priv(base_reg, mc); + } + + const reg_t index_reg = opnd_get_index(curop); + reg_t unscaled_index_val = 0; + if (reg_is_z(index_reg)) { + /* Vector index, extract the current element */ + size_t index_reg_num = index_reg - DR_REG_START_Z; + if (element_size_bytes == 4) { + unscaled_index_val = mc->simd[index_reg_num].u32[element]; + } else { + ASSERT(element_size_bytes == 8); + unscaled_index_val = mc->simd[index_reg_num].u64[element]; + } + } else { + /* scalar index or no index */ + unscaled_index_val = reg_get_value_priv(index_reg, mc); + } + + *have_addr = true; + *addr += compute_scaled_index_aarch64(curop, unscaled_index_val); + *addr += opnd_get_disp(curop); + + return addr_index < num_elements; + } + } + } + + return false; +} diff --git a/core/ir/aarchxx/opnd.c b/core/ir/aarchxx/opnd.c index d4d121a30b8..934779f6b36 100644 --- a/core/ir/aarchxx/opnd.c +++ b/core/ir/aarchxx/opnd.c @@ -63,6 +63,8 @@ opnd_get_reg_dcontext_offs(reg_id_t reg) return R0_OFFSET + (R1_OFFSET - R0_OFFSET) * (reg - DR_REG_W0); if (reg == DR_REG_XSP || reg == DR_REG_WSP) return XSP_OFFSET; + if (DR_REG_Z0 <= reg && reg <= DR_REG_Z31) + return Z_REG_OFFSET(reg); CLIENT_ASSERT(false, "opnd_get_reg_dcontext_offs: invalid reg"); return -1; #else diff --git a/core/ir/instr.h b/core/ir/instr.h index 76dc3a02c82..b56c9b51eea 100644 --- a/core/ir/instr.h +++ b/core/ir/instr.h @@ -676,11 +676,15 @@ int instr_length_arch(dcontext_t *dcontext, instr_t *instr); bool opc_is_not_a_real_memory_load(int opc); + +#if defined(X86) || defined(AARCH64) bool -instr_compute_address_VSIB(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, - dr_mcontext_flags_t mc_flags, opnd_t curop, uint index, - DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, - DR_PARAM_OUT bool *write); +instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, + dr_mcontext_flags_t mc_flags, opnd_t curop, uint index, + DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, + DR_PARAM_OUT bool *write); +#endif + uint instr_branch_type(instr_t *cti_instr); #ifdef AARCH64 diff --git a/core/ir/instr_shared.c b/core/ir/instr_shared.c index a3943249feb..b4e6db8f2ae 100644 --- a/core/ir/instr_shared.c +++ b/core/ir/instr_shared.c @@ -2644,21 +2644,22 @@ instr_compute_address_helper(instr_t *instr, priv_mcontext_t *mc, size_t mc_size for (i = 0; i < instr_num_dsts(instr); i++) { curop = instr_get_dst(instr, i); if (opnd_is_memory_reference(curop)) { - if (opnd_is_vsib(curop)) { -#ifdef 
X86 - if (instr_compute_address_VSIB(instr, mc, mc_size, mc_flags, curop, index, - &have_addr, addr, &write)) { - CLIENT_ASSERT( - write, - "VSIB found in destination but instruction is not a scatter"); +#if defined(X86) || defined(AARCH64) + if (opnd_is_vector_base_disp(curop)) { + if (instr_compute_vector_address(instr, mc, mc_size, mc_flags, curop, + index, &have_addr, addr, &write)) { + CLIENT_ASSERT(write, + "Vector address found in destination but instruction " + "is not a scatter"); break; } else { return false; } + } #else - CLIENT_ASSERT(false, "VSIB should be x86-only"); + CLIENT_ASSERT( + false, "Vector address computation implemented for AArch64 and x86 only"); #endif - } memcount++; if (memcount == (int)index) { write = true; @@ -2672,15 +2673,17 @@ instr_compute_address_helper(instr_t *instr, priv_mcontext_t *mc, size_t mc_size for (i = 0; i < instr_num_srcs(instr); i++) { curop = instr_get_src(instr, i); if (opnd_is_memory_reference(curop)) { - if (opnd_is_vsib(curop)) { -#ifdef X86 - if (instr_compute_address_VSIB(instr, mc, mc_size, mc_flags, curop, - index, &have_addr, addr, &write)) +#if defined(X86) || defined(AARCH64) + if (opnd_is_vector_base_disp(curop)) { + if (instr_compute_vector_address(instr, mc, mc_size, mc_flags, curop, + index, &have_addr, addr, &write)) break; else return false; #else - CLIENT_ASSERT(false, "VSIB should be x86-only"); + CLIENT_ASSERT( + false, + "Vector address computation implemented for AArch64 and x86 only"); #endif } memcount++; diff --git a/core/ir/opnd.h b/core/ir/opnd.h index 4d90ac18369..3a4ed544657 100644 --- a/core/ir/opnd.h +++ b/core/ir/opnd.h @@ -196,6 +196,12 @@ opnd_compute_address_helper(opnd_t opnd, priv_mcontext_t *mc, ptr_int_t scaled_i bool opnd_is_abs_base_disp(opnd_t opnd); +#if defined(AARCH64) +/* Internal function shared with vector address calculation */ +ptr_int_t +compute_scaled_index_aarch64(opnd_t opnd, reg_t index_val); +#endif + #ifndef STANDALONE_DECODER opnd_t opnd_create_dcontext_field(dcontext_t *dcontext, int offs); @@ -339,4 +345,22 @@ extern reg_id_t dr_reg_stolen; extern reg_id_t dr_reg_stolen; #endif +#ifdef AARCH64 +#if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) && !defined(BUILD_TESTS) +# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes()) +# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes() / 8) +#else +/* SVE vector length for off-line decoder set using -vl option with drdisas, + * e.g. 
+ * $ drdisas -vl 256 e58057a1 85865e6b
+ *    e58057a1   str    %z1 -> +0x05(%x29)[32byte]
+ *    85865e6b   ldr    +0x37(%x19)[32byte] -> %z11
+ * $
+ */
+# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8)
+# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8)
+#endif
+#endif /*AARCH64*/
+
+
 #endif /* _OPND_H_ */
diff --git a/core/ir/opnd_api.h b/core/ir/opnd_api.h
index b62a5807440..69175aa06ec 100644
--- a/core/ir/opnd_api.h
+++ b/core/ir/opnd_api.h
@@ -1119,6 +1119,10 @@ enum {
     DR_REG_STOP_32 = DR_REG_WSP, /**< End of 32-bit general register enum values */
     DR_REG_START_GPR = DR_REG_X0, /**< Start of full-size general-purpose registers */
     DR_REG_STOP_GPR = DR_REG_XSP, /**< End of full-size general-purpose registers */
+    DR_REG_START_Z = DR_REG_Z0, /**< Start of Z scalable vector registers */
+    DR_REG_STOP_Z = DR_REG_Z31, /**< End of Z scalable vector registers */
+    DR_REG_START_P = DR_REG_P0, /**< Start of P scalable predicate registers */
+    DR_REG_STOP_P = DR_REG_P15, /**< End of P scalable predicate registers */
 # else
     DR_REG_START_32 = DR_REG_R0, /**< Start of 32-bit general register enum values */
     DR_REG_STOP_32 = DR_REG_R15, /**< End of 32-bit general register enum values */
@@ -1128,7 +1132,8 @@ enum {
     DR_NUM_GPR_REGS = DR_REG_STOP_GPR - DR_REG_START_GPR + 1, /**< Count of GPR regs. */
 # ifdef AARCH64
-    DR_NUM_SIMD_VECTOR_REGS = DR_REG_Z31 - DR_REG_Z0 + 1, /**< Count of SIMD regs. */
+    DR_NUM_SIMD_VECTOR_REGS =
+        DR_REG_STOP_Z - DR_REG_START_Z + 1, /**< Count of SIMD regs. */
 # else
     /* XXX: maybe we want more distinct names that provide counts for 64-bit D or 32-bit
      * S registers.
@@ -2604,6 +2609,14 @@ DR_API
 bool
 opnd_is_vsib(opnd_t opnd);
 
+DR_API
+/**
+ * Returns true iff \p opnd is a base+disp memory reference operand which uses vector
+ * registers.
+ */
+bool
+opnd_is_vector_base_disp(opnd_t opnd);
+
 DR_API
 /**
  * Returns true iff \p opnd is a (near or far) absolute address operand.
diff --git a/core/ir/opnd_shared.c b/core/ir/opnd_shared.c index f78d56a8d6e..a7f00cdb300 100644 --- a/core/ir/opnd_shared.c +++ b/core/ir/opnd_shared.c @@ -183,6 +183,13 @@ opnd_is_vsib(opnd_t op) reg_is_strictly_zmm(opnd_get_index(op)))); } +bool +opnd_is_vector_base_disp(opnd_t op) +{ + return opnd_is_base_disp(op) && + (reg_is_simd(opnd_get_base(op)) || reg_is_simd(opnd_get_index(op))); +} + bool opnd_is_reg_32bit(opnd_t opnd) { @@ -232,7 +239,12 @@ bool reg_is_pointer_sized(reg_id_t reg) { #ifdef X64 +# ifdef AARCH64 + return (reg >= DR_REG_Z0 && reg <= DR_REG_Z31) || + (reg >= REG_START_64 && reg <= REG_STOP_64); +# else return (reg >= REG_START_64 && reg <= REG_STOP_64); +# endif #else return (reg >= REG_START_32 && reg <= REG_STOP_32); #endif @@ -2211,6 +2223,21 @@ reg_get_value_ex(reg_id_t reg, dr_mcontext_t *mc, DR_PARAM_OUT byte *val) reg_t regval = reg_get_value(reg, mc); *(reg_t *)val = regval; } +#elif defined(AARCH64) + if (reg >= DR_REG_START_Z && reg <= DR_REG_STOP_Z) { + if (!TEST(DR_MC_MULTIMEDIA, mc->flags) || mc->size != sizeof(dr_mcontext_t)) + return false; + memcpy(val, &mc->simd[reg - DR_REG_START_Z], + opnd_size_in_bytes(reg_get_size(reg))); + } else if (reg >= DR_REG_START_P && reg <= DR_REG_STOP_P) { + if (!TEST(DR_MC_MULTIMEDIA, mc->flags) || mc->size != sizeof(dr_mcontext_t)) + return false; + memcpy(val, &mc->svep[reg - DR_REG_START_P], + opnd_size_in_bytes(reg_get_size(reg))); + } else { + reg_t regval = reg_get_value(reg, mc); + *(reg_t *)val = regval; + } #else CLIENT_ASSERT(false, "NYI i#1551"); #endif @@ -2334,30 +2361,7 @@ opnd_compute_address_priv(opnd_t opnd, priv_mcontext_t *mc) ptr_int_t scale = opnd_get_scale(opnd); scaled_index = scale * reg_get_value_priv(index, mc); #elif defined(AARCH64) - bool scaled = false; - uint amount = 0; - dr_extend_type_t type = opnd_get_index_extend(opnd, &scaled, &amount); - reg_t index_val = reg_get_value_priv(index, mc); - reg_t extended = 0; - uint msb = 0; - switch (type) { - default: CLIENT_ASSERT(false, "Unsupported extend type"); return NULL; - case DR_EXTEND_UXTW: extended = (index_val << (63u - 31u)) >> (63u - 31u); break; - case DR_EXTEND_SXTW: - extended = (index_val << (63u - 31u)) >> (63u - 31u); - msb = extended >> 31u; - if (msb == 1) { - extended = ((~0ull) << 32u) | extended; - } - break; - case DR_EXTEND_UXTX: - case DR_EXTEND_SXTX: extended = index_val; break; - } - if (scaled) { - scaled_index = extended << amount; - } else { - scaled_index = extended; - } + scaled_index = compute_scaled_index_aarch64(opnd, reg_get_value_priv(index, mc)); #elif defined(ARM) uint amount; dr_shift_type_t type = opnd_get_index_shift(opnd, &amount); @@ -2758,14 +2762,10 @@ reg_get_size(reg_id_t reg) if (reg >= DR_REG_MDCCSR_EL0 && reg <= DR_REG_SPSR_FIQ) return OPSZ_8; if (reg >= DR_REG_Z0 && reg <= DR_REG_Z31) { -# if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) - return opnd_size_from_bytes(proc_get_vector_length_bytes()); -# else - return OPSZ_SCALABLE; -# endif + return OPSZ_SVE_VL_BYTES; } if ((reg >= DR_REG_P0 && reg <= DR_REG_P15) || reg == DR_REG_FFR) - return OPSZ_SCALABLE_PRED; + return OPSZ_SVE_PL_BYTES; if (reg == DR_REG_CNTVCT_EL0) return OPSZ_8; if (reg >= DR_REG_NZCV && reg <= DR_REG_FPSR) diff --git a/core/ir/x86/instr.c b/core/ir/x86/instr.c index b9cf49eea69..31438fa047d 100644 --- a/core/ir/x86/instr.c +++ b/core/ir/x86/instr.c @@ -410,10 +410,10 @@ instr_compute_VSIB_index(bool *selected DR_PARAM_OUT, app_pc *result DR_PARAM_OU } bool -instr_compute_address_VSIB(instr_t *instr, 
priv_mcontext_t *mc, size_t mc_size, - dr_mcontext_flags_t mc_flags, opnd_t curop, uint index, - DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, - DR_PARAM_OUT bool *write) +instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, + dr_mcontext_flags_t mc_flags, opnd_t curop, uint index, + DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, + DR_PARAM_OUT bool *write) { /* We assume that any instr w/ a VSIB opnd has no other * memory reference (and the VSIB is a source)! Else we'll diff --git a/core/lib/globals_api.h b/core/lib/globals_api.h index 5988379cb35..659e32d0a2b 100644 --- a/core/lib/globals_api.h +++ b/core/lib/globals_api.h @@ -701,12 +701,15 @@ typedef uint64 dr_opmask_t; */ # ifdef X64 typedef union ALIGN_VAR(16) _dr_simd_t { - byte b; /**< Byte (8 bit, Bn) scalar element of Vn, Zn, or Pn. */ - ushort h; /**< Halfword (16 bit, Hn) scalar element of Vn, Zn and Pn. */ - uint s; /**< Singleword (32 bit, Sn) scalar element of Vn, Zn and Pn. */ - uint64 d; /**< Doubleword (64 bit, Dn) scalar element of Vn, Zn and Pn. */ - uint q[4]; /**< The full 128 bit Vn register, Qn as q[3]:q[2]:q[1]:q[0]. */ - uint u32[16]; /**< The full 512 bit Zn, Pn and FFR registers. */ + byte b; /**< Byte (8 bit, Bn) scalar element of Vn, Zn, or Pn. */ + ushort h; /**< Halfword (16 bit, Hn) scalar element of Vn, Zn and Pn. */ + uint s; /**< Singleword (32 bit, Sn) scalar element of Vn, Zn and Pn. */ + uint64 d; /**< Doubleword (64 bit, Dn) scalar element of Vn, Zn and Pn. */ + uint q[4]; /**< The full 128 bit Vn register, Qn as q[3]:q[2]:q[1]:q[0]. */ + uint u32[16]; /**< The full 512 bit Zn, Pn and FFR registers as Singleword (32-bit) + elements. */ + uint64 u64[8]; /**< The full 512 bit Zn, Pn and FFR registers as Doubleword (64-bit) + elements. */ } dr_simd_t; # else typedef union _dr_simd_t { diff --git a/core/unix/include/sigcontext.h b/core/unix/include/sigcontext.h index b4acbbdcfb0..084fb880bb3 100644 --- a/core/unix/include/sigcontext.h +++ b/core/unix/include/sigcontext.h @@ -332,6 +332,25 @@ typedef struct _kernel_sigcontext_t { unsigned char __reserved[4096] __attribute__((__aligned__(16))); } kernel_sigcontext_t; +/* + * Allocation of 4k bytes of __reserved[]: + * (Note: records do not necessarily occur in the order shown here.) + * + * size description + * + * 528 fpsimd_context + * 16 esr_context (not used in DynamoRIO) + * 16 sve_context + * 32 extra_context + * 16 terminator (null _aarch64_ctx) + * + * 3488 (reserved for future allocation) + * + * The above table documents the maximum set and sizes of records that can be + * generated for userspace. New records which exceed this space will need to + * implement a mechanism to handle expanded signal frames. + */ + /* XXX: These defines come from the system include files for a regular * build (signal.h is included), but for DR_HOST_NOT_TARGET we need * them defined here. Probably what we should do is rename them so @@ -341,8 +360,8 @@ typedef struct _kernel_sigcontext_t { /* * Header to be used at the beginning of structures extending the user * context. Such structures must be placed after the rt_sigframe on the stack - * and be 16-byte aligned. The last structure must be a dummy one with the - * magic and size set to 0. + * and be 16-byte aligned. The last structure must be a null terminator context + * with the magic and size set to 0. 
*/ struct _aarch64_ctx { __u32 magic; @@ -352,26 +371,152 @@ struct _aarch64_ctx { # define FPSIMD_MAGIC 0x46508001 struct fpsimd_context { - struct _aarch64_ctx head; - __u32 fpsr; - __u32 fpcr; - __uint128_t vregs[32]; + struct _aarch64_ctx head; /* 8 bytes */ + __u32 fpsr; /* 4 */ + __u32 fpcr; /* 4 */ + __uint128_t vregs[32]; /* 512 */ }; -/* TODO i#5365: Storage of sve_context in kernel_sigcontext_t.__reserved, see - * above. See also sigcontext_to_mcontext_simd() and - * mcontext_to_sigcontext_simd(). +/* Storage of sve_context in kernel_sigcontext_t.__reserved, see above. See + * also sigcontext_to_mcontext_simd() and mcontext_to_sigcontext_simd(). */ # define SVE_MAGIC 0x53564501 struct sve_context { + struct _aarch64_ctx head; /* 8 bytes */ + __u16 vl; /* 2 */ + __u16 __reserved[3]; /* 6 */ +}; + +/* + * extra_context: describes extra space in the signal frame for + * additional structures that don't fit in sigcontext.__reserved[]. + * + * Note: + * + * 1) fpsimd_context, esr_context and extra_context must be placed in + * sigcontext.__reserved[] if present. They cannot be placed in the + * extra space. Any other record can be placed either in the extra + * space or in sigcontext.__reserved[], unless otherwise specified in + * this file. + * + * 2) There must not be more than one extra_context. + * + * 3) If extra_context is present, it must be followed immediately in + * sigcontext.__reserved[] by the terminating null _aarch64_ctx. + * + * 4) The extra space to which datap points must start at the first + * 16-byte aligned address immediately after the terminating null + * _aarch64_ctx that follows the extra_context structure in + * __reserved[]. The extra space may overrun the end of __reserved[], + * as indicated by a sufficiently large value for the size field. + * + * 5) The extra space must itself be terminated with a null + * _aarch64_ctx. + */ +# define EXTRA_MAGIC 0x45585401 + +struct extra_context { + struct _aarch64_ctx head; /* 8 bytes */ + __u64 datap; /* 8 bytes. 16-byte aligned pointer to extra space cast to __u64 */ + __u32 size; /* 4 bytes. size in bytes of the extra space */ + __u32 __reserved[3]; /* 12 bytes */ +}; + +# define ESR_MAGIC 0x45535201 + +struct esr_context { struct _aarch64_ctx head; - __u16 vl; - __u16 __reserved[3]; + __u64 esr; }; + # endif +/* SVE helper macros. 
*/ +#define __SVE_VQ_BYTES 16 /* number of bytes per quadword */ + +#define __SVE_VQ_MIN 1 +#define __SVE_VQ_MAX 512 + +#define __SVE_VL_MIN (__SVE_VQ_MIN * __SVE_VQ_BYTES) +#define __SVE_VL_MAX (__SVE_VQ_MAX * __SVE_VQ_BYTES) + +#define __SVE_NUM_ZREGS 32 +#define __SVE_NUM_PREGS 16 + +#define __sve_vl_valid(vl) \ + ((vl) % __SVE_VQ_BYTES == 0 && \ + (vl) >= __SVE_VL_MIN && \ + (vl) <= __SVE_VL_MAX) + +#define __sve_vq_from_vl(vl) ((vl) / __SVE_VQ_BYTES) +#define __sve_vl_from_vq(vq) ((vq) * __SVE_VQ_BYTES) + +#define __SVE_ZREG_SIZE(vq) ((__u32)(vq) * __SVE_VQ_BYTES) +#define __SVE_PREG_SIZE(vq) ((__u32)(vq) * (__SVE_VQ_BYTES / 8)) +#define __SVE_FFR_SIZE(vq) __SVE_PREG_SIZE(vq) + +#define __SVE_ZREGS_OFFSET 0 +#define __SVE_ZREG_OFFSET(vq, n) \ + (__SVE_ZREGS_OFFSET + __SVE_ZREG_SIZE(vq) * (n)) +#define __SVE_ZREGS_SIZE(vq) \ + (__SVE_ZREG_OFFSET(vq, __SVE_NUM_ZREGS) - __SVE_ZREGS_OFFSET) + +#define __SVE_PREGS_OFFSET(vq) \ + (__SVE_ZREGS_OFFSET + __SVE_ZREGS_SIZE(vq)) +#define __SVE_PREG_OFFSET(vq, n) \ + (__SVE_PREGS_OFFSET(vq) + __SVE_PREG_SIZE(vq) * (n)) +#define __SVE_PREGS_SIZE(vq) \ + (__SVE_PREG_OFFSET(vq, __SVE_NUM_PREGS) - __SVE_PREGS_OFFSET(vq)) + +#define __SVE_FFR_OFFSET(vq) \ + (__SVE_PREGS_OFFSET(vq) + __SVE_PREGS_SIZE(vq)) + +#define SVE_VQ_BYTES __SVE_VQ_BYTES /* bytes per quadword */ + +#define SVE_VQ_MIN __SVE_VQ_MIN +#define SVE_VQ_MAX __SVE_VQ_MAX + +#define SVE_VL_MIN __SVE_VL_MIN +#define SVE_VL_MAX __SVE_VL_MAX + +#define SVE_NUM_ZREGS __SVE_NUM_ZREGS +#define SVE_NUM_PREGS __SVE_NUM_PREGS + +#define sve_vl_valid(vl) __sve_vl_valid(vl) +#define sve_vq_from_vl(vl) __sve_vq_from_vl(vl) +#define sve_vl_from_vq(vq) __sve_vl_from_vq(vq) + +#define SVE_SIG_ZREG_SIZE(vq) __SVE_ZREG_SIZE(vq) +#define SVE_SIG_PREG_SIZE(vq) __SVE_PREG_SIZE(vq) +#define SVE_SIG_FFR_SIZE(vq) __SVE_FFR_SIZE(vq) + +#define SVE_SIG_REGS_OFFSET \ + ((sizeof(struct sve_context) + (__SVE_VQ_BYTES - 1)) \ + / __SVE_VQ_BYTES * __SVE_VQ_BYTES) + +#define SVE_SIG_ZREGS_OFFSET \ + (SVE_SIG_REGS_OFFSET + __SVE_ZREGS_OFFSET) +#define SVE_SIG_ZREG_OFFSET(vq, n) \ + (SVE_SIG_REGS_OFFSET + __SVE_ZREG_OFFSET(vq, n)) +#define SVE_SIG_ZREGS_SIZE(vq) __SVE_ZREGS_SIZE(vq) + +#define SVE_SIG_PREGS_OFFSET(vq) \ + (SVE_SIG_REGS_OFFSET + __SVE_PREGS_OFFSET(vq)) +#define SVE_SIG_PREG_OFFSET(vq, n) \ + (SVE_SIG_REGS_OFFSET + __SVE_PREG_OFFSET(vq, n)) +#define SVE_SIG_PREGS_SIZE(vq) __SVE_PREGS_SIZE(vq) + +#define SVE_SIG_FFR_OFFSET(vq) \ + (SVE_SIG_REGS_OFFSET + __SVE_FFR_OFFSET(vq)) + +#define SVE_SIG_REGS_SIZE(vq) \ + (__SVE_FFR_OFFSET(vq) + __SVE_FFR_SIZE(vq)) + +#define SVE_SIG_CONTEXT_SIZE(vq) \ + (SVE_SIG_REGS_OFFSET + SVE_SIG_REGS_SIZE(vq)) + #endif /* AARCH64 */ #ifdef RISCV64 diff --git a/core/unix/signal_linux_aarch64.c b/core/unix/signal_linux_aarch64.c index 585365930a2..8d3aa4bb832 100644 --- a/core/unix/signal_linux_aarch64.c +++ b/core/unix/signal_linux_aarch64.c @@ -54,21 +54,121 @@ save_fpstate(dcontext_t *dcontext, sigframe_rt_t *frame) } #ifdef DEBUG +/* Representation of quadwords as 2 doublewords. 
*/ +typedef union { + __uint128_t as_128; + struct { + uint64 lo; + uint64 hi; + } as_2x64; +} reinterpret128_2x64_t; + void dump_sigcontext(dcontext_t *dcontext, sigcontext_t *sc) { +#ifdef DR_HOST_NOT_TARGET + ASSERT_NOT_REACHED(); +#endif + LOG(THREAD, LOG_ASYNCH, 1, "\tSignal context:\n"); int i; for (i = 0; i <= DR_REG_X30 - DR_REG_X0; i++) LOG(THREAD, LOG_ASYNCH, 1, "\tx%-2d = " PFX "\n", i, sc->regs[i]); LOG(THREAD, LOG_ASYNCH, 1, "\tsp = " PFX "\n", sc->sp); LOG(THREAD, LOG_ASYNCH, 1, "\tpc = " PFX "\n", sc->pc); LOG(THREAD, LOG_ASYNCH, 1, "\tpstate = " PFX "\n", sc->pstate); + LOG(THREAD, LOG_ASYNCH, 1, "\n"); + + struct _aarch64_ctx *head = (struct _aarch64_ctx *)sc->__reserved; + ASSERT(head->magic == FPSIMD_MAGIC); + ASSERT(head->size == sizeof(struct fpsimd_context)); + + struct fpsimd_context *fpsimd = (struct fpsimd_context *)sc->__reserved; + LOG(THREAD, LOG_ASYNCH, 2, "\tfpsr 0x%x\n", fpsimd->fpsr); + LOG(THREAD, LOG_ASYNCH, 2, "\tfpcr 0x%x\n", fpsimd->fpcr); + reinterpret128_2x64_t vreg; + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + vreg.as_128 = fpsimd->vregs[i]; + LOG(THREAD, LOG_ASYNCH, 2, "\tq%-2d 0x%016lx %016lx\n", i, vreg.as_2x64.hi, + vreg.as_2x64.lo); + } + LOG(THREAD, LOG_ASYNCH, 2, "\n"); + + if (proc_has_feature(FEATURE_SVE)) { + size_t offset = sizeof(struct fpsimd_context); + struct _aarch64_ctx *next_head = (struct _aarch64_ctx *)(sc->__reserved + offset); + while (next_head->magic != 0) { + switch (next_head->magic) { + case ESR_MAGIC: break; + case EXTRA_MAGIC: break; + case SVE_MAGIC: { + const struct sve_context *sve = (struct sve_context *)(next_head); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE VL %d bytes\n", sve->vl); + ASSERT(sve->vl == proc_get_vector_length_bytes()); + const unsigned int vq = sve_vq_from_vl(sve->vl); + LOG(THREAD, LOG_ASYNCH, 2, "\tVQ %d\n\n", vq); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_ZREG_SIZE %d\n", + SVE_SIG_ZREG_SIZE(vq)); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_PREG_SIZE %d\n", + SVE_SIG_PREG_SIZE(vq)); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_FFR_SIZE %d\n", + SVE_SIG_FFR_SIZE(vq)); + LOG(THREAD, LOG_ASYNCH, 2, "\tsve->head.size %d\n\n", sve->head.size); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_ZREGS_OFFSET %d\n", + SVE_SIG_ZREGS_OFFSET); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_PREGS_OFFSET %d\n", + SVE_SIG_PREGS_OFFSET(vq)); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_FFR_OFFSET %d\n\n", + SVE_SIG_FFR_OFFSET(vq)); + + uint64 vdw; /* A vector's doubleword. */ + int boff; /* Byte offset for each doubleword in a vector. */ + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + LOG(THREAD, LOG_ASYNCH, 2, "\tz%-2d 0x", i); + for (boff = ((vq * 2) - 1); boff >= 0; boff--) { + vdw = *((uint64 *)((((byte *)sve) + (SVE_SIG_ZREG_OFFSET(vq, i)) + (boff * 8)))); + LOG(THREAD, LOG_ASYNCH, 2, "%016lx ", vdw); + } + LOG(THREAD, LOG_ASYNCH, 2, "\n"); + } + LOG(THREAD, LOG_ASYNCH, 2, "\n"); + for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + LOG(THREAD, LOG_ASYNCH, 2, "\tp%-2d 0x%08lx\n", i, + *((uint32 *)((byte *)sve + SVE_SIG_PREG_OFFSET(vq, i)))); + } + LOG(THREAD, LOG_ASYNCH, 2, "\n"); + LOG(THREAD, LOG_ASYNCH, 2, "\tFFR 0x%08lx\n\n", + *((uint32 *)((byte *)sve + SVE_SIG_FFR_OFFSET(vq)))); + break; + } + default: + SYSLOG_INTERNAL_WARNING("%s %d Unknown section found in signal context with magic number 0x%x", + __func__, __LINE__, next_head->magic); + break; + } + offset += next_head->size; + next_head = (struct _aarch64_ctx *)(sc->__reserved + offset); + } + } } #endif /* DEBUG */ +/* Representation of quadword as 4 words, used for SIMD. 
*/ +typedef union { + __uint128_t as_128; + struct { + uint32 lowest; + uint32 lo; + uint32 hi; + uint32 highest; + } as_4x32; +} reinterpret128_4x32_t; + void sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) { +#ifdef DR_HOST_NOT_TARGET + ASSERT_NOT_REACHED(); +#endif struct fpsimd_context *fpc = (struct fpsimd_context *)sc_full->fp_simd_state; if (fpc == NULL) return; @@ -77,30 +177,107 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) mc->fpsr = fpc->fpsr; mc->fpcr = fpc->fpcr; ASSERT((sizeof(mc->simd->q) * MCXT_NUM_SIMD_SVE_SLOTS) == sizeof(fpc->vregs)); - memcpy(&mc->simd, &fpc->vregs, sizeof(mc->simd)); - /* TODO i#5365: memcpy(&mc->simd->u32,...) - * See also sve_context in core/unix/include/sigcontext.h. - */ + int i; + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + memcpy(&mc->simd[i].q, &fpc->vregs[i], sizeof(mc->simd->q)); + } + + if (proc_has_feature(FEATURE_SVE)) { + size_t offset = sizeof(struct fpsimd_context); + /* fpsimd_context is always the first section. After that the esr_context, + * extra_context and sve_context sections can be in any order. + */ + struct _aarch64_ctx *next_head = + (struct _aarch64_ctx *)(sc_full->sc->__reserved + offset); + while (next_head->magic != 0) { + ASSERT(next_head->magic == ESR_MAGIC || next_head->magic == SVE_MAGIC || + next_head->magic == EXTRA_MAGIC); + switch (next_head->magic) { + case ESR_MAGIC: break; + case EXTRA_MAGIC: break; + case SVE_MAGIC: { + const struct sve_context *sve = (struct sve_context *)(next_head); + ASSERT(sve->vl == proc_get_vector_length_bytes()); + const unsigned int vq = sve_vq_from_vl(sve->vl); + if (sve->head.size != sizeof(struct sve_context)) { + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + /* SVE specifies that AArch64's SIMD&FP registers + * (V0-V31) which hold FP scalars and NEON 128-bit + * vectors overlay the bottom 128 bits of the SVE + * registers (Z0-Z31). For backward compatibility + * reasons, bits 0->127 of Z0-Z31 are always restored + * from the corresponding members of fpsimd_context's + * vregs and not from sve_context. + */ + memcpy(&mc->simd[i].u32, (byte *)sve + SVE_SIG_ZREG_OFFSET(vq, i), + sve->vl); + memcpy(&mc->simd[i].q, &fpc->vregs[i], sizeof(mc->simd->q)); + } + for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + memcpy(&mc->svep[i].u32, (byte *)sve + SVE_SIG_PREG_OFFSET(vq, i), + sve->vl); + } + memcpy(&mc->ffr, (byte *)sve + SVE_SIG_FFR_OFFSET(vq), sve->vl); + } + break; + } + default: + SYSLOG_INTERNAL_WARNING("%s %d Unhandled section with magic number 0x%x", + __func__, __LINE__, next_head->magic); + } + offset += next_head->size; + next_head = (struct _aarch64_ctx *)(sc_full->sc->__reserved + offset); + } + } } void mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) { +#ifdef DR_HOST_NOT_TARGET + ASSERT_NOT_REACHED(); +#endif + /* sig_full_initialize() will have set the fp_simd_state pointer in the + * user level machine context's (uc_mcontext) to __reserved. 
+ */ struct fpsimd_context *fpc = (struct fpsimd_context *)sc_full->fp_simd_state; if (fpc == NULL) return; - struct _aarch64_ctx *next = (void *)((char *)fpc + sizeof(struct fpsimd_context)); fpc->head.magic = FPSIMD_MAGIC; fpc->head.size = sizeof(struct fpsimd_context); fpc->fpsr = mc->fpsr; fpc->fpcr = mc->fpcr; ASSERT(sizeof(fpc->vregs) == (sizeof(mc->simd->q) * MCXT_NUM_SIMD_SVE_SLOTS)); - memcpy(&fpc->vregs, &mc->simd, sizeof(fpc->vregs)); - /* TODO i#5365: memcpy(..., &mc->simd->u32) - * See also sve_context in core/unix/include/sigcontext.h. - */ - next->magic = 0; - next->size = 0; + int i; + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + memcpy(&fpc->vregs[i], &mc->simd[i].u32[0], sizeof(fpc->vregs[i])); + } + + if (proc_has_feature(FEATURE_SVE)) { + struct _aarch64_ctx *esr = (void *)((byte *)fpc + sizeof(struct fpsimd_context)); + esr->magic = ESR_MAGIC; + esr->size = sizeof(struct esr_context); + + struct sve_context *sve = (void *)((byte *)esr + sizeof(struct esr_context)); + sve->head.magic = SVE_MAGIC; + sve->vl = proc_get_vector_length_bytes(); + const uint vq = sve_vq_from_vl(sve->vl); + sve->head.size = ALIGN_FORWARD(SVE_SIG_CONTEXT_SIZE(vq), 16); + for (uint i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + memcpy((byte *)sve + SVE_SIG_ZREG_OFFSET(vq, i), &mc->simd[i].u32, sve->vl); + } + for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + memcpy((byte *)sve + SVE_SIG_PREG_OFFSET(vq, i), &mc->svep[i].u32, sve->vl); + } + memcpy((byte *)sve + SVE_SIG_FFR_OFFSET(vq), &mc->ffr, sve->vl); + + size_t offset = (proc_get_vector_length_bytes() * MCXT_NUM_SIMD_SVE_SLOTS) + + ((proc_get_vector_length_bytes() / 8) * MCXT_NUM_SVEP_SLOTS) + 16; + struct _aarch64_ctx *null = + (void *)((byte *)sve + sizeof(struct sve_context) + offset); + null->magic = 0; + null->size = 0; + } } size_t diff --git a/suite/runsuite_wrapper.pl b/suite/runsuite_wrapper.pl index e9135e9331c..92646c1b9a5 100755 --- a/suite/runsuite_wrapper.pl +++ b/suite/runsuite_wrapper.pl @@ -341,6 +341,7 @@ ); # FIXME i#2417: fix flaky/regressed AArch64 tests %ignore_failures_64 = ('code_api|linux.sigsuspend' => 1, + 'code_api|linux.thread-reset' => 1, 'code_api|pthreads.pthreads_exit' => 1, 'code_api|tool.histogram.offline' => 1, # i#3980 'code_api|linux.fib-conflict' => 1, diff --git a/suite/tests/api/opnd-a64.c b/suite/tests/api/opnd-a64.c index 5278196e87f..184b1fcf519 100644 --- a/suite/tests/api/opnd-a64.c +++ b/suite/tests/api/opnd-a64.c @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2018 Arm Limited. All rights reserved. + * Copyright (c) 2018 - 2024 Arm Limited. All rights reserved. * **********************************************************/ /* @@ -35,6 +35,7 @@ #include "configure.h" #include "dr_api.h" #include +#include #define ASSERT(x) \ ((void)((!(x)) ? (fprintf(stderr, "ASSERT FAILURE: %s:%d: %s\n", __FILE__, __LINE__, \ @@ -68,6 +69,7 @@ test_get_size() } opnd_size_t opsz_vl = OPSZ_NA; + opnd_size_t opsz_pl = OPSZ_NA; if (proc_has_feature(FEATURE_SVE)) { /* Check sizes of SVE vector and predicate registers. Read vector length * directly from hardware and compare with OPSZ_ value reg_get_size() @@ -81,17 +83,19 @@ test_get_size() : : "x0"); opsz_vl = opnd_size_from_bytes(vl); + opsz_pl = opnd_size_from_bytes(vl / 8); } else { /* Set vector length to 256 bits for unit tests on non-SVE hardware. 
*/ + ASSERT(dr_get_sve_vector_length() == 256); opsz_vl = OPSZ_32; + opsz_pl = OPSZ_4; } for (uint i = 0; i < 32; i++) { ASSERT(reg_get_size((reg_id_t)DR_REG_Z0 + i) == opsz_vl); } - /* TODO i#5365: Check sizes of SVE predicate regs. */ for (uint i = 0; i < 16; i++) { - ASSERT(reg_get_size((reg_id_t)DR_REG_P0 + i) == OPSZ_SCALABLE_PRED); + ASSERT(reg_get_size((reg_id_t)DR_REG_P0 + i) == opsz_pl); } } @@ -303,6 +307,460 @@ test_opnd_invert_immed_int() #endif } +typedef struct _vector_address_test_expectation_t { + app_pc *addresses; + uint num_addresses; + bool is_write; +} vector_address_test_expectation_t; + +void +test_compute_vector_address_helper(void *drcontext, instr_t *instr, dr_mcontext_t *mc, + const vector_address_test_expectation_t *expected, + uint line) +{ + bool printed_instr = false; +#define TEST_FAILED() \ + do { \ + if (!printed_instr) { \ + printf("%s:%u:\n", __FILE__, line); \ + dr_print_instr(drcontext, STDOUT, instr, \ + "Failed to compute addresses for:\n"); \ + printed_instr = true; \ + } \ + } while (0) + +#define EXPECT_CMP(cmp, fmt, a, b) \ + do { \ + if (!(a cmp b)) { \ + TEST_FAILED(); \ + printf("Expected " #a " " #cmp " " #b ":\n " #a " = " fmt "\n " #b \ + " = " fmt "\n", \ + a, b); \ + } \ + } while (0) + +#define EXPECT_EQ(fmt, a, b) EXPECT_CMP(==, fmt, a, b) +#define EXPECT_LT(fmt, a, b) EXPECT_CMP(<, fmt, a, b) + + app_pc addr; + bool is_write; + uint index = 0; + while (instr_compute_address_ex(instr, mc, index, &addr, &is_write)) { + EXPECT_LT("%u", index, expected->num_addresses); + EXPECT_EQ("%p", addr, expected->addresses[index]); + EXPECT_EQ("%u", is_write, expected->is_write); + index++; + } + EXPECT_EQ("%u", index, expected->num_addresses); + +#undef TEST_FAILED +#undef EXPECT_CMP +#undef EXPECT_EQ +#undef EXPECT_LT +} + +/* Used by test_compute_vector_address() to determine whether an instruction reads or + * writes its memory operand and set test expectations. + * This isn't an exhaustive list of opcodes; it just contains the ones used in the test + */ +static bool +op_is_write(int op) +{ + switch (op) { + case OP_ld1b: + case OP_ld1h: + case OP_ld1w: + case OP_ld1d: + case OP_ldnt1b: + case OP_ldnt1h: + case OP_ldnt1w: + case OP_ldnt1d: return false; + case OP_st1b: + case OP_st1h: + case OP_st1w: + case OP_st1d: + case OP_stnt1b: + case OP_stnt1h: + case OP_stnt1w: + case OP_stnt1d: return true; + + default: ASSERT(false); + } +} + +/* Used by test_compute_vector_address() to determine whether an instruction reads or + * writes its memory operand and set test expectations. 
+ * This isn't an exhaustive list of opcodes; it just contains the ones used in the test + */ +static opnd_size_t +op_mem_size(int op) +{ + switch (op) { + case OP_ld1b: + case OP_ldnt1b: + case OP_st1b: + case OP_stnt1b: return OPSZ_1; + case OP_ld1h: + case OP_ldnt1h: + case OP_st1h: + case OP_stnt1h: return OPSZ_2; + case OP_ld1w: + case OP_ldnt1w: + case OP_st1w: + case OP_stnt1w: return OPSZ_4; + case OP_ld1d: + case OP_ldnt1d: + case OP_st1d: + case OP_stnt1d: return OPSZ_8; + + default: ASSERT(false); + } +} + +void +test_compute_vector_address(void *drcontext) +{ + const int original_vector_length = dr_get_sve_vector_length(); + ASSERT(dr_set_sve_vector_length(256)); + +#define SCALAR_BASE_REG 0 + +#define INDEX_REG_D 0 +#define INDEX_REG_S 1 +#define BASE_REG_D 2 +#define BASE_REG_S 3 + + dr_mcontext_t mc = { + .size = sizeof(dr_mcontext_t), + .flags = DR_MC_ALL, + .r0 = 0x8000000000000000, /* SCALAR_BASE_REG */ + .r1 = 1, + .r2 = 2, + .r3 = 3, + .r4 = 4, + .r5 = 5, + .r6 = 6, + .r7 = 7, + .r8 = 0xffffffffffffffff, + .simd[INDEX_REG_D].u64 = { 0x0000000000010000, 0x0000000000020000, + 0xffffffffffff0000, 0xfffffffffffe0000 }, + .simd[INDEX_REG_S].u32 = { 0x00010000, 0x00020000, 0x00030000, 0x00040000, + 0xffff0000, 0xfffd0000, 0xfffc0000, 0xfffb0000 }, + .simd[BASE_REG_D].u64 = { 0x0000000000000000, 0x8000000000000000, + 0xffffffffffffffff, 0x0000000010000000 }, + .simd[BASE_REG_S].u32 = { 0x00000000, 0x80000000, 0xffffffff, 0x00010000, + 0x10000000, 0x20000000, 0x30000000, 0x40000000 }, + }; + + for (size_t i = BASE_REG_S + 1; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + static const uint64 poison[4] = { 0xdeaddeaddeaddead, 0xdeaddeaddeaddead, + 0xdeaddeaddeaddead, 0xdeaddeaddeaddead }; + memcpy(&mc.simd[i].u64[0], poison, sizeof(poison)); + } + for (size_t i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + mc.svep[i].u32[0] = 0xffffffff; + } + +/* Map SVE element sizes to opnd_size_t */ +#define ELSZ_B OPSZ_1 +#define ELSZ_H OPSZ_2 +#define ELSZ_S OPSZ_4 +#define ELSZ_D OPSZ_8 + +#define EXPECT(...) \ + app_pc addresses[] = { __VA_ARGS__ }; \ + vector_address_test_expectation_t expected = { \ + addresses, \ + sizeof(addresses) / sizeof(app_pc), \ + false, \ + } + +#define VEC_ADDR_TEST(op, pg, mask, create_mem_opnd, decl_expect) \ + { \ + decl_expect; \ + expected.is_write = op_is_write(OP_##op); \ + mc.svep[pg].u32[0] = mask; \ + opnd_t mem_opnd = create_mem_opnd; \ + opnd_set_size(&mem_opnd, op_mem_size(OP_##op)); \ + instr_t *instr = INSTR_CREATE_##op##_sve_pred( \ + drcontext, \ + opnd_create_reg_element_vector(DR_REG_Z31, \ + opnd_get_vector_element_size(mem_opnd)), \ + opnd_create_predicate_reg(DR_REG_P0 + pg, false), mem_opnd); \ + test_compute_vector_address_helper(drcontext, instr, &mc, &expected, __LINE__); \ + instr_destroy(drcontext, instr); \ + mc.svep[pg].u32[0] = 0xffffffff; \ + } + +#define SCALAR_PLUS_VECTOR(xn, zm, el_size, extend, scale) \ + opnd_create_vector_base_disp_aarch64(DR_REG_X0 + xn, DR_REG_Z0 + zm, el_size, \ + DR_EXTEND_##extend, scale > 0, 0, 0, OPSZ_NA, \ + scale) + + /* Test all the scalar+vector addressing modes. + * The opcode used in the instruction shouldn't make a difference to the address + * calculation, so these tests cover all addressing modes but not all + * (opcode, addressing mode) combinations. 
+ */ + + /* 32-bit scaled offset [, .S, #N] */ + VEC_ADDR_TEST(ld1h, /*pg=*/0, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x8000000000060000, (app_pc)0x8000000000080000, + (app_pc)0x80000001fffe0000, (app_pc)0x80000001fffa0000, + (app_pc)0x80000001fff80000, (app_pc)0x80000001fff60000)); + VEC_ADDR_TEST(st1h, /*pg=*/0, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, SXTW, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x8000000000060000, (app_pc)0x8000000000080000, + (app_pc)0x7ffffffffffe0000, (app_pc)0x7ffffffffffa0000, + (app_pc)0x7ffffffffff80000, (app_pc)0x7ffffffffff60000)); + VEC_ADDR_TEST(ld1w, /*pg=*/0, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x80000000000c0000, (app_pc)0x8000000000100000, + (app_pc)0x80000003fffc0000, (app_pc)0x80000003fff40000, + (app_pc)0x80000003fff00000, (app_pc)0x80000003ffec0000)); + VEC_ADDR_TEST(st1w, /*pg=*/0, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, SXTW, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x80000000000c0000, (app_pc)0x8000000000100000, + (app_pc)0x7ffffffffffc0000, (app_pc)0x7ffffffffff40000, + (app_pc)0x7ffffffffff00000, (app_pc)0x7fffffffffec0000)); + + /* 32-bit unscaled offset [, .S, ] */ + VEC_ADDR_TEST(ld1w, /*pg=*/1, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x8000000000030000, (app_pc)0x8000000000040000, + (app_pc)0x80000000ffff0000, (app_pc)0x80000000fffd0000, + (app_pc)0x80000000fffc0000, (app_pc)0x80000000fffb0000)); + VEC_ADDR_TEST(st1w, /*pg=*/1, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, SXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x8000000000030000, (app_pc)0x8000000000040000, + (app_pc)0x7fffffffffff0000, (app_pc)0x7ffffffffffd0000, + (app_pc)0x7ffffffffffc0000, (app_pc)0x7ffffffffffb0000)); + + /* 32-bit unpacked scaled offset [, .D, #N] */ + VEC_ADDR_TEST(ld1h, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x80000001fffe0000, (app_pc)0x80000001fffc0000)); + VEC_ADDR_TEST(st1h, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, SXTW, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x7ffffffffffe0000, (app_pc)0x7ffffffffffc0000)); + VEC_ADDR_TEST(ld1w, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x80000003fffc0000, (app_pc)0x80000003fff80000)); + VEC_ADDR_TEST(st1w, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, SXTW, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x7ffffffffffc0000, (app_pc)0x7ffffffffff80000)); + VEC_ADDR_TEST(ld1d, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 3), + EXPECT((app_pc)0x8000000000080000, (app_pc)0x8000000000100000, + (app_pc)0x80000007fff80000, (app_pc)0x80000007fff00000)); + VEC_ADDR_TEST(st1d, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, 
ELSZ_D, SXTW, 3), + EXPECT((app_pc)0x8000000000080000, (app_pc)0x8000000000100000, + (app_pc)0x7ffffffffff80000, (app_pc)0x7ffffffffff00000)); + + /* 32-bit unpacked unscaled offset [, .D, ] */ + VEC_ADDR_TEST(ld1d, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x80000000ffff0000, (app_pc)0x80000000fffe0000)); + VEC_ADDR_TEST(st1d, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, SXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x7fffffffffff0000, (app_pc)0x7ffffffffffe0000)); + + /* 64-bit scaled offset [, .D, LSL #N] */ + VEC_ADDR_TEST(ld1h, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTX, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x7ffffffffffe0000, (app_pc)0x7ffffffffffc0000)); + VEC_ADDR_TEST(st1w, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTX, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x7ffffffffffc0000, (app_pc)0x7ffffffffff80000)); + VEC_ADDR_TEST(ld1d, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTX, 3), + EXPECT((app_pc)0x8000000000080000, (app_pc)0x8000000000100000, + (app_pc)0x7ffffffffff80000, (app_pc)0x7ffffffffff00000)); + + /* 64-bit unscaled offset [, .D] */ + VEC_ADDR_TEST(st1d, /*pg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTX, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x7fffffffffff0000, (app_pc)0x7ffffffffffe0000)); + + /* Test predicate handling. */ + + /* Test with all elements inactive */ + VEC_ADDR_TEST(ld1w, /*pg=*/2, 0x00000000, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 0), + EXPECT(/*nothing*/)); + VEC_ADDR_TEST(st1d, /*pg=*/3, 0x00000000, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 0), + EXPECT(/*nothing*/)); + + /* Test with every other element active */ + VEC_ADDR_TEST(st1b, /*pg=*/4, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000030000, + (app_pc)0x80000000ffff0000, (app_pc)0x80000000fffc0000)); + VEC_ADDR_TEST(st1h, /*pg=*/5, 0x00010001, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x80000000ffff0000)); + + /* Test with a single element active */ + VEC_ADDR_TEST(ld1w, /*pg=*/6, 0x00000010, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 0), + EXPECT((app_pc)0x8000000000020000)); + VEC_ADDR_TEST(st1d, /*pg=*/7, 0x00000100, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 0), + EXPECT((app_pc)0x8000000000020000)); +#undef SCALAR_PLUS_VECTOR + +#define VECTOR_PLUS_IMM(zn, el_size, imm) \ + opnd_create_vector_base_disp_aarch64(DR_REG_Z0 + zn, DR_REG_NULL, el_size, \ + DR_EXTEND_UXTX, 0, imm, 0, OPSZ_NA, 0) + + VEC_ADDR_TEST(ld1b, /*pg=*/0, 0x11111111, VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 0), + EXPECT((app_pc)0x0000000000000000, (app_pc)0x0000000080000000, + (app_pc)0x00000000ffffffff, (app_pc)0x0000000000010000, + (app_pc)0x0000000010000000, (app_pc)0x0000000020000000, + (app_pc)0x0000000030000000, (app_pc)0x0000000040000000)); + VEC_ADDR_TEST(st1b, /*pg=*/0, 0x11111111, VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 31), + EXPECT((app_pc)0x000000000000001f, (app_pc)0x000000008000001f, + 
(app_pc)0x000000010000001e, (app_pc)0x000000000001001f, + (app_pc)0x000000001000001f, (app_pc)0x000000002000001f, + (app_pc)0x000000003000001f, (app_pc)0x000000004000001f)); + VEC_ADDR_TEST(ld1b, /*pg=*/0, 0x01010101, VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 0), + EXPECT((app_pc)0x0000000000000000, (app_pc)0x8000000000000000, + (app_pc)0xffffffffffffffff, (app_pc)0x0000000010000000)); + VEC_ADDR_TEST(st1b, /*pg=*/0, 0x11111111, VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 31), + EXPECT((app_pc)0x000000000000001f, (app_pc)0x800000000000001f, + (app_pc)0x000000000000001e, (app_pc)0x000000001000001f)); + + VEC_ADDR_TEST(ld1h, /*pg=*/0, 0x11111111, VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 62), + EXPECT((app_pc)0x000000000000003e, (app_pc)0x000000008000003e, + (app_pc)0x000000010000003d, (app_pc)0x000000000001003e, + (app_pc)0x000000001000003e, (app_pc)0x000000002000003e, + (app_pc)0x000000003000003e, (app_pc)0x000000004000003e)); + VEC_ADDR_TEST(st1h, /*pg=*/0, 0x11111111, VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 62), + EXPECT((app_pc)0x000000000000003e, (app_pc)0x800000000000003e, + (app_pc)0x000000000000003d, (app_pc)0x000000001000003e)); + + VEC_ADDR_TEST(ld1w, /*pg=*/0, 0x11111111, VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124), + EXPECT((app_pc)0x000000000000007c, (app_pc)0x000000008000007c, + (app_pc)0x000000010000007b, (app_pc)0x000000000001007c, + (app_pc)0x000000001000007c, (app_pc)0x000000002000007c, + (app_pc)0x000000003000007c, (app_pc)0x000000004000007c)); + VEC_ADDR_TEST(st1w, /*pg=*/0, 0x11111111, VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 124), + EXPECT((app_pc)0x000000000000007c, (app_pc)0x800000000000007c, + (app_pc)0x000000000000007b, (app_pc)0x000000001000007c)); + + VEC_ADDR_TEST(ld1d, /*pg=*/0, 0x11111111, VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 248), + EXPECT((app_pc)0x00000000000000f8, (app_pc)0x80000000000000f8, + (app_pc)0x00000000000000f7, (app_pc)0x00000000100000f8)); + + /* Test with all elements inactive */ + VEC_ADDR_TEST(ld1w, /*pg=*/0, 0x00000000, VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124), + EXPECT(/*nothing*/)); + VEC_ADDR_TEST(st1w, /*pg=*/0, 0x00000000, VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 124), + EXPECT(/*nothing*/)); + + /* Test with every other element active */ + VEC_ADDR_TEST(ld1w, /*pg=*/0, 0x01010101, VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124), + EXPECT((app_pc)0x000000000000007c, (app_pc)0x000000010000007b, + (app_pc)0x000000001000007c, (app_pc)0x000000003000007c)); + VEC_ADDR_TEST(st1w, /*pg=*/0, 0x00010001, VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 124), + EXPECT((app_pc)0x000000000000007c, (app_pc)0x000000000000007b)); + + /* Test with a single element active */ + VEC_ADDR_TEST(ld1w, /*pg=*/0, 0x00000010, VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124), + EXPECT((app_pc)0x000000008000007c)); + VEC_ADDR_TEST(st1w, /*pg=*/0, 0x00000100, VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 124), + EXPECT((app_pc)0x800000000000007c)); + +#undef VECTOR_PLUS_IMM + +#define VECTOR_PLUS_SCALAR(zn, el_size, xm) \ + opnd_create_vector_base_disp_aarch64(DR_REG_Z0 + zn, DR_REG_X0 + xm, el_size, \ + DR_EXTEND_UXTX, 0, 0, 0, OPSZ_NA, 0) + VEC_ADDR_TEST(ldnt1b, /*pg=*/0, 0x11111111, VECTOR_PLUS_SCALAR(BASE_REG_S, ELSZ_S, 8), + EXPECT((app_pc)0xffffffffffffffff, (app_pc)0x000000007fffffff, + (app_pc)0x00000000fffffffe, (app_pc)0x000000000000ffff, + (app_pc)0x000000000fffffff, (app_pc)0x000000001fffffff, + (app_pc)0x000000002fffffff, (app_pc)0x000000003fffffff)); + VEC_ADDR_TEST(stnt1b, /*pg=*/0, 0x01010101, VECTOR_PLUS_SCALAR(BASE_REG_D, ELSZ_D, 7), + EXPECT((app_pc)0x0000000000000007, (app_pc)0x8000000000000007, + 
(app_pc)0x0000000000000006, (app_pc)0x0000000010000007)); + + /* Test with all elements inactive */ + VEC_ADDR_TEST(ldnt1h, /*pg=*/0, 0x00000000, VECTOR_PLUS_SCALAR(BASE_REG_S, ELSZ_S, 6), + EXPECT(/*nothing*/)); + VEC_ADDR_TEST(stnt1h, /*pg=*/0, 0x00000000, VECTOR_PLUS_SCALAR(BASE_REG_D, ELSZ_D, 5), + EXPECT(/*nothing*/)); + + /* Test with every other element active */ + VEC_ADDR_TEST(ldnt1w, /*pg=*/0, 0x01010101, VECTOR_PLUS_SCALAR(BASE_REG_S, ELSZ_S, 4), + EXPECT((app_pc)0x0000000000000004, (app_pc)0x0000000100000003, + (app_pc)0x0000000010000004, (app_pc)0x0000000030000004)); + VEC_ADDR_TEST(stnt1w, /*pg=*/0, 0x00010001, VECTOR_PLUS_SCALAR(BASE_REG_D, ELSZ_D, 3), + EXPECT((app_pc)0x0000000000000003, (app_pc)0x0000000000000002)); + + /* Test with a single element active */ + VEC_ADDR_TEST(ldnt1w, /*pg=*/0, 0x00000010, VECTOR_PLUS_SCALAR(BASE_REG_S, ELSZ_S, 2), + EXPECT((app_pc)0x0000000080000002)); + VEC_ADDR_TEST(stnt1d, /*pg=*/0, 0x00000100, VECTOR_PLUS_SCALAR(BASE_REG_D, ELSZ_D, 1), + EXPECT((app_pc)0x8000000000000001)); + +#undef VECTOR_PLUS_SCALAR + +#undef EXPECT +#undef VEC_ADDR_TEST + + ASSERT(dr_set_sve_vector_length(original_vector_length)); +} + +void +test_reg_is_simd() +{ + for (reg_id_t reg = DR_REG_START_32; reg <= DR_REG_STOP_32; reg++) + ASSERT(!reg_is_simd(reg)); + + for (reg_id_t reg = DR_REG_START_64; reg <= DR_REG_STOP_64; reg++) + ASSERT(!reg_is_simd(reg)); + + for (reg_id_t reg = DR_REG_Q0; reg <= DR_REG_Q0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++) + ASSERT(reg_is_simd(reg)); + + for (reg_id_t reg = DR_REG_D0; reg <= DR_REG_D0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++) + ASSERT(reg_is_simd(reg)); + + for (reg_id_t reg = DR_REG_S0; reg <= DR_REG_S0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++) + ASSERT(reg_is_simd(reg)); + + for (reg_id_t reg = DR_REG_H0; reg <= DR_REG_H0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++) + ASSERT(reg_is_simd(reg)); + + for (reg_id_t reg = DR_REG_B0; reg <= DR_REG_B0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++) + ASSERT(reg_is_simd(reg)); + + for (reg_id_t reg = DR_REG_START_Z; reg <= DR_REG_STOP_Z; reg++) + ASSERT(reg_is_simd(reg)); + + for (reg_id_t reg = DR_REG_START_P; reg <= DR_REG_STOP_P; reg++) + ASSERT(!reg_is_simd(reg)); +} + int main(int argc, char *argv[]) { @@ -310,7 +768,7 @@ main(int argc, char *argv[]) * on SVE h/w. This is validated with the direct read of vector length * using the SVE RDVL instruction in test_get_size() above. 
*/ - dr_standalone_init(); + void *drcontext = dr_standalone_init(); test_get_size(); @@ -318,6 +776,10 @@ main(int argc, char *argv[]) test_opnd_invert_immed_int(); + test_compute_vector_address(drcontext); + + test_reg_is_simd(); + printf("all done\n"); return 0; } diff --git a/suite/tests/tools.c b/suite/tests/tools.c index 70d33946549..e98a2397d0a 100644 --- a/suite/tests/tools.c +++ b/suite/tests/tools.c @@ -497,6 +497,86 @@ intercept_signal(int sig, handler_3_t handler, bool sigstack) ASSERT_NOERR(rc); } +# ifdef AARCH64 +# ifdef DR_HOST_NOT_TARGET +# define RESERVED __reserved1 +# else +# define RESERVED __reserved +# endif +void +dump_ucontext(ucontext_t *ucxt, bool is_sve, int vl_bytes) +{ + struct _aarch64_ctx *head = (struct _aarch64_ctx *)(ucxt->uc_mcontext.RESERVED); + assert(head->magic == FPSIMD_MAGIC); + assert(head->size == sizeof(struct fpsimd_context)); + + struct fpsimd_context *fpsimd = + (struct fpsimd_context *)(ucxt->uc_mcontext.RESERVED); + print("\nfpsr 0x%x\n", fpsimd->fpsr); + print("fpcr 0x%x\n", fpsimd->fpcr); + reinterpret128_2x64_t vreg; + int i; + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + vreg.as_128 = fpsimd->vregs[i]; + print("q%-2d 0x%016lx %016lx\n", i, vreg.as_2x64.hi, vreg.as_2x64.lo); + } + print("\n"); + + if (is_sve) { + size_t offset = sizeof(struct fpsimd_context); + struct _aarch64_ctx *next_head = + (struct _aarch64_ctx *)(ucxt->uc_mcontext.RESERVED + offset); + while (next_head->magic != 0) { + switch (next_head->magic) { + case ESR_MAGIC: offset += sizeof(struct esr_context); break; + case EXTRA_MAGIC: offset += sizeof(struct extra_context); break; + case SVE_MAGIC: { + const struct sve_context *sve = (struct sve_context *)(next_head); + assert(sve->vl == vl_bytes); + const unsigned int vq = sve_vq_from_vl(sve->vl); + if (sve->head.size != sizeof(struct sve_context)) + assert(sve->head.size == ALIGN_FORWARD(SVE_SIG_CONTEXT_SIZE(vq), 16)); + + uint64 vdw; /* A vector's doubleword. */ + int boff; /* Byte offset for each doubleword in a vector. 
*/ + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + print("z%-2d 0x", i); + for (boff = ((vq * 2) - 1); boff >= 0; boff--) { + vdw = *((uint64 *)((((byte *)sve) + (SVE_SIG_ZREG_OFFSET(vq, i)) + (boff * 8)))); + print("%016lx ", vdw); + } + print("\n"); + } + + print("\n"); + for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + print("p%-2d 0x%08lx\n", i, + *((uint32 *)((byte *)sve + SVE_SIG_PREG_OFFSET(vq, i)))); + } + print("\n"); + print("FFR 0x%08lx\n\n", + *((uint32 *)((byte *)sve + SVE_SIG_FFR_OFFSET(vq)))); + + if (sve->head.size == sizeof(struct sve_context)) + offset += sizeof(struct sve_context); + else + // VL / 8 x Zn + ((( VL / 8 / 8) x Pn) + FFR) + offset += sizeof(struct sve_context) + + (vl_bytes * MCXT_NUM_SIMD_SVE_SLOTS) + + ((vl_bytes / 8) * MCXT_NUM_SVEP_SLOTS) + 16; + break; + } + default: + print("%s %d Unhandled section with magic number 0x%x", __func__, + __LINE__, next_head->magic); + assert(0); + } + next_head = (struct _aarch64_ctx *)(ucxt->uc_mcontext.RESERVED + offset); + } + } +} +# endif + # endif /* UNIX */ #else /* asm code *************************************************************/ diff --git a/suite/tests/tools.h b/suite/tests/tools.h index e9ce5ddf8b8..1bc860bd7cd 100644 --- a/suite/tests/tools.h +++ b/suite/tests/tools.h @@ -301,6 +301,20 @@ typedef void (*handler_3_t)(int, siginfo_t *, ucontext_t *); /* set up signal_handler as the handler for signal "sig" */ void intercept_signal(int sig, handler_3_t handler, bool sigstack); + +# ifdef AARCH64 +void +dump_ucontext(ucontext_t *ucxt, bool is_sve, int vl); +# endif + +/* Representation of quadwords as 2 doubles. */ +typedef union { + __uint128_t as_128; + struct { + uint64 lo; + uint64 hi; + } as_2x64; +} reinterpret128_2x64_t; #endif /* for cross-plaform siglongjmp */