diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml
index 87d04fcd549..2ba40d9a760 100644
--- a/.github/workflows/ci-docs.yml
+++ b/.github/workflows/ci-docs.yml
@@ -90,7 +90,7 @@ jobs:
       # We only use a non-zero build # when making multiple manual builds in one day.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
diff --git a/.github/workflows/ci-package.yml b/.github/workflows/ci-package.yml
index 62e88e22689..9bff2302ddc 100644
--- a/.github/workflows/ci-package.yml
+++ b/.github/workflows/ci-package.yml
@@ -103,7 +103,7 @@ jobs:
       # We only use a non-zero build # when making multiple manual builds in one day.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -195,7 +195,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -283,7 +283,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -371,7 +371,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -451,7 +451,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
+          export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
         fi
@@ -536,7 +536,7 @@ jobs:
       # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
       run: |
         if test -z "${{ github.event.inputs.version }}"; then
-          export VERSION_NUMBER="10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))"
+          export VERSION_NUMBER="10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))"
           export PREFIX="cronbuild-"
         else
           export VERSION_NUMBER=${{ github.event.inputs.version }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9bf0a929dcb..2463024ee8c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -575,7 +575,7 @@ endif (EXISTS "${PROJECT_SOURCE_DIR}/.svn")
 
 # N.B.: When updating this, update all the default versions in ci-package.yml
 # and ci-docs.yml.  We should find a way to share (xref i#1565).
-set(VERSION_NUMBER_DEFAULT "10.0.${VERSION_NUMBER_PATCHLEVEL}")
+set(VERSION_NUMBER_DEFAULT "10.90.${VERSION_NUMBER_PATCHLEVEL}")
 # do not store the default VERSION_NUMBER in the cache to prevent a stale one
 # from preventing future version updates in a pre-existing build dir
 set(VERSION_NUMBER "" CACHE STRING "Version number: leave empty for default")
diff --git a/api/docs/release.dox b/api/docs/release.dox
index 2321dab8a4e..c2faec4ea75 100644
--- a/api/docs/release.dox
+++ b/api/docs/release.dox
@@ -147,6 +147,9 @@ changes:
    users sub-classing analyzer_tmpl_t).
  - Converted #dynamorio::drmemtrace::analysis_tool_tmpl_t::interval_state_snapshot_t
    into a class with all its data members marked private with public accessor functions.
+ - Changed the type of the AArch64 #dr_mcontext_t members svep and ffr from #dr_simd_t
+   to #dr_svep_t and #dr_ffr_t respectively. This breaks binary compatibility with
+   clients that were built against versions of DynamoRIO before this change.
 
 Further non-compatibility-affecting changes include:
  - Added DWARF-5 support to the drsyms library by linking in 4 static libraries
diff --git a/core/arch/aarch64/aarch64.asm b/core/arch/aarch64/aarch64.asm
index 7d770c2dfe1..1ea06f39de5 100644
--- a/core/arch/aarch64/aarch64.asm
+++ b/core/arch/aarch64/aarch64.asm
@@ -47,7 +47,7 @@ START_FILE
 #endif
 
 /* sizeof(priv_mcontext_t) rounded up to a multiple of 16 */
-#define PRIV_MCONTEXT_SIZE 3424
+#define PRIV_MCONTEXT_SIZE 2480
 
 /* offsetof(spill_state_t, r0) */
 #define spill_state_r0_OFFSET 0
@@ -69,7 +69,7 @@ START_FILE
 /* offsetof(priv_mcontext_t, simd) */
 #define simd_OFFSET (16 * ARG_SZ*2 + 32)
 /* offsetof(dcontext_t, dstack) */
-#define dstack_OFFSET     0xda8
+#define dstack_OFFSET     0x9f8
 /* offsetof(dcontext_t, is_exiting) */
 #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ)
 /* offsetof(struct tlsdesc_t, arg) */
diff --git a/core/arch/aarch64/emit_utils.c b/core/arch/aarch64/emit_utils.c
index cd7656f950f..478b58a5b32 100644
--- a/core/arch/aarch64/emit_utils.c
+++ b/core/arch/aarch64/emit_utils.c
@@ -576,7 +576,7 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
     }
     if (proc_has_feature(FEATURE_SVE)) {
         for (i = 0; i < 32; i++) {
-            /* ldr z(i), [x1, #(i mul vl)]
+            /* ldr z(i), [x1, #(i * sizeof(dr_simd_t))]
              * From the SVE manual:
              * "Load a vector register from a memory address generated by a
              * 64-bit scalar base, plus an immediate offset in the range -256
@@ -584,11 +584,10 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
              * in bytes."
              */
             APP(ilist,
-                INSTR_CREATE_ldr(
-                    dcontext, opnd_create_reg(DR_REG_Z0 + i),
-                    opnd_create_base_disp(
-                        DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t),
-                        opnd_size_from_bytes(proc_get_vector_length_bytes()))));
+                INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_Z0 + i),
+                                 opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
+                                                       i * sizeof(dr_simd_t),
+                                                       OPSZ_SVE_VECLEN_BYTES)));
         }
         /* add x1, x(dcxt), #(offset svep) */
         APP(ilist,
@@ -599,40 +598,36 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
          * register for FFR load below, then restored from svep afterwards.
          */
         for (i = 0; i < 15; i++) {
-            /* ldr p(i), [x1, #(i mul vl)] */
+            /* ldr p(i), [x1, #(i * sizeof(dr_svep_t))] */
             APP(ilist,
-                INSTR_CREATE_ldr(
-                    dcontext, opnd_create_reg(DR_REG_P0 + i),
-                    opnd_create_base_disp(
-                        DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t),
-                        opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
+                INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P0 + i),
+                                 opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
+                                                       i * sizeof(dr_svep_t),
+                                                       OPSZ_SVE_PREDLEN_BYTES)));
         }
         /* There is no load instruction for the first-fault register (FFR). Use
          * a temporary predicate register to load:
          * add x2, x(dcxt), #(offset ffr)
-         * ldr p15, [x2, #(ffr)]
+         * ldr p15, [x2, #0]
          * wrffr p15.b
-         * ldr p15, [x1, #(15 mul vl)]
+         * ldr p15, [x1, #(15 * sizeof(dr_svep_t))]
          */
         APP(ilist,
             XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X2),
                                   opnd_create_reg(REG_DCXT),
                                   OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr))));
         APP(ilist,
-            INSTR_CREATE_ldr(
-                dcontext, opnd_create_reg(DR_REG_P15),
-                opnd_create_base_disp(
-                    DR_REG_X2, DR_REG_NULL, 0, 0,
-                    opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
+            INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15),
+                             opnd_create_base_disp(DR_REG_X2, DR_REG_NULL, 0, 0,
+                                                   OPSZ_SVE_PREDLEN_BYTES)));
         APP(ilist,
             INSTR_CREATE_wrffr_sve(dcontext,
                                    opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1)));
         APP(ilist,
-            INSTR_CREATE_ldr(
-                dcontext, opnd_create_reg(DR_REG_P15),
-                opnd_create_base_disp(
-                    DR_REG_X1, DR_REG_NULL, 0, 15 * sizeof(dr_simd_t),
-                    opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
+            INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15),
+                             opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
+                                                   15 * sizeof(dr_svep_t),
+                                                   OPSZ_SVE_PREDLEN_BYTES)));
     }
 }
 
@@ -791,19 +786,18 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
     }
     if (proc_has_feature(FEATURE_SVE)) {
         for (i = 0; i < 32; i++) {
-            /* str z(i), [x1, #(i mul vl)]
+            /* str z(i), [x1, #(i * sizeof(dr_simd_t))]
              * "Store a vector register to a memory address generated by a
              * 64-bit scalar base, plus an immediate offset in the range -256
              * to 255 which is multiplied by the current vector register size
              * in bytes."
              */
             APP(ilist,
-                INSTR_CREATE_str(
-                    dcontext,
-                    opnd_create_base_disp(
-                        DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t),
-                        opnd_size_from_bytes(proc_get_vector_length_bytes())),
-                    opnd_create_reg(DR_REG_Z0 + i)));
+                INSTR_CREATE_str(dcontext,
+                                 opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
+                                                       i * sizeof(dr_simd_t),
+                                                       OPSZ_SVE_VECLEN_BYTES),
+                                 opnd_create_reg(DR_REG_Z0 + i)));
         }
         /* add x1, x(dcxt), #(off) */
         APP(ilist,
@@ -811,21 +805,20 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
                                   opnd_create_reg(REG_DCXT),
                                   OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, svep))));
         for (i = 0; i < 16; i++) {
-            /* str p(i), [x1, #(i mul vl)] */
+            /* str p(i), [x1, #(i * sizeof(dr_svep_t))] */
             APP(ilist,
-                INSTR_CREATE_str(
-                    dcontext,
-                    opnd_create_base_disp(
-                        DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t),
-                        opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)),
-                    opnd_create_reg(DR_REG_P0 + i)));
+                INSTR_CREATE_str(dcontext,
+                                 opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
+                                                       i * sizeof(dr_svep_t),
+                                                       OPSZ_SVE_PREDLEN_BYTES),
+                                 opnd_create_reg(DR_REG_P0 + i)));
         }
         /* There is no store instruction for the first-fault register (FFR). Use
          * a temporary predicate register to store:
-         * rdffr p15.b
-         * add x2, x(dcxt), #(offset ffr)
-         * str p15, [x2, #(ffr)]
-         * ldr p15, [x1, #(15 mul vl)]
+         * rdffr p15.b                              ; Read FFR to P15
+         * add x2, x(dcxt), #(offset ffr)           ; Calculate FFR dcxt offset
+         * str p15, [x2, #0]                        ; Save FFR to dcxt
+         * ldr p15, [x1, #(15 * sizeof(dr_svep_t))] ; Restore app P15 value from dcxt
          */
         APP(ilist,
             INSTR_CREATE_rdffr_sve(dcontext,
@@ -835,18 +828,15 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
                                   opnd_create_reg(REG_DCXT),
                                   OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr))));
         APP(ilist,
-            INSTR_CREATE_str(
-                dcontext,
-                opnd_create_base_disp(
-                    DR_REG_X2, DR_REG_NULL, 0, 0,
-                    opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)),
-                opnd_create_reg(DR_REG_P15)));
+            INSTR_CREATE_str(dcontext,
+                             opnd_create_base_disp(DR_REG_X2, DR_REG_NULL, 0, 0,
+                                                   OPSZ_SVE_PREDLEN_BYTES),
+                             opnd_create_reg(DR_REG_P15)));
         APP(ilist,
-            INSTR_CREATE_ldr(
-                dcontext, opnd_create_reg(DR_REG_P15),
-                opnd_create_base_disp(
-                    DR_REG_X1, DR_REG_NULL, 0, 15 * sizeof(dr_simd_t),
-                    opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
+            INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15),
+                             opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
+                                                   15 * sizeof(dr_svep_t),
+                                                   OPSZ_SVE_PREDLEN_BYTES)));
     }
 }
 
diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c
index 2093db58251..1c1aae4d395 100644
--- a/core/arch/aarchxx/mangle.c
+++ b/core/arch/aarchxx/mangle.c
@@ -565,7 +565,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
                               DR_REG_Q0, SIMD_REG_TYPE);
     }
 
-    dstack_offs += MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t);
+    dstack_offs += (MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t)) +
+        (MCXT_NUM_SVEP_SLOTS * sizeof(dr_svep_t)) +
+        (MCXT_NUM_FFR_SLOTS * sizeof(dr_ffr_t));
 
     /* Restore the registers we used. */
     /* ldp x0, x1, [sp] */
@@ -577,6 +579,10 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
         INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_X2),
                          opnd_create_base_disp(DR_REG_SP, DR_REG_NULL, 0,
                                                REG_OFFSET(DR_REG_X2), OPSZ_8)));
+
+    /* Make dstack_offs 16-byte aligned. */
+    dstack_offs = ALIGN_FORWARD(dstack_offs, get_ABI_stack_alignment());
+
 #else
     /* vstmdb always does writeback */
     PRE(ilist, instr,
@@ -655,9 +661,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
 
     /* Make dstack_offs 8-byte algined, as we only accounted for 17 4-byte slots. */
     dstack_offs += XSP_SZ;
+#endif
     ASSERT(cci->skip_save_flags || cci->num_simd_skip != 0 || cci->num_regs_skip != 0 ||
            dstack_offs == (uint)get_clean_call_switch_stack_size());
-#endif
     return dstack_offs;
 }
 
@@ -678,8 +684,11 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist
         XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_X0),
                           opnd_create_reg(DR_REG_SP)));
 
-    current_offs =
-        get_clean_call_switch_stack_size() - (MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t));
+    current_offs = ALIGN_BACKWARD(get_clean_call_switch_stack_size() -
+                                      (MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t)) -
+                                      (MCXT_NUM_SVEP_SLOTS * sizeof(dr_svep_t)) -
+                                      (MCXT_NUM_FFR_SLOTS * sizeof(dr_ffr_t)),
+                                  16);
 
     /* add x0, x0, current_offs */
     PRE(ilist, instr,
diff --git a/core/arch/arch.c b/core/arch/arch.c
index 655593e59c9..7cac9d47cdc 100644
--- a/core/arch/arch.c
+++ b/core/arch/arch.c
@@ -3825,21 +3825,50 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml)
     }
 #elif defined(AARCHXX)
     {
-        int i, j;
 #    ifdef AARCH64
-        int words = proc_has_feature(FEATURE_SVE) ? 16 : 4;
+        const uint vector_length_bytes =
+            (proc_has_feature(FEATURE_SVE) ? proc_get_vector_length_bytes()
+                                           : opnd_size_in_bytes(reg_get_size(DR_REG_Q0)));
+        const uint u32_count = vector_length_bytes / sizeof(uint);
+        /* XXX: should be proc_num_simd_saved(). */
+        const uint num_simd_regs = MCXT_NUM_SIMD_SVE_SLOTS;
+        const char *reg_prefix = proc_has_feature(FEATURE_SVE) ? "z" : "q";
 #    else
-        int words = 4;
-#    endif
+        const uint u32_count =
+            sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0]);
         /* XXX: should be proc_num_simd_saved(). */
-        for (i = 0; i < proc_num_simd_registers(); i++) {
-            print_file(f, dump_xml ? "\t\tqd= \"0x" : "\tq%-3d= 0x", i);
-            for (j = 0; j < words; j++) {
+        const uint num_simd_regs = proc_num_simd_registers();
+        const char *reg_prefix = "q";
+#    endif
+        ASSERT(u32_count <=
+               sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0]));
+        for (uint i = 0; i < num_simd_regs; i++) {
+            print_file(f, dump_xml ? "\t\t%s%d= \"0x" : "\t%s%-3d= 0x", reg_prefix, i);
+            for (uint j = 0; j < u32_count; j++) {
                 print_file(f, "%08x ", context->simd[i].u32[j]);
             }
             print_file(f, dump_xml ? "\"\n" : "\n");
         }
-        /* TODO i#5365: SVE predicate registers and FFR dump. */
+#    ifdef AARCH64
+        if (proc_has_feature(FEATURE_SVE)) {
+            /* Dump SVE P registers */
+            const uint pred_u16_count =
+                (proc_get_vector_length_bytes() / 8) / sizeof(ushort);
+            for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) {
+                print_file(f, dump_xml ? "\t\tp%d= \"0x" : "\tp%-3d= 0x", i);
+                for (size_t j = 0; j < pred_u16_count; j++) {
+                    print_file(f, "%04x ", context->svep[i].u16[j]);
+                }
+                print_file(f, dump_xml ? "\"\n" : "\n");
+            }
+            /* Dump SVE FFR register */
+            print_file(f, dump_xml ? "\t\tffr= \"0x" : "\tffr = 0x");
+            for (size_t j = 0; j < pred_u16_count; j++) {
+                print_file(f, "%04x ", context->ffr.u16[j]);
+            }
+            print_file(f, dump_xml ? "\"\n" : "\n");
+        }
+#    endif
     }
 #endif
 
diff --git a/core/arch/arm/arm.asm b/core/arch/arm/arm.asm
index f58073eca67..5e7eaa5ad0f 100644
--- a/core/arch/arm/arm.asm
+++ b/core/arch/arm/arm.asm
@@ -60,17 +60,22 @@ DECL_EXTERN(initstack_mutex)
 
 #ifdef X64
 # define MCXT_NUM_SIMD_SLOTS 32
-# define SIMD_REG_SIZE       16
+# define SIMD_REG_SIZE       64
+# define MCXT_NUM_PRED_SLOTS 17 /* P regs + FFR */
+# define PRED_REG_SIZE        8
 # define NUM_GPR_SLOTS       33 /* incl flags */
 # define GPR_REG_SIZE         8
 #else
 # define MCXT_NUM_SIMD_SLOTS 16
 # define SIMD_REG_SIZE       16
+# define MCXT_NUM_PRED_SLOTS  0
+# define PRED_REG_SIZE        0
 # define NUM_GPR_SLOTS       17 /* incl flags */
 # define GPR_REG_SIZE         4
 #endif
 #define PRE_SIMD_PADDING     0
-#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE)
+#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE \
+                             + MCXT_NUM_PRED_SLOTS*PRED_REG_SIZE)
 #define PRIV_MCXT_SIZE (NUM_GPR_SLOTS*GPR_REG_SIZE + PRIV_MCXT_SIMD_SIZE)
 #define PRIV_MCXT_SP_FROM_SIMD (-(4*GPR_REG_SIZE)) /* flags, pc, lr, then sp */
 #define PRIV_MCXT_PC_FROM_SIMD (-(2*GPR_REG_SIZE)) /* flags, then pc */
diff --git a/core/ir/aarch64/instr.c b/core/ir/aarch64/instr.c
index 0d02a8d0642..223f9dbb0e6 100644
--- a/core/ir/aarch64/instr.c
+++ b/core/ir/aarch64/instr.c
@@ -801,7 +801,7 @@ instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size
 
     const reg_t governing_pred = opnd_get_reg(instr_get_src(instr, 1));
     ASSERT(governing_pred >= DR_REG_START_P && governing_pred <= DR_REG_STOP_P);
-    uint64 mask = mc->svep[governing_pred - DR_REG_START_P].d;
+    uint64 mask = mc->svep[governing_pred - DR_REG_START_P].u64[0];
 
     if (mask == 0) {
         return false;
diff --git a/core/lib/globals_api.h b/core/lib/globals_api.h
index 659e32d0a2b..36d85dac065 100644
--- a/core/lib/globals_api.h
+++ b/core/lib/globals_api.h
@@ -688,10 +688,9 @@ typedef uint64 dr_opmask_t;
 
 #if defined(AARCHXX)
 /**
- * 512-bit ARM Scalable Vector Extension (SVE) vector registers Zn and
- * predicate registers Pn. Low 128 bits of Zn overlap with existing ARM
- * Advanced SIMD (NEON) Vn registers. The SVE specification defines the
- * following valid vector lengths:
+ * 512-bit ARM Scalable Vector Extension (SVE) vector registers Zn.
+ * Low 128 bits of Zn overlap with existing ARM Advanced SIMD (NEON) Vn registers.
+ * The SVE specification defines the following valid vector lengths:
  * 128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048
  * We currently support 512-bit maximum due to DR's stack size limitation,
  * (machine context stored in the stack). In AArch64, align to 16 bytes for
@@ -706,11 +705,28 @@ typedef union ALIGN_VAR(16) _dr_simd_t {
     uint s;        /**< Singleword (32 bit, Sn) scalar element of Vn, Zn and Pn. */
     uint64 d;      /**< Doubleword (64 bit, Dn) scalar element of Vn, Zn and Pn. */
     uint q[4];     /**< The full 128 bit Vn register, Qn as q[3]:q[2]:q[1]:q[0]. */
-    uint u32[16];  /**< The full 512 bit Zn, Pn and FFR registers as Singleword (32-bit)
-                      elements. */
-    uint64 u64[8]; /**< The full 512 bit Zn, Pn and FFR registers as Doubleword (64-bit)
-                      elements. */
+    uint u32[16];  /**< The full 512 bit Zn register as Singleword (32-bit) elements. */
+    uint64 u64[8]; /**< The full 512 bit Zn register as Doubleword (64-bit) elements. */
 } dr_simd_t;
+
+/**
+ * 64-bit Arm Scalable Vector Extension (SVE) predicate register Pn.
+ * SVE Pn registers are used to hold mask values that control the operation of some SVE
+ * instructions. Pn registers have one bit for every byte of a Zn register so the size
+ * of a Pn register is always 1/8 the size of a Zn register.
+ * DynamoRIO currently supports up to 512-bit Zn registers and 64-bit Pn registers.
+ */
+typedef union _dr_svep_t {
+    ushort u16[4]; /**< The full 64-bit Pn or FFR register as 16-bit elements. */
+    uint u32[2];   /**< The full 64-bit Pn or FFR register as 32-bit elements. */
+    uint64 u64[1]; /**< The full 64-bit Pn or FFR register as 64-bit elements. */
+} dr_svep_t;
+
+/**
+ * 64-bit Arm Scalable Vector Extension (SVE) First Fault Register (FFR).
+ * FFR is a special purpose predicate register used by some SVE instructions.
+ */
+typedef dr_svep_t dr_ffr_t;
 #    else
 typedef union _dr_simd_t {
     uint s[4];   /**< Representation as 4 32-bit Sn elements. */
@@ -720,7 +736,7 @@ typedef union _dr_simd_t {
 #    endif
 #    ifdef X64
 #        define MCXT_NUM_SIMD_SVE_SLOTS                                  \
-            32 /**< Number of 128-bit SIMD Vn/Zn slots in dr_mcontext_t. \
+            32 /**< Number of 512-bit SIMD Vn/Zn slots in dr_mcontext_t. \
                 */
 #        define MCXT_NUM_SVEP_SLOTS 16 /**< Number of SIMD Pn slots in dr_mcontext_t. */
 #        define MCXT_NUM_FFR_SLOTS \
diff --git a/core/lib/mcxtx_api.h b/core/lib/mcxtx_api.h
index e02543783a0..c74f5135aab 100644
--- a/core/lib/mcxtx_api.h
+++ b/core/lib/mcxtx_api.h
@@ -140,12 +140,12 @@
      * The Arm AArch64 Scalable Vector Extension (SVE) predicate registers
      * DR_REG_P0 to DR_REG_P15.
      */
-    dr_simd_t svep[MCXT_NUM_SVEP_SLOTS];
+    dr_svep_t svep[MCXT_NUM_SVEP_SLOTS];
     /**
      * The Arm AArch64 Scalable Vector Extension (SVE) first fault register
      * DR_REG_FFR, for vector load instrcutions.
      */
-    dr_simd_t ffr;
+    dr_ffr_t ffr;
 #   else
     /*
      * For the Arm AArch32 SIMD registers, we would probably be ok if we did
diff --git a/core/unix/signal_linux_aarch64.c b/core/unix/signal_linux_aarch64.c
index abd9fe92b64..24a61597a22 100644
--- a/core/unix/signal_linux_aarch64.c
+++ b/core/unix/signal_linux_aarch64.c
@@ -253,16 +253,16 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full)
                          */
                         memcpy(&mc->simd[i].u32,
                                (byte *)sve + SVE_SIG_ZREG_OFFSET(quads_per_vector, i),
-                               sve->vl);
+                               SVE_SIG_ZREG_SIZE(quads_per_vector));
                         memcpy(&mc->simd[i].q, &fpc->vregs[i], sizeof(mc->simd->q));
                     }
                     for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) {
-                        memcpy(&mc->svep[i].u32,
+                        memcpy(&mc->svep[i].u16,
                                (byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i),
-                               sve->vl);
+                               SVE_SIG_PREG_SIZE(quads_per_vector));
                     }
                     memcpy(&mc->ffr, (byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector),
-                           sve->vl);
+                           SVE_SIG_FFR_SIZE(quads_per_vector));
                 }
                 break;
             }
@@ -316,9 +316,9 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc)
         }
         for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) {
             memcpy((byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i),
-                   &mc->svep[i].u32, sve->vl);
+                   &mc->svep[i].u16, sve->vl / 8);
         }
-        memcpy((byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), &mc->ffr, sve->vl);
+        memcpy((byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), &mc->ffr, sve->vl / 8);
 
         size_t offset = (proc_get_vector_length_bytes() * MCXT_NUM_SIMD_SVE_SLOTS) +
             ((proc_get_vector_length_bytes() / 8) * MCXT_NUM_SVEP_SLOTS) + 16;
diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c
index 18c698767a0..e3f1960b750 100644
--- a/ext/drx/scatter_gather_aarch64.c
+++ b/ext/drx/scatter_gather_aarch64.c
@@ -1803,7 +1803,7 @@ drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info,
             const size_t reg_num = spill_slot_state.pred_slots[slot].reg - DR_REG_P0;
 
             memcpy(&info->mcontext->svep[reg_num],
-                   &((char *)pt->scratch_pred_spill_slots)[pl_bytes * slot], vl_bytes);
+                   &((char *)pt->scratch_pred_spill_slots)[pl_bytes * slot], pl_bytes);
         }
     }
 
diff --git a/suite/tests/client-interface/cleancall-opt-shared.h b/suite/tests/client-interface/cleancall-opt-shared.h
index 6d2fc746a7b..58907c9e7cf 100644
--- a/suite/tests/client-interface/cleancall-opt-shared.h
+++ b/suite/tests/client-interface/cleancall-opt-shared.h
@@ -353,30 +353,55 @@ dump_diff_mcontexts(void)
                        after_reg.u32[6], after_reg.u32[7]);
         }
 #elif defined(AARCH64)
-        const size_t mmsz = proc_get_vector_length_bytes();
-        dr_simd_t before_reg, after_reg;
+        const size_t veclen_bytes = proc_get_vector_length_bytes();
+        const size_t predlen_bytes = veclen_bytes / 8;
         char reg_name[4];
-        if (i >= (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS)) {
-            strcpy(reg_name, "FFR");
-            before_reg = before_mcontext.ffr;
-            after_reg = after_mcontext.ffr;
-        } else if (i >= MCXT_NUM_SIMD_SVE_SLOTS) {
-            dr_snprintf(reg_name, 4, "P%2d", i - MCXT_NUM_SIMD_SVE_SLOTS);
-            before_reg = before_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS];
-            after_reg = after_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS];
+        const char *diff_str = NULL;
+        if (i >= MCXT_NUM_SIMD_SVE_SLOTS) {
+            dr_svep_t before_reg, after_reg;
+
+            if (i >= (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS)) {
+                strncpy(reg_name, "FFR", sizeof(reg_name));
+                before_reg = before_mcontext.ffr;
+                after_reg = after_mcontext.ffr;
+            } else {
+                dr_snprintf(reg_name, 4, "P%-2d", i - MCXT_NUM_SIMD_SVE_SLOTS);
+                before_reg = before_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS];
+                after_reg = after_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS];
+            }
+
+            diff_str =
+                (memcmp(&before_reg, &after_reg, predlen_bytes) == 0 ? ""
+                                                                     : " <- DIFFERS");
+
+            const size_t num_elements = predlen_bytes / sizeof(before_reg.u16[0]);
+            dr_fprintf(STDERR, "%s before: ", reg_name);
+            for (int element = 0; element < num_elements; element++) {
+                dr_fprintf(STDERR, "%04x", before_reg.u16[element]);
+            }
+            dr_fprintf(STDERR, " after: ");
+            for (int element = 0; element < num_elements; element++) {
+                dr_fprintf(STDERR, "%04x", after_reg.u16[element]);
+            }
+
         } else {
-            dr_snprintf(reg_name, 4, "Z%2d", i);
-            before_reg = before_mcontext.simd[i];
-            after_reg = after_mcontext.simd[i];
-        }
+            dr_snprintf(reg_name, 4, "Z%-2d", i);
+            dr_simd_t before_reg = before_mcontext.simd[i];
+            dr_simd_t after_reg = after_mcontext.simd[i];
 
-        const char *diff_str =
-            (memcmp(&before_reg, &after_reg, mmsz) == 0 ? "" : " <- DIFFERS");
+            diff_str =
+                (memcmp(&before_reg, &after_reg, veclen_bytes) == 0 ? "" : " <- DIFFERS");
 
-        dr_fprintf(STDERR, "%s before: %08x%08x%08x%08x", reg_name, before_reg.u32[0],
-                   before_reg.u32[1], before_reg.u32[2], before_reg.u32[3]);
-        dr_fprintf(STDERR, " after: %08x%08x%08x%08x", after_reg.u32[0], after_reg.u32[1],
-                   after_reg.u32[2], after_reg.u32[3]);
+            const size_t num_elements = veclen_bytes / sizeof(before_reg.u64[0]);
+            dr_fprintf(STDERR, "%s before: ", reg_name);
+            for (size_t element = 0; element < num_elements; element++) {
+                dr_fprintf(STDERR, PFMT, before_reg.u64[element]);
+            }
+            dr_fprintf(STDERR, " after: ");
+            for (size_t element = 0; element < num_elements; element++) {
+                dr_fprintf(STDERR, PFMT, after_reg.u64[element]);
+            }
+        }
 #endif
         dr_fprintf(STDERR, "%s\n", diff_str);
     }