diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index 87d04fcd549..2ba40d9a760 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -90,7 +90,7 @@ jobs: # We only use a non-zero build # when making multiple manual builds in one day. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi diff --git a/.github/workflows/ci-package.yml b/.github/workflows/ci-package.yml index 62e88e22689..9bff2302ddc 100644 --- a/.github/workflows/ci-package.yml +++ b/.github/workflows/ci-package.yml @@ -103,7 +103,7 @@ jobs: # We only use a non-zero build # when making multiple manual builds in one day. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -195,7 +195,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -283,7 +283,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -371,7 +371,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -451,7 +451,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -536,7 +536,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER="10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))" + export VERSION_NUMBER="10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))" export PREFIX="cronbuild-" else export VERSION_NUMBER=${{ github.event.inputs.version }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 9bf0a929dcb..2463024ee8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -575,7 +575,7 @@ endif (EXISTS "${PROJECT_SOURCE_DIR}/.svn") # N.B.: When updating this, update all the default versions in ci-package.yml # and ci-docs.yml. We should find a way to share (xref i#1565). -set(VERSION_NUMBER_DEFAULT "10.0.${VERSION_NUMBER_PATCHLEVEL}") +set(VERSION_NUMBER_DEFAULT "10.90.${VERSION_NUMBER_PATCHLEVEL}") # do not store the default VERSION_NUMBER in the cache to prevent a stale one # from preventing future version updates in a pre-existing build dir set(VERSION_NUMBER "" CACHE STRING "Version number: leave empty for default") diff --git a/api/docs/release.dox b/api/docs/release.dox index 2321dab8a4e..c2faec4ea75 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -147,6 +147,9 @@ changes: users sub-classing analyzer_tmpl_t). - Converted #dynamorio::drmemtrace::analysis_tool_tmpl_t::interval_state_snapshot_t into a class with all its data members marked private with public accessor functions. + - Changed the type of the AArch64 #dr_mcontext_t members svep and ffr to #dr_svep_t. + This breaks binary compatibility with clients that were built against versions of + DynamoRIO before this change. Further non-compatibility-affecting changes include: - Added DWARF-5 support to the drsyms library by linking in 4 static libraries diff --git a/core/arch/aarch64/aarch64.asm b/core/arch/aarch64/aarch64.asm index 7d770c2dfe1..1ea06f39de5 100644 --- a/core/arch/aarch64/aarch64.asm +++ b/core/arch/aarch64/aarch64.asm @@ -47,7 +47,7 @@ START_FILE #endif /* sizeof(priv_mcontext_t) rounded up to a multiple of 16 */ -#define PRIV_MCONTEXT_SIZE 3424 +#define PRIV_MCONTEXT_SIZE 2480 /* offsetof(spill_state_t, r0) */ #define spill_state_r0_OFFSET 0 @@ -69,7 +69,7 @@ START_FILE /* offsetof(priv_mcontext_t, simd) */ #define simd_OFFSET (16 * ARG_SZ*2 + 32) /* offsetof(dcontext_t, dstack) */ -#define dstack_OFFSET 0xda8 +#define dstack_OFFSET 0x9f8 /* offsetof(dcontext_t, is_exiting) */ #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ) /* offsetof(struct tlsdesc_t, arg) */ diff --git a/core/arch/aarch64/emit_utils.c b/core/arch/aarch64/emit_utils.c index cd7656f950f..478b58a5b32 100644 --- a/core/arch/aarch64/emit_utils.c +++ b/core/arch/aarch64/emit_utils.c @@ -576,7 +576,7 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) } if (proc_has_feature(FEATURE_SVE)) { for (i = 0; i < 32; i++) { - /* ldr z(i), [x1, #(i mul vl)] + /* ldr z(i), [x1, #(i * sizeof(dr_simd_t))] * From the SVE manual: * "Load a vector register from a memory address generated by a * 64-bit scalar base, plus an immediate offset in the range -256 @@ -584,11 +584,10 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) * in bytes." */ APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_Z0 + i), - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes())))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_Z0 + i), + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + i * sizeof(dr_simd_t), + OPSZ_SVE_VECLEN_BYTES))); } /* add x1, x(dcxt), #(offset svep) */ APP(ilist, @@ -599,40 +598,36 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) * register for FFR load below, then restored from svep afterwards. */ for (i = 0; i < 15; i++) { - /* ldr p(i), [x1, #(i mul vl)] */ + /* ldr p(i), [x1, #(i * sizeof(dr_svep_t))] */ APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_P0 + i), - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P0 + i), + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + i * sizeof(dr_svep_t), + OPSZ_SVE_PREDLEN_BYTES))); } /* There is no load instruction for the first-fault register (FFR). Use * a temporary predicate register to load: * add x2, x(dcxt), #(offset ffr) - * ldr p15, [x2, #(ffr)] + * ldr p15, [x2, #0] * wrffr p15.b - * ldr p15, [x1, #(15 mul vl)] + * ldr p15, [x1, #(15 * sizeof(dr_svep_t)] */ APP(ilist, XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X2), opnd_create_reg(REG_DCXT), OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr)))); APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_P15), - opnd_create_base_disp( - DR_REG_X2, DR_REG_NULL, 0, 0, - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp(DR_REG_X2, DR_REG_NULL, 0, 0, + OPSZ_SVE_PREDLEN_BYTES))); APP(ilist, INSTR_CREATE_wrffr_sve(dcontext, opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1))); APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_P15), - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, 15 * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + 15 * sizeof(dr_svep_t), + OPSZ_SVE_PREDLEN_BYTES))); } } @@ -791,19 +786,18 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) } if (proc_has_feature(FEATURE_SVE)) { for (i = 0; i < 32; i++) { - /* str z(i), [x1, #(i mul vl)] + /* str z(i), [x1, #(i * sizeof(dr_simd_t))] * "Store a vector register to a memory address generated by a * 64-bit scalar base, plus an immediate offset in the range -256 * to 255 which is multiplied by the current vector register size * in bytes." */ APP(ilist, - INSTR_CREATE_str( - dcontext, - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes())), - opnd_create_reg(DR_REG_Z0 + i))); + INSTR_CREATE_str(dcontext, + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + i * sizeof(dr_simd_t), + OPSZ_SVE_VECLEN_BYTES), + opnd_create_reg(DR_REG_Z0 + i))); } /* add x1, x(dcxt), #(off) */ APP(ilist, @@ -811,21 +805,20 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) opnd_create_reg(REG_DCXT), OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, svep)))); for (i = 0; i < 16; i++) { - /* str p(i), [x1, #(i mul vl)] */ + /* str p(i), [x1, #(i * sizeof(dr_svep_t))] */ APP(ilist, - INSTR_CREATE_str( - dcontext, - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)), - opnd_create_reg(DR_REG_P0 + i))); + INSTR_CREATE_str(dcontext, + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + i * sizeof(dr_svep_t), + OPSZ_SVE_PREDLEN_BYTES), + opnd_create_reg(DR_REG_P0 + i))); } /* There is no store instruction for the first-fault register (FFR). Use * a temporary predicate register to store: - * rdffr p15.b - * add x2, x(dcxt), #(offset ffr) - * str p15, [x2, #(ffr)] - * ldr p15, [x1, #(15 mul vl)] + * rdffr p15.b ; Read FFR to P15 + * add x2, x(dcxt), #(offset ffr) ; Calculate FFR dcxt offset + * str p15, [x2, #0] ; Save FFR to dcxt + * ldr p15, [x1, #(15 * sizeof(dr_svep_t))] ; Restore app P15 value from dcxt */ APP(ilist, INSTR_CREATE_rdffr_sve(dcontext, @@ -835,18 +828,15 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) opnd_create_reg(REG_DCXT), OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr)))); APP(ilist, - INSTR_CREATE_str( - dcontext, - opnd_create_base_disp( - DR_REG_X2, DR_REG_NULL, 0, 0, - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)), - opnd_create_reg(DR_REG_P15))); + INSTR_CREATE_str(dcontext, + opnd_create_base_disp(DR_REG_X2, DR_REG_NULL, 0, 0, + OPSZ_SVE_PREDLEN_BYTES), + opnd_create_reg(DR_REG_P15))); APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_P15), - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, 15 * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + 15 * sizeof(dr_svep_t), + OPSZ_SVE_PREDLEN_BYTES))); } } diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c index 2093db58251..1c1aae4d395 100644 --- a/core/arch/aarchxx/mangle.c +++ b/core/arch/aarchxx/mangle.c @@ -565,7 +565,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, DR_REG_Q0, SIMD_REG_TYPE); } - dstack_offs += MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t); + dstack_offs += (MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t)) + + (MCXT_NUM_SVEP_SLOTS * sizeof(dr_svep_t)) + + (MCXT_NUM_FFR_SLOTS * sizeof(dr_ffr_t)); /* Restore the registers we used. */ /* ldp x0, x1, [sp] */ @@ -577,6 +579,10 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_X2), opnd_create_base_disp(DR_REG_SP, DR_REG_NULL, 0, REG_OFFSET(DR_REG_X2), OPSZ_8))); + + /* Make dstack_offs 16-byte aligned. */ + dstack_offs = ALIGN_FORWARD(dstack_offs, get_ABI_stack_alignment()); + #else /* vstmdb always does writeback */ PRE(ilist, instr, @@ -655,9 +661,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, /* Make dstack_offs 8-byte algined, as we only accounted for 17 4-byte slots. */ dstack_offs += XSP_SZ; +#endif ASSERT(cci->skip_save_flags || cci->num_simd_skip != 0 || cci->num_regs_skip != 0 || dstack_offs == (uint)get_clean_call_switch_stack_size()); -#endif return dstack_offs; } @@ -678,8 +684,11 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_X0), opnd_create_reg(DR_REG_SP))); - current_offs = - get_clean_call_switch_stack_size() - (MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t)); + current_offs = ALIGN_BACKWARD(get_clean_call_switch_stack_size() - + (MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t)) - + (MCXT_NUM_SVEP_SLOTS * sizeof(dr_svep_t)) - + (MCXT_NUM_FFR_SLOTS * sizeof(dr_ffr_t)), + 16); /* add x0, x0, current_offs */ PRE(ilist, instr, diff --git a/core/arch/arch.c b/core/arch/arch.c index 655593e59c9..7cac9d47cdc 100644 --- a/core/arch/arch.c +++ b/core/arch/arch.c @@ -3825,21 +3825,50 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) } #elif defined(AARCHXX) { - int i, j; # ifdef AARCH64 - int words = proc_has_feature(FEATURE_SVE) ? 16 : 4; + const uint vector_length_bytes = + (proc_has_feature(FEATURE_SVE) ? proc_get_vector_length_bytes() + : opnd_size_in_bytes(reg_get_size(DR_REG_Q0))); + const uint u32_count = vector_length_bytes / sizeof(uint); + /* XXX: should be proc_num_simd_saved(). */ + const uint num_simd_regs = MCXT_NUM_SIMD_SVE_SLOTS; + const char *reg_prefix = proc_has_feature(FEATURE_SVE) ? "z" : "q"; # else - int words = 4; -# endif + const uint u32_count = + sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0]); /* XXX: should be proc_num_simd_saved(). */ - for (i = 0; i < proc_num_simd_registers(); i++) { - print_file(f, dump_xml ? "\t\tqd= \"0x" : "\tq%-3d= 0x", i); - for (j = 0; j < words; j++) { + const uint num_simd_regs = proc_num_simd_registers(); + const char *reg_prefix = "q"; +# endif + ASSERT(u32_count <= + sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0])); + for (uint i = 0; i < num_simd_regs; i++) { + print_file(f, dump_xml ? "\t\t%s%d= \"0x" : "\t%s%-3d= 0x", reg_prefix, i); + for (uint j = 0; j < u32_count; j++) { print_file(f, "%08x ", context->simd[i].u32[j]); } print_file(f, dump_xml ? "\"\n" : "\n"); } - /* TODO i#5365: SVE predicate registers and FFR dump. */ +# ifdef AARCH64 + if (proc_has_feature(FEATURE_SVE)) { + /* Dump SVE P registers */ + const uint pred_u16_count = + (proc_get_vector_length_bytes() / 8) / sizeof(ushort); + for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + print_file(f, dump_xml ? "\t\tp%d= \"0x" : "\tp%-3d= 0x", i); + for (size_t j = 0; j < pred_u16_count; j++) { + print_file(f, "%04x ", context->svep[i].u16[j]); + } + print_file(f, dump_xml ? "\"\n" : "\n"); + } + /* Dump SVE FFR register */ + print_file(f, dump_xml ? "\t\tffr= \"0x" : "\tffr = 0x"); + for (size_t j = 0; j < pred_u16_count; j++) { + print_file(f, "%04x ", context->ffr.u16[j]); + } + print_file(f, dump_xml ? "\"\n" : "\n"); + } +# endif } #endif diff --git a/core/arch/arm/arm.asm b/core/arch/arm/arm.asm index f58073eca67..5e7eaa5ad0f 100644 --- a/core/arch/arm/arm.asm +++ b/core/arch/arm/arm.asm @@ -60,17 +60,22 @@ DECL_EXTERN(initstack_mutex) #ifdef X64 # define MCXT_NUM_SIMD_SLOTS 32 -# define SIMD_REG_SIZE 16 +# define SIMD_REG_SIZE 64 +# define MCXT_NUM_PRED_SLOTS 17 /* P regs + FFR */ +# define PRED_REG_SIZE 8 # define NUM_GPR_SLOTS 33 /* incl flags */ # define GPR_REG_SIZE 8 #else # define MCXT_NUM_SIMD_SLOTS 16 # define SIMD_REG_SIZE 16 +# define MCXT_NUM_PRED_SLOTS 0 +# define PRED_REG_SIZE 0 # define NUM_GPR_SLOTS 17 /* incl flags */ # define GPR_REG_SIZE 4 #endif #define PRE_SIMD_PADDING 0 -#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE) +#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE \ + + MCXT_NUM_PRED_SLOTS*PRED_REG_SIZE) #define PRIV_MCXT_SIZE (NUM_GPR_SLOTS*GPR_REG_SIZE + PRIV_MCXT_SIMD_SIZE) #define PRIV_MCXT_SP_FROM_SIMD (-(4*GPR_REG_SIZE)) /* flags, pc, lr, then sp */ #define PRIV_MCXT_PC_FROM_SIMD (-(2*GPR_REG_SIZE)) /* flags, then pc */ diff --git a/core/ir/aarch64/instr.c b/core/ir/aarch64/instr.c index 0d02a8d0642..223f9dbb0e6 100644 --- a/core/ir/aarch64/instr.c +++ b/core/ir/aarch64/instr.c @@ -801,7 +801,7 @@ instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size const reg_t governing_pred = opnd_get_reg(instr_get_src(instr, 1)); ASSERT(governing_pred >= DR_REG_START_P && governing_pred <= DR_REG_STOP_P); - uint64 mask = mc->svep[governing_pred - DR_REG_START_P].d; + uint64 mask = mc->svep[governing_pred - DR_REG_START_P].u64[0]; if (mask == 0) { return false; diff --git a/core/lib/globals_api.h b/core/lib/globals_api.h index 659e32d0a2b..36d85dac065 100644 --- a/core/lib/globals_api.h +++ b/core/lib/globals_api.h @@ -688,10 +688,9 @@ typedef uint64 dr_opmask_t; #if defined(AARCHXX) /** - * 512-bit ARM Scalable Vector Extension (SVE) vector registers Zn and - * predicate registers Pn. Low 128 bits of Zn overlap with existing ARM - * Advanced SIMD (NEON) Vn registers. The SVE specification defines the - * following valid vector lengths: + * 512-bit ARM Scalable Vector Extension (SVE) vector registers Zn. + * Low 128 bits of Zn overlap with existing ARM Advanced SIMD (NEON) Vn registers. + * The SVE specification defines the following valid vector lengths: * 128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048 * We currently support 512-bit maximum due to DR's stack size limitation, * (machine context stored in the stack). In AArch64, align to 16 bytes for @@ -706,11 +705,28 @@ typedef union ALIGN_VAR(16) _dr_simd_t { uint s; /**< Singleword (32 bit, Sn) scalar element of Vn, Zn and Pn. */ uint64 d; /**< Doubleword (64 bit, Dn) scalar element of Vn, Zn and Pn. */ uint q[4]; /**< The full 128 bit Vn register, Qn as q[3]:q[2]:q[1]:q[0]. */ - uint u32[16]; /**< The full 512 bit Zn, Pn and FFR registers as Singleword (32-bit) - elements. */ - uint64 u64[8]; /**< The full 512 bit Zn, Pn and FFR registers as Doubleword (64-bit) - elements. */ + uint u32[16]; /**< The full 512 bit Zn register as Singleword (32-bit) elements. */ + uint64 u64[8]; /**< The full 512 bit Zn register as Doubleword (64-bit) elements. */ } dr_simd_t; + +/** + * 64-bit Arm Scalable Vector Extension (SVE) predicate register Pn. + * SVE Pn registers are used to hold mask values that control the operation of some SVE + * instructions. Pn registers have one bit for every byte of a Zn register to the size + * of a Pn register is always 1/8 the size of a Zn register. + * DynamoRIO currently supports up to 512-bit Zn registers and 64-bit Pn registers. + */ +typedef union _dr_svep_t { + ushort u16[4]; /**< The full 64-bit Pn or FFR register as 16-bit elements. */ + uint u32[2]; /**< The full 64-bit Pn or FFR register as 32-bit elements. */ + uint64 u64[1]; /**< The full 64-bit Pn or FFR register as 64-bit elements. */ +} dr_svep_t; + +/** + * 64-bit Arm Scalable Vector Extension (SVE) First Fault Register (FFR). + * FFR is a special purpose predicate register used by some SVE instructions. + */ +typedef dr_svep_t dr_ffr_t; # else typedef union _dr_simd_t { uint s[4]; /**< Representation as 4 32-bit Sn elements. */ @@ -720,7 +736,7 @@ typedef union _dr_simd_t { # endif # ifdef X64 # define MCXT_NUM_SIMD_SVE_SLOTS \ - 32 /**< Number of 128-bit SIMD Vn/Zn slots in dr_mcontext_t. \ + 32 /**< Number of 512-bit SIMD Vn/Zn slots in dr_mcontext_t. \ */ # define MCXT_NUM_SVEP_SLOTS 16 /**< Number of SIMD Pn slots in dr_mcontext_t. */ # define MCXT_NUM_FFR_SLOTS \ diff --git a/core/lib/mcxtx_api.h b/core/lib/mcxtx_api.h index e02543783a0..c74f5135aab 100644 --- a/core/lib/mcxtx_api.h +++ b/core/lib/mcxtx_api.h @@ -140,12 +140,12 @@ * The Arm AArch64 Scalable Vector Extension (SVE) predicate registers * DR_REG_P0 to DR_REG_P15. */ - dr_simd_t svep[MCXT_NUM_SVEP_SLOTS]; + dr_svep_t svep[MCXT_NUM_SVEP_SLOTS]; /** * The Arm AArch64 Scalable Vector Extension (SVE) first fault register * DR_REG_FFR, for vector load instrcutions. */ - dr_simd_t ffr; + dr_ffr_t ffr; # else /* * For the Arm AArch32 SIMD registers, we would probably be ok if we did diff --git a/core/unix/signal_linux_aarch64.c b/core/unix/signal_linux_aarch64.c index abd9fe92b64..24a61597a22 100644 --- a/core/unix/signal_linux_aarch64.c +++ b/core/unix/signal_linux_aarch64.c @@ -253,16 +253,16 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) */ memcpy(&mc->simd[i].u32, (byte *)sve + SVE_SIG_ZREG_OFFSET(quads_per_vector, i), - sve->vl); + SVE_SIG_ZREG_SIZE(quads_per_vector)); memcpy(&mc->simd[i].q, &fpc->vregs[i], sizeof(mc->simd->q)); } for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { - memcpy(&mc->svep[i].u32, + memcpy(&mc->svep[i].u16, (byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i), - sve->vl); + SVE_SIG_PREG_SIZE(quads_per_vector)); } memcpy(&mc->ffr, (byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), - sve->vl); + SVE_SIG_FFR_SIZE(quads_per_vector)); } break; } @@ -316,9 +316,9 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) } for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { memcpy((byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i), - &mc->svep[i].u32, sve->vl); + &mc->svep[i].u16, sve->vl / 8); } - memcpy((byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), &mc->ffr, sve->vl); + memcpy((byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), &mc->ffr, sve->vl / 8); size_t offset = (proc_get_vector_length_bytes() * MCXT_NUM_SIMD_SVE_SLOTS) + ((proc_get_vector_length_bytes() / 8) * MCXT_NUM_SVEP_SLOTS) + 16; diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c index 18c698767a0..e3f1960b750 100644 --- a/ext/drx/scatter_gather_aarch64.c +++ b/ext/drx/scatter_gather_aarch64.c @@ -1803,7 +1803,7 @@ drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, const size_t reg_num = spill_slot_state.pred_slots[slot].reg - DR_REG_P0; memcpy(&info->mcontext->svep[reg_num], - &((char *)pt->scratch_pred_spill_slots)[pl_bytes * slot], vl_bytes); + &((char *)pt->scratch_pred_spill_slots)[pl_bytes * slot], pl_bytes); } } diff --git a/suite/tests/client-interface/cleancall-opt-shared.h b/suite/tests/client-interface/cleancall-opt-shared.h index 6d2fc746a7b..58907c9e7cf 100644 --- a/suite/tests/client-interface/cleancall-opt-shared.h +++ b/suite/tests/client-interface/cleancall-opt-shared.h @@ -353,30 +353,55 @@ dump_diff_mcontexts(void) after_reg.u32[6], after_reg.u32[7]); } #elif defined(AARCH64) - const size_t mmsz = proc_get_vector_length_bytes(); - dr_simd_t before_reg, after_reg; + const size_t veclen_bytes = proc_get_vector_length_bytes(); + const size_t predlen_bytes = veclen_bytes / 8; char reg_name[4]; - if (i >= (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS)) { - strcpy(reg_name, "FFR"); - before_reg = before_mcontext.ffr; - after_reg = after_mcontext.ffr; - } else if (i >= MCXT_NUM_SIMD_SVE_SLOTS) { - dr_snprintf(reg_name, 4, "P%2d", i - MCXT_NUM_SIMD_SVE_SLOTS); - before_reg = before_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; - after_reg = after_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; + const char *diff_str = NULL; + if (i >= MCXT_NUM_SIMD_SVE_SLOTS) { + dr_svep_t before_reg, after_reg; + + if (i >= (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS)) { + strncpy(reg_name, "FFR", sizeof(reg_name)); + before_reg = before_mcontext.ffr; + after_reg = after_mcontext.ffr; + } else { + dr_snprintf(reg_name, 4, "P%-2d", i - MCXT_NUM_SIMD_SVE_SLOTS); + before_reg = before_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; + after_reg = after_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; + } + + diff_str = + (memcmp(&before_reg, &after_reg, predlen_bytes) == 0 ? "" + : " <- DIFFERS"); + + const size_t num_elements = predlen_bytes / sizeof(before_reg.u16[0]); + dr_fprintf(STDERR, "%s before: ", reg_name); + for (int element = 0; element < num_elements; element++) { + dr_fprintf(STDERR, "%04x", before_reg.u16[element]); + } + dr_fprintf(STDERR, " after: "); + for (int element = 0; element < num_elements; element++) { + dr_fprintf(STDERR, "%04x", after_reg.u16[element]); + } + } else { - dr_snprintf(reg_name, 4, "Z%2d", i); - before_reg = before_mcontext.simd[i]; - after_reg = after_mcontext.simd[i]; - } + dr_snprintf(reg_name, 4, "Z%-2d", i); + dr_simd_t before_reg = before_mcontext.simd[i]; + dr_simd_t after_reg = after_mcontext.simd[i]; - const char *diff_str = - (memcmp(&before_reg, &after_reg, mmsz) == 0 ? "" : " <- DIFFERS"); + diff_str = + (memcmp(&before_reg, &after_reg, veclen_bytes) == 0 ? "" : " <- DIFFERS"); - dr_fprintf(STDERR, "%s before: %08x%08x%08x%08x", reg_name, before_reg.u32[0], - before_reg.u32[1], before_reg.u32[2], before_reg.u32[3]); - dr_fprintf(STDERR, " after: %08x%08x%08x%08x", after_reg.u32[0], after_reg.u32[1], - after_reg.u32[2], after_reg.u32[3]); + const size_t num_elements = veclen_bytes / sizeof(before_reg.u64[0]); + dr_fprintf(STDERR, "%s before: ", reg_name); + for (size_t element = 0; element < num_elements; element++) { + dr_fprintf(STDERR, PFMT, before_reg.u64[element]); + } + dr_fprintf(STDERR, " after: "); + for (size_t element = 0; element < num_elements; element++) { + dr_fprintf(STDERR, PFMT, after_reg.u64[element]); + } + } #endif dr_fprintf(STDERR, "%s\n", diff_str); }