From 2f8bdd958fdcf3bb4f44ca0b9bb39c6a9cab43ce Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Tue, 9 Apr 2024 13:49:58 +0200 Subject: [PATCH 1/8] i#6760 AArch64: Use smaller data types for SVE P and FFR registers PR #6757 fixed the way we read/write SVE register slots but unfortunately it is now broken on systems with 128-bit vector length. Both SVE vector and predicate registers use dr_simd_t slots which is a 64-byte type meant to store up to 512-bit vector registers. SVE predicate registers are always 1/8 the size of the vector register so for 512-bit vector length systems we only really need 64 / 8 = 8 bytes to store predicate registers. The ldr/str instructions we use to read and write the predicate register slots have a base+offset memory operand where the offset is a value in the range [-256, 255] scaled based by predicate register length. We read and write the registers by setting the base address to the address of the first slot, and setting the offset to n * sizeof(dr_simd_t) for each register Pn. For systems with 128-bit vector length, this means the predicate registers are 16 / 8 = 2 bytes so the maximum offset we can reach is 2 * 255 = 510 bytes. This means on 128-bit VL systems we can only reach the first 8 predicate registers (8 * sizeof(dr_simd_t) = 512). By changing the predicate register and FFR slots to use a new type dr_svep_t which is 1/8 the size of dr_simd_t we can fix this bug and save space. dr_svep_t is currently 8 bytes to correspond to 64 byte vectors, but even if we extend DynamoRIO to support the maximum SVE vector length of 2048-bits (256 bytes) dr_svep_t will only need to be increased to 256 / 8 = 32 bytes so the maximum offset (15 * 32 = 480 bytes) will always be in range. As this changes the size of the predicate register and FFR slots, this changes the size of the dr_mcontext_t structure and breaks backwards compatibility with earlier versions of DynamoRIO so the version number is increased to 10.90. Issues: #6760, #5365 Fixes: #6760 --- .github/workflows/ci-docs.yml | 2 +- .github/workflows/ci-package.yml | 12 +-- CMakeLists.txt | 2 +- api/docs/release.dox | 3 + core/arch/aarch64/aarch64.asm | 4 +- core/arch/aarch64/emit_utils.c | 92 +++++++++---------- core/arch/aarchxx/mangle.c | 17 +++- core/arch/arch.c | 45 +++++++-- core/arch/arm/arm.asm | 8 +- core/ir/aarch64/instr.c | 2 +- core/lib/globals_api.h | 34 +++++-- core/lib/mcxtx_api.h | 4 +- core/unix/signal_linux_aarch64.c | 10 +- ext/drx/scatter_gather_aarch64.c | 2 +- .../client-interface/cleancall-opt-shared.h | 65 +++++++++---- 15 files changed, 188 insertions(+), 114 deletions(-) diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index 87d04fcd549..2ba40d9a760 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -90,7 +90,7 @@ jobs: # We only use a non-zero build # when making multiple manual builds in one day. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi diff --git a/.github/workflows/ci-package.yml b/.github/workflows/ci-package.yml index 62e88e22689..9bff2302ddc 100644 --- a/.github/workflows/ci-package.yml +++ b/.github/workflows/ci-package.yml @@ -103,7 +103,7 @@ jobs: # We only use a non-zero build # when making multiple manual builds in one day. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -195,7 +195,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -283,7 +283,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -371,7 +371,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -451,7 +451,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -536,7 +536,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER="10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))" + export VERSION_NUMBER="10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))" export PREFIX="cronbuild-" else export VERSION_NUMBER=${{ github.event.inputs.version }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 9bf0a929dcb..2463024ee8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -575,7 +575,7 @@ endif (EXISTS "${PROJECT_SOURCE_DIR}/.svn") # N.B.: When updating this, update all the default versions in ci-package.yml # and ci-docs.yml. We should find a way to share (xref i#1565). -set(VERSION_NUMBER_DEFAULT "10.0.${VERSION_NUMBER_PATCHLEVEL}") +set(VERSION_NUMBER_DEFAULT "10.90.${VERSION_NUMBER_PATCHLEVEL}") # do not store the default VERSION_NUMBER in the cache to prevent a stale one # from preventing future version updates in a pre-existing build dir set(VERSION_NUMBER "" CACHE STRING "Version number: leave empty for default") diff --git a/api/docs/release.dox b/api/docs/release.dox index 2321dab8a4e..c2faec4ea75 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -147,6 +147,9 @@ changes: users sub-classing analyzer_tmpl_t). - Converted #dynamorio::drmemtrace::analysis_tool_tmpl_t::interval_state_snapshot_t into a class with all its data members marked private with public accessor functions. + - Changed the type of the AArch64 #dr_mcontext_t members svep and ffr to #dr_svep_t. + This breaks binary compatibility with clients that were built against versions of + DynamoRIO before this change. Further non-compatibility-affecting changes include: - Added DWARF-5 support to the drsyms library by linking in 4 static libraries diff --git a/core/arch/aarch64/aarch64.asm b/core/arch/aarch64/aarch64.asm index 7d770c2dfe1..1ea06f39de5 100644 --- a/core/arch/aarch64/aarch64.asm +++ b/core/arch/aarch64/aarch64.asm @@ -47,7 +47,7 @@ START_FILE #endif /* sizeof(priv_mcontext_t) rounded up to a multiple of 16 */ -#define PRIV_MCONTEXT_SIZE 3424 +#define PRIV_MCONTEXT_SIZE 2480 /* offsetof(spill_state_t, r0) */ #define spill_state_r0_OFFSET 0 @@ -69,7 +69,7 @@ START_FILE /* offsetof(priv_mcontext_t, simd) */ #define simd_OFFSET (16 * ARG_SZ*2 + 32) /* offsetof(dcontext_t, dstack) */ -#define dstack_OFFSET 0xda8 +#define dstack_OFFSET 0x9f8 /* offsetof(dcontext_t, is_exiting) */ #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ) /* offsetof(struct tlsdesc_t, arg) */ diff --git a/core/arch/aarch64/emit_utils.c b/core/arch/aarch64/emit_utils.c index cd7656f950f..e12781aae8b 100644 --- a/core/arch/aarch64/emit_utils.c +++ b/core/arch/aarch64/emit_utils.c @@ -576,7 +576,7 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) } if (proc_has_feature(FEATURE_SVE)) { for (i = 0; i < 32; i++) { - /* ldr z(i), [x1, #(i mul vl)] + /* ldr z(i), [x1, #(i * sizeof(dr_simd_t))] * From the SVE manual: * "Load a vector register from a memory address generated by a * 64-bit scalar base, plus an immediate offset in the range -256 @@ -584,11 +584,10 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) * in bytes." */ APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_Z0 + i), - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes())))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_Z0 + i), + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + i * sizeof(dr_simd_t), + OPSZ_SVE_VECLEN_BYTES))); } /* add x1, x(dcxt), #(offset svep) */ APP(ilist, @@ -599,40 +598,36 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) * register for FFR load below, then restored from svep afterwards. */ for (i = 0; i < 15; i++) { - /* ldr p(i), [x1, #(i mul vl)] */ + /* ldr p(i), [x1, #(i * sizeof(dr_svep_t))] */ APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_P0 + i), - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P0 + i), + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + i * sizeof(dr_svep_t), + OPSZ_SVE_PREDLEN_BYTES))); } /* There is no load instruction for the first-fault register (FFR). Use * a temporary predicate register to load: * add x2, x(dcxt), #(offset ffr) - * ldr p15, [x2, #(ffr)] + * ldr p15, [x2, #0] * wrffr p15.b - * ldr p15, [x1, #(15 mul vl)] + * ldr p15, [x1, #(15 * sizeof(dr_svep_t)] */ APP(ilist, XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X2), opnd_create_reg(REG_DCXT), OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr)))); APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_P15), - opnd_create_base_disp( - DR_REG_X2, DR_REG_NULL, 0, 0, - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp(DR_REG_X2, DR_REG_NULL, 0, 0, + OPSZ_SVE_PREDLEN_BYTES))); APP(ilist, INSTR_CREATE_wrffr_sve(dcontext, opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1))); APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_P15), - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, 15 * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + 15 * sizeof(dr_svep_t), + OPSZ_SVE_PREDLEN_BYTES))); } } @@ -791,19 +786,18 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) } if (proc_has_feature(FEATURE_SVE)) { for (i = 0; i < 32; i++) { - /* str z(i), [x1, #(i mul vl)] + /* str z(i), [x1, #(i * sizeof(dr_simd_t))] * "Store a vector register to a memory address generated by a * 64-bit scalar base, plus an immediate offset in the range -256 * to 255 which is multiplied by the current vector register size * in bytes." */ APP(ilist, - INSTR_CREATE_str( - dcontext, - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes())), - opnd_create_reg(DR_REG_Z0 + i))); + INSTR_CREATE_str(dcontext, + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + i * sizeof(dr_simd_t), + OPSZ_SVE_VECLEN_BYTES), + opnd_create_reg(DR_REG_Z0 + i))); } /* add x1, x(dcxt), #(off) */ APP(ilist, @@ -811,21 +805,20 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) opnd_create_reg(REG_DCXT), OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, svep)))); for (i = 0; i < 16; i++) { - /* str p(i), [x1, #(i mul vl)] */ + /* str p(i), [x1, #(i * sizeof(dr_svep_t))] */ APP(ilist, - INSTR_CREATE_str( - dcontext, - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)), - opnd_create_reg(DR_REG_P0 + i))); + INSTR_CREATE_str(dcontext, + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + i * sizeof(dr_svep_t), + OPSZ_SVE_PREDLEN_BYTES), + opnd_create_reg(DR_REG_P0 + i))); } /* There is no store instruction for the first-fault register (FFR). Use * a temporary predicate register to store: * rdffr p15.b * add x2, x(dcxt), #(offset ffr) - * str p15, [x2, #(ffr)] - * ldr p15, [x1, #(15 mul vl)] + * str p15, [x2, #0] + * ldr p15, [x1, #(15 * sizeof(dr_svep_t))] */ APP(ilist, INSTR_CREATE_rdffr_sve(dcontext, @@ -835,18 +828,15 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) opnd_create_reg(REG_DCXT), OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr)))); APP(ilist, - INSTR_CREATE_str( - dcontext, - opnd_create_base_disp( - DR_REG_X2, DR_REG_NULL, 0, 0, - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)), - opnd_create_reg(DR_REG_P15))); + INSTR_CREATE_str(dcontext, + opnd_create_base_disp(DR_REG_X2, DR_REG_NULL, 0, 0, + OPSZ_SVE_PREDLEN_BYTES), + opnd_create_reg(DR_REG_P15))); APP(ilist, - INSTR_CREATE_ldr( - dcontext, opnd_create_reg(DR_REG_P15), - opnd_create_base_disp( - DR_REG_X1, DR_REG_NULL, 0, 15 * sizeof(dr_simd_t), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, + 15 * sizeof(dr_svep_t), + OPSZ_SVE_PREDLEN_BYTES))); } } diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c index 2093db58251..1c1aae4d395 100644 --- a/core/arch/aarchxx/mangle.c +++ b/core/arch/aarchxx/mangle.c @@ -565,7 +565,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, DR_REG_Q0, SIMD_REG_TYPE); } - dstack_offs += MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t); + dstack_offs += (MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t)) + + (MCXT_NUM_SVEP_SLOTS * sizeof(dr_svep_t)) + + (MCXT_NUM_FFR_SLOTS * sizeof(dr_ffr_t)); /* Restore the registers we used. */ /* ldp x0, x1, [sp] */ @@ -577,6 +579,10 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_X2), opnd_create_base_disp(DR_REG_SP, DR_REG_NULL, 0, REG_OFFSET(DR_REG_X2), OPSZ_8))); + + /* Make dstack_offs 16-byte aligned. */ + dstack_offs = ALIGN_FORWARD(dstack_offs, get_ABI_stack_alignment()); + #else /* vstmdb always does writeback */ PRE(ilist, instr, @@ -655,9 +661,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, /* Make dstack_offs 8-byte algined, as we only accounted for 17 4-byte slots. */ dstack_offs += XSP_SZ; +#endif ASSERT(cci->skip_save_flags || cci->num_simd_skip != 0 || cci->num_regs_skip != 0 || dstack_offs == (uint)get_clean_call_switch_stack_size()); -#endif return dstack_offs; } @@ -678,8 +684,11 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_X0), opnd_create_reg(DR_REG_SP))); - current_offs = - get_clean_call_switch_stack_size() - (MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t)); + current_offs = ALIGN_BACKWARD(get_clean_call_switch_stack_size() - + (MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t)) - + (MCXT_NUM_SVEP_SLOTS * sizeof(dr_svep_t)) - + (MCXT_NUM_FFR_SLOTS * sizeof(dr_ffr_t)), + 16); /* add x0, x0, current_offs */ PRE(ilist, instr, diff --git a/core/arch/arch.c b/core/arch/arch.c index 655593e59c9..d3573560b3a 100644 --- a/core/arch/arch.c +++ b/core/arch/arch.c @@ -3825,21 +3825,48 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) } #elif defined(AARCHXX) { - int i, j; # ifdef AARCH64 - int words = proc_has_feature(FEATURE_SVE) ? 16 : 4; + const uint elements = proc_has_feature(FEATURE_SVE) + ? (proc_get_vector_length_bytes() / sizeof(uint64)) + : 2; + /* XXX: should be proc_num_simd_saved(). */ + const uint num_simd_regs = MCXT_NUM_SIMD_SVE_SLOTS; + const char *reg_prefix = proc_has_feature(FEATURE_SVE) ? "z" : "q"; # else - int words = 4; -# endif + const uint elements = 2; /* XXX: should be proc_num_simd_saved(). */ - for (i = 0; i < proc_num_simd_registers(); i++) { - print_file(f, dump_xml ? "\t\tqd= \"0x" : "\tq%-3d= 0x", i); - for (j = 0; j < words; j++) { - print_file(f, "%08x ", context->simd[i].u32[j]); + const uint num_simd_regs = proc_num_simd_registers(); + const char *reg_prefix = "q"; +# endif + ASSERT(elements <= + sizeof(context->simd[0].u64) / sizeof(context->simd[0].u64[0])); + for (uint i = 0; i < num_simd_regs; i++) { + print_file(f, dump_xml ? "\t\t%s%d= \"0x" : "\t%s%-3d= 0x", reg_prefix, i); + for (uint j = 0; j < elements; j++) { + print_file(f, PFMT " ", context->simd[i].u64[j]); } print_file(f, dump_xml ? "\"\n" : "\n"); } - /* TODO i#5365: SVE predicate registers and FFR dump. */ +# ifdef AARCH64 + if (proc_has_feature(FEATURE_SVE)) { + /* Dump SVE P registers */ + const uint pred_u16_elements = + (proc_get_vector_length_bytes() / 8) / sizeof(ushort); + for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + print_file(f, dump_xml ? "\t\tp%d= \"0x" : "\tp%-3d= 0x", i); + for (size_t j = 0; j < pred_u16_elements; j++) { + print_file(f, "%04x ", context->svep[i].u16[j]); + } + print_file(f, dump_xml ? "\"\n" : "\n"); + } + /* Dump SVE FFR register */ + print_file(f, dump_xml ? "\t\tffr= \"0x" : "\tffr = 0x"); + for (size_t j = 0; j < pred_u16_elements; j++) { + print_file(f, "%04x ", context->ffr.u16[j]); + } + print_file(f, dump_xml ? "\"\n" : "\n"); + } +# endif } #endif diff --git a/core/arch/arm/arm.asm b/core/arch/arm/arm.asm index f58073eca67..6f385e88b16 100644 --- a/core/arch/arm/arm.asm +++ b/core/arch/arm/arm.asm @@ -60,17 +60,21 @@ DECL_EXTERN(initstack_mutex) #ifdef X64 # define MCXT_NUM_SIMD_SLOTS 32 -# define SIMD_REG_SIZE 16 +# define SIMD_REG_SIZE 64 +# define MCXT_NUM_PRED_SLOTS 17 /* P regs + FFR */ +# define PRED_REG_SIZE 8 # define NUM_GPR_SLOTS 33 /* incl flags */ # define GPR_REG_SIZE 8 #else # define MCXT_NUM_SIMD_SLOTS 16 # define SIMD_REG_SIZE 16 +# define MCXT_NUM_PRED_SLOTS 0 +# define PRED_REG_SIZE 0 # define NUM_GPR_SLOTS 17 /* incl flags */ # define GPR_REG_SIZE 4 #endif #define PRE_SIMD_PADDING 0 -#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE) +#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE + MCXT_NUM_PRED_SLOTS*PRED_REG_SIZE) #define PRIV_MCXT_SIZE (NUM_GPR_SLOTS*GPR_REG_SIZE + PRIV_MCXT_SIMD_SIZE) #define PRIV_MCXT_SP_FROM_SIMD (-(4*GPR_REG_SIZE)) /* flags, pc, lr, then sp */ #define PRIV_MCXT_PC_FROM_SIMD (-(2*GPR_REG_SIZE)) /* flags, then pc */ diff --git a/core/ir/aarch64/instr.c b/core/ir/aarch64/instr.c index 0d02a8d0642..223f9dbb0e6 100644 --- a/core/ir/aarch64/instr.c +++ b/core/ir/aarch64/instr.c @@ -801,7 +801,7 @@ instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size const reg_t governing_pred = opnd_get_reg(instr_get_src(instr, 1)); ASSERT(governing_pred >= DR_REG_START_P && governing_pred <= DR_REG_STOP_P); - uint64 mask = mc->svep[governing_pred - DR_REG_START_P].d; + uint64 mask = mc->svep[governing_pred - DR_REG_START_P].u64[0]; if (mask == 0) { return false; diff --git a/core/lib/globals_api.h b/core/lib/globals_api.h index 659e32d0a2b..36d85dac065 100644 --- a/core/lib/globals_api.h +++ b/core/lib/globals_api.h @@ -688,10 +688,9 @@ typedef uint64 dr_opmask_t; #if defined(AARCHXX) /** - * 512-bit ARM Scalable Vector Extension (SVE) vector registers Zn and - * predicate registers Pn. Low 128 bits of Zn overlap with existing ARM - * Advanced SIMD (NEON) Vn registers. The SVE specification defines the - * following valid vector lengths: + * 512-bit ARM Scalable Vector Extension (SVE) vector registers Zn. + * Low 128 bits of Zn overlap with existing ARM Advanced SIMD (NEON) Vn registers. + * The SVE specification defines the following valid vector lengths: * 128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048 * We currently support 512-bit maximum due to DR's stack size limitation, * (machine context stored in the stack). In AArch64, align to 16 bytes for @@ -706,11 +705,28 @@ typedef union ALIGN_VAR(16) _dr_simd_t { uint s; /**< Singleword (32 bit, Sn) scalar element of Vn, Zn and Pn. */ uint64 d; /**< Doubleword (64 bit, Dn) scalar element of Vn, Zn and Pn. */ uint q[4]; /**< The full 128 bit Vn register, Qn as q[3]:q[2]:q[1]:q[0]. */ - uint u32[16]; /**< The full 512 bit Zn, Pn and FFR registers as Singleword (32-bit) - elements. */ - uint64 u64[8]; /**< The full 512 bit Zn, Pn and FFR registers as Doubleword (64-bit) - elements. */ + uint u32[16]; /**< The full 512 bit Zn register as Singleword (32-bit) elements. */ + uint64 u64[8]; /**< The full 512 bit Zn register as Doubleword (64-bit) elements. */ } dr_simd_t; + +/** + * 64-bit Arm Scalable Vector Extension (SVE) predicate register Pn. + * SVE Pn registers are used to hold mask values that control the operation of some SVE + * instructions. Pn registers have one bit for every byte of a Zn register to the size + * of a Pn register is always 1/8 the size of a Zn register. + * DynamoRIO currently supports up to 512-bit Zn registers and 64-bit Pn registers. + */ +typedef union _dr_svep_t { + ushort u16[4]; /**< The full 64-bit Pn or FFR register as 16-bit elements. */ + uint u32[2]; /**< The full 64-bit Pn or FFR register as 32-bit elements. */ + uint64 u64[1]; /**< The full 64-bit Pn or FFR register as 64-bit elements. */ +} dr_svep_t; + +/** + * 64-bit Arm Scalable Vector Extension (SVE) First Fault Register (FFR). + * FFR is a special purpose predicate register used by some SVE instructions. + */ +typedef dr_svep_t dr_ffr_t; # else typedef union _dr_simd_t { uint s[4]; /**< Representation as 4 32-bit Sn elements. */ @@ -720,7 +736,7 @@ typedef union _dr_simd_t { # endif # ifdef X64 # define MCXT_NUM_SIMD_SVE_SLOTS \ - 32 /**< Number of 128-bit SIMD Vn/Zn slots in dr_mcontext_t. \ + 32 /**< Number of 512-bit SIMD Vn/Zn slots in dr_mcontext_t. \ */ # define MCXT_NUM_SVEP_SLOTS 16 /**< Number of SIMD Pn slots in dr_mcontext_t. */ # define MCXT_NUM_FFR_SLOTS \ diff --git a/core/lib/mcxtx_api.h b/core/lib/mcxtx_api.h index e02543783a0..c74f5135aab 100644 --- a/core/lib/mcxtx_api.h +++ b/core/lib/mcxtx_api.h @@ -140,12 +140,12 @@ * The Arm AArch64 Scalable Vector Extension (SVE) predicate registers * DR_REG_P0 to DR_REG_P15. */ - dr_simd_t svep[MCXT_NUM_SVEP_SLOTS]; + dr_svep_t svep[MCXT_NUM_SVEP_SLOTS]; /** * The Arm AArch64 Scalable Vector Extension (SVE) first fault register * DR_REG_FFR, for vector load instrcutions. */ - dr_simd_t ffr; + dr_ffr_t ffr; # else /* * For the Arm AArch32 SIMD registers, we would probably be ok if we did diff --git a/core/unix/signal_linux_aarch64.c b/core/unix/signal_linux_aarch64.c index abd9fe92b64..3f13be57943 100644 --- a/core/unix/signal_linux_aarch64.c +++ b/core/unix/signal_linux_aarch64.c @@ -257,12 +257,12 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) memcpy(&mc->simd[i].q, &fpc->vregs[i], sizeof(mc->simd->q)); } for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { - memcpy(&mc->svep[i].u32, + memcpy(&mc->svep[i].u16, (byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i), - sve->vl); + sve->vl / 8); } memcpy(&mc->ffr, (byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), - sve->vl); + sve->vl / 8); } break; } @@ -316,9 +316,9 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) } for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { memcpy((byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i), - &mc->svep[i].u32, sve->vl); + &mc->svep[i].u16, sve->vl / 8); } - memcpy((byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), &mc->ffr, sve->vl); + memcpy((byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), &mc->ffr, sve->vl / 8); size_t offset = (proc_get_vector_length_bytes() * MCXT_NUM_SIMD_SVE_SLOTS) + ((proc_get_vector_length_bytes() / 8) * MCXT_NUM_SVEP_SLOTS) + 16; diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c index 18c698767a0..e3f1960b750 100644 --- a/ext/drx/scatter_gather_aarch64.c +++ b/ext/drx/scatter_gather_aarch64.c @@ -1803,7 +1803,7 @@ drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, const size_t reg_num = spill_slot_state.pred_slots[slot].reg - DR_REG_P0; memcpy(&info->mcontext->svep[reg_num], - &((char *)pt->scratch_pred_spill_slots)[pl_bytes * slot], vl_bytes); + &((char *)pt->scratch_pred_spill_slots)[pl_bytes * slot], pl_bytes); } } diff --git a/suite/tests/client-interface/cleancall-opt-shared.h b/suite/tests/client-interface/cleancall-opt-shared.h index 6d2fc746a7b..831b930206a 100644 --- a/suite/tests/client-interface/cleancall-opt-shared.h +++ b/suite/tests/client-interface/cleancall-opt-shared.h @@ -353,30 +353,55 @@ dump_diff_mcontexts(void) after_reg.u32[6], after_reg.u32[7]); } #elif defined(AARCH64) - const size_t mmsz = proc_get_vector_length_bytes(); - dr_simd_t before_reg, after_reg; + const size_t veclen_bytes = proc_get_vector_length_bytes(); + const size_t predlen_bytes = veclen_bytes / 8; char reg_name[4]; - if (i >= (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS)) { - strcpy(reg_name, "FFR"); - before_reg = before_mcontext.ffr; - after_reg = after_mcontext.ffr; - } else if (i >= MCXT_NUM_SIMD_SVE_SLOTS) { - dr_snprintf(reg_name, 4, "P%2d", i - MCXT_NUM_SIMD_SVE_SLOTS); - before_reg = before_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; - after_reg = after_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; + const char *diff_str = NULL; + if (i >= MCXT_NUM_SIMD_SVE_SLOTS) { + dr_svep_t before_reg, after_reg; + + if (i >= (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS)) { + strcpy(reg_name, "FFR"); + before_reg = before_mcontext.ffr; + after_reg = after_mcontext.ffr; + } else { + dr_snprintf(reg_name, 4, "P%-2d", i - MCXT_NUM_SIMD_SVE_SLOTS); + before_reg = before_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; + after_reg = after_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; + } + + diff_str = + (memcmp(&before_reg, &after_reg, predlen_bytes) == 0 ? "" + : " <- DIFFERS"); + + const size_t num_elements = predlen_bytes / sizeof(before_reg.u16[0]); + dr_fprintf(STDERR, "%s before: ", reg_name); + for (int element = 0; element < num_elements; element++) { + dr_fprintf(STDERR, "%04x", before_reg.u16[element]); + } + dr_fprintf(STDERR, " after: "); + for (int element = 0; element < num_elements; element++) { + dr_fprintf(STDERR, "%04x", after_reg.u16[element]); + } + } else { - dr_snprintf(reg_name, 4, "Z%2d", i); - before_reg = before_mcontext.simd[i]; - after_reg = after_mcontext.simd[i]; - } + dr_snprintf(reg_name, 4, "Z%-2d", i); + dr_simd_t before_reg = before_mcontext.simd[i]; + dr_simd_t after_reg = after_mcontext.simd[i]; - const char *diff_str = - (memcmp(&before_reg, &after_reg, mmsz) == 0 ? "" : " <- DIFFERS"); + diff_str = + (memcmp(&before_reg, &after_reg, veclen_bytes) == 0 ? "" : " <- DIFFERS"); - dr_fprintf(STDERR, "%s before: %08x%08x%08x%08x", reg_name, before_reg.u32[0], - before_reg.u32[1], before_reg.u32[2], before_reg.u32[3]); - dr_fprintf(STDERR, " after: %08x%08x%08x%08x", after_reg.u32[0], after_reg.u32[1], - after_reg.u32[2], after_reg.u32[3]); + const size_t num_elements = veclen_bytes / sizeof(before_reg.u64[0]); + dr_fprintf(STDERR, "%s before: ", reg_name); + for (size_t element = 0; element < num_elements; element++) { + dr_fprintf(STDERR, PFMT, before_reg.u64[element]); + } + dr_fprintf(STDERR, " after: "); + for (size_t element = 0; element < num_elements; element++) { + dr_fprintf(STDERR, PFMT, after_reg.u64[element]); + } + } #endif dr_fprintf(STDERR, "%s\n", diff_str); } From 2ecba024232f30ea3a9541af85d577b99c6c11ed Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Fri, 12 Apr 2024 11:50:51 +0000 Subject: [PATCH 2/8] Fix aarch32 build --- core/arch/arch.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/arch/arch.c b/core/arch/arch.c index d3573560b3a..c2abce397b9 100644 --- a/core/arch/arch.c +++ b/core/arch/arch.c @@ -3827,23 +3827,23 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) { # ifdef AARCH64 const uint elements = proc_has_feature(FEATURE_SVE) - ? (proc_get_vector_length_bytes() / sizeof(uint64)) - : 2; + ? (proc_get_vector_length_bytes() / sizeof(uint)) + : 4; /* XXX: should be proc_num_simd_saved(). */ const uint num_simd_regs = MCXT_NUM_SIMD_SVE_SLOTS; const char *reg_prefix = proc_has_feature(FEATURE_SVE) ? "z" : "q"; # else - const uint elements = 2; + const uint elements = 4; /* XXX: should be proc_num_simd_saved(). */ const uint num_simd_regs = proc_num_simd_registers(); const char *reg_prefix = "q"; # endif ASSERT(elements <= - sizeof(context->simd[0].u64) / sizeof(context->simd[0].u64[0])); + sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0])); for (uint i = 0; i < num_simd_regs; i++) { print_file(f, dump_xml ? "\t\t%s%d= \"0x" : "\t%s%-3d= 0x", reg_prefix, i); for (uint j = 0; j < elements; j++) { - print_file(f, PFMT " ", context->simd[i].u64[j]); + print_file(f, PFMT " ", context->simd[i].u32[j]); } print_file(f, dump_xml ? "\"\n" : "\n"); } From a91b96d05536df0015d545a34281eb5dbf9495f5 Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Fri, 12 Apr 2024 11:59:41 +0000 Subject: [PATCH 3/8] Fix format --- core/arch/arch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/arch/arch.c b/core/arch/arch.c index c2abce397b9..9b8b8426300 100644 --- a/core/arch/arch.c +++ b/core/arch/arch.c @@ -3843,7 +3843,7 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) for (uint i = 0; i < num_simd_regs; i++) { print_file(f, dump_xml ? "\t\t%s%d= \"0x" : "\t%s%-3d= 0x", reg_prefix, i); for (uint j = 0; j < elements; j++) { - print_file(f, PFMT " ", context->simd[i].u32[j]); + print_file(f, "%08x ", context->simd[i].u32[j]); } print_file(f, dump_xml ? "\"\n" : "\n"); } From b51bf2dfd86d7b85026fae53129a5d99e52cf65c Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Mon, 15 Apr 2024 15:56:48 +0000 Subject: [PATCH 4/8] Add comment --- core/arch/aarch64/emit_utils.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/arch/aarch64/emit_utils.c b/core/arch/aarch64/emit_utils.c index e12781aae8b..478b58a5b32 100644 --- a/core/arch/aarch64/emit_utils.c +++ b/core/arch/aarch64/emit_utils.c @@ -815,10 +815,10 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) } /* There is no store instruction for the first-fault register (FFR). Use * a temporary predicate register to store: - * rdffr p15.b - * add x2, x(dcxt), #(offset ffr) - * str p15, [x2, #0] - * ldr p15, [x1, #(15 * sizeof(dr_svep_t))] + * rdffr p15.b ; Read FFR to P15 + * add x2, x(dcxt), #(offset ffr) ; Calculate FFR dcxt offset + * str p15, [x2, #0] ; Save FFR to dcxt + * ldr p15, [x1, #(15 * sizeof(dr_svep_t))] ; Restore app P15 value from dcxt */ APP(ilist, INSTR_CREATE_rdffr_sve(dcontext, From 107fa8879cb3b2511042575b7c3f60b0d715bb4c Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Mon, 15 Apr 2024 16:08:56 +0000 Subject: [PATCH 5/8] Tidy up printing code --- core/arch/arch.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/core/arch/arch.c b/core/arch/arch.c index 9b8b8426300..7cac9d47cdc 100644 --- a/core/arch/arch.c +++ b/core/arch/arch.c @@ -3826,23 +3826,25 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) #elif defined(AARCHXX) { # ifdef AARCH64 - const uint elements = proc_has_feature(FEATURE_SVE) - ? (proc_get_vector_length_bytes() / sizeof(uint)) - : 4; + const uint vector_length_bytes = + (proc_has_feature(FEATURE_SVE) ? proc_get_vector_length_bytes() + : opnd_size_in_bytes(reg_get_size(DR_REG_Q0))); + const uint u32_count = vector_length_bytes / sizeof(uint); /* XXX: should be proc_num_simd_saved(). */ const uint num_simd_regs = MCXT_NUM_SIMD_SVE_SLOTS; const char *reg_prefix = proc_has_feature(FEATURE_SVE) ? "z" : "q"; # else - const uint elements = 4; + const uint u32_count = + sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0]); /* XXX: should be proc_num_simd_saved(). */ const uint num_simd_regs = proc_num_simd_registers(); const char *reg_prefix = "q"; # endif - ASSERT(elements <= + ASSERT(u32_count <= sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0])); for (uint i = 0; i < num_simd_regs; i++) { print_file(f, dump_xml ? "\t\t%s%d= \"0x" : "\t%s%-3d= 0x", reg_prefix, i); - for (uint j = 0; j < elements; j++) { + for (uint j = 0; j < u32_count; j++) { print_file(f, "%08x ", context->simd[i].u32[j]); } print_file(f, dump_xml ? "\"\n" : "\n"); @@ -3850,18 +3852,18 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) # ifdef AARCH64 if (proc_has_feature(FEATURE_SVE)) { /* Dump SVE P registers */ - const uint pred_u16_elements = + const uint pred_u16_count = (proc_get_vector_length_bytes() / 8) / sizeof(ushort); for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { print_file(f, dump_xml ? "\t\tp%d= \"0x" : "\tp%-3d= 0x", i); - for (size_t j = 0; j < pred_u16_elements; j++) { + for (size_t j = 0; j < pred_u16_count; j++) { print_file(f, "%04x ", context->svep[i].u16[j]); } print_file(f, dump_xml ? "\"\n" : "\n"); } /* Dump SVE FFR register */ print_file(f, dump_xml ? "\t\tffr= \"0x" : "\tffr = 0x"); - for (size_t j = 0; j < pred_u16_elements; j++) { + for (size_t j = 0; j < pred_u16_count; j++) { print_file(f, "%04x ", context->ffr.u16[j]); } print_file(f, dump_xml ? "\"\n" : "\n"); From 4526e13363230a1010f2fbd5a9d99d1abdb7d0ef Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Mon, 15 Apr 2024 16:11:29 +0000 Subject: [PATCH 6/8] Use strncpy() --- suite/tests/client-interface/cleancall-opt-shared.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/suite/tests/client-interface/cleancall-opt-shared.h b/suite/tests/client-interface/cleancall-opt-shared.h index 831b930206a..58907c9e7cf 100644 --- a/suite/tests/client-interface/cleancall-opt-shared.h +++ b/suite/tests/client-interface/cleancall-opt-shared.h @@ -361,7 +361,7 @@ dump_diff_mcontexts(void) dr_svep_t before_reg, after_reg; if (i >= (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS)) { - strcpy(reg_name, "FFR"); + strncpy(reg_name, "FFR", sizeof(reg_name)); before_reg = before_mcontext.ffr; after_reg = after_mcontext.ffr; } else { From c3cd479d8dac4a4fa8a16d73dabb2febe755e564 Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Mon, 15 Apr 2024 16:22:24 +0000 Subject: [PATCH 7/8] Use kernel macros for register sizes to copy --- core/unix/signal_linux_aarch64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/unix/signal_linux_aarch64.c b/core/unix/signal_linux_aarch64.c index 3f13be57943..24a61597a22 100644 --- a/core/unix/signal_linux_aarch64.c +++ b/core/unix/signal_linux_aarch64.c @@ -253,16 +253,16 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) */ memcpy(&mc->simd[i].u32, (byte *)sve + SVE_SIG_ZREG_OFFSET(quads_per_vector, i), - sve->vl); + SVE_SIG_ZREG_SIZE(quads_per_vector)); memcpy(&mc->simd[i].q, &fpc->vregs[i], sizeof(mc->simd->q)); } for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { memcpy(&mc->svep[i].u16, (byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i), - sve->vl / 8); + SVE_SIG_PREG_SIZE(quads_per_vector)); } memcpy(&mc->ffr, (byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), - sve->vl / 8); + SVE_SIG_FFR_SIZE(quads_per_vector)); } break; } From 234f33f689b510a3870f3b6d09f225c22660db39 Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Mon, 15 Apr 2024 16:25:29 +0000 Subject: [PATCH 8/8] Shorten line --- core/arch/arm/arm.asm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/arch/arm/arm.asm b/core/arch/arm/arm.asm index 6f385e88b16..5e7eaa5ad0f 100644 --- a/core/arch/arm/arm.asm +++ b/core/arch/arm/arm.asm @@ -74,7 +74,8 @@ DECL_EXTERN(initstack_mutex) # define GPR_REG_SIZE 4 #endif #define PRE_SIMD_PADDING 0 -#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE + MCXT_NUM_PRED_SLOTS*PRED_REG_SIZE) +#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE \ + + MCXT_NUM_PRED_SLOTS*PRED_REG_SIZE) #define PRIV_MCXT_SIZE (NUM_GPR_SLOTS*GPR_REG_SIZE + PRIV_MCXT_SIMD_SIZE) #define PRIV_MCXT_SP_FROM_SIMD (-(4*GPR_REG_SIZE)) /* flags, pc, lr, then sp */ #define PRIV_MCXT_PC_FROM_SIMD (-(2*GPR_REG_SIZE)) /* flags, then pc */