Skip to content

Commit

Permalink
i#5365: Add AArch64 SVE support to the core (part 2)
Browse files Browse the repository at this point in the history
This patch adds SVE support for signals in the core. It is the follow
on patch from the SVE core work part 1, in PR #5835 (f646a63) and
includes vector address computation for SVE scatter/gather, enabling
first-fault load handling.

Issue: #5365, #5036

Co-authored-by: Jack Gallagher <jack.gallagher@arm.com>
  • Loading branch information
AssadHashmi and jackgallagher-arm committed Mar 26, 2024
1 parent 781f15e commit 026b04e
Show file tree
Hide file tree
Showing 17 changed files with 1,138 additions and 102 deletions.
3 changes: 3 additions & 0 deletions core/arch/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ mixed_mode_enabled(void)
# define SCRATCH_REG4_OFFS R4_OFFSET
# define SCRATCH_REG5_OFFS R5_OFFSET
# define REG_OFFSET(reg) (R0_OFFSET + ((reg)-DR_REG_R0) * sizeof(reg_t))
# define Z_REG_OFFSET(reg) \
((MC_OFFS) + \
(offsetof(priv_mcontext_t, simd) + ((reg)-DR_REG_Z0) * sizeof(dr_simd_t)))
# define CALL_SCRATCH_REG DR_REG_R11
# define MC_IBL_REG r2
# define MC_RETVAL_REG r0
Expand Down
15 changes: 0 additions & 15 deletions core/ir/aarch64/codec.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,6 @@ encode_common(byte *pc, instr_t *i, decode_info_t *di);
#define BITS(_enc, bitmax, bitmin) \
((((uint32)(_enc)) >> (bitmin)) & (uint32)MASK((bitmax) - (bitmin) + 1))

#if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) && !defined(BUILD_TESTS)
# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes())
# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)
#else
/* SVE vector length for off-line decoder set using -vl option with drdisas,
* e.g.
* $ drdisas -vl 256 e58057a1 85865e6b
* e58057a1 str %z1 -> +0x05(%x29)[32byte]
* 85865e6b ldr +0x37(%x19)[32byte] -> %z11
* $
*/
# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8)
# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8)
#endif

#define RETURN_FALSE \
do { \
CLIENT_ASSERT(false, "Unexpected state in AArch64 codec"); \
Expand Down
124 changes: 122 additions & 2 deletions core/ir/aarch64/instr.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* **********************************************************
* Copyright (c) 2017-2023 Google, Inc. All rights reserved.
* Copyright (c) 2016 ARM Limited. All rights reserved.
* Copyright (c) 2016-2024 ARM Limited. All rights reserved.
* **********************************************************/

/*
Expand Down Expand Up @@ -37,6 +37,8 @@
#include "encode_api.h"
#include "opcode_names.h"

#include <stddef.h>

/* XXX i#6690: currently only A64 is supported for instruction encoding.
* We want to add support for A64 decoding and synthetic ISA encoding as well.
* XXX i#1684: move this function to core/ir/instr_shared.c once we can support
Expand Down Expand Up @@ -447,7 +449,7 @@ reg_is_gpr(reg_id_t reg)
bool
reg_is_simd(reg_id_t reg)
{
    /* On AArch64, SIMD covers the scalable Z vector registers as well as the
     * fixed-width Q/D/S/H/B views of the vector register file.
     */
    if (reg_is_z(reg))
        return true;
    return DR_REG_Q0 <= reg && reg <= DR_REG_B31;
}

bool
Expand Down Expand Up @@ -737,3 +739,121 @@ instr_invert_predicate(dr_pred_type_t pred)
default: CLIENT_ASSERT(false, "Incorrect predicate value"); return DR_PRED_NONE;
}
}

/* Applies the memory operand's index-register extend (UXTW/SXTW/UXTX/SXTX)
 * and optional scaling to index_val, returning the value to add to the base
 * when forming the effective address.
 */
ptr_int_t
compute_scaled_index_aarch64(opnd_t opnd, reg_t index_val)
{
    bool scaled = false;
    uint shift = 0;
    dr_extend_type_t extend = opnd_get_index_extend(opnd, &scaled, &shift);
    reg_t value = 0;
    switch (extend) {
    case DR_EXTEND_UXTW:
        /* Zero-extend the low 32 bits. */
        value = index_val & 0xffffffffull;
        break;
    case DR_EXTEND_SXTW:
        /* Sign-extend the low 32 bits into the full register width. */
        value = index_val & 0xffffffffull;
        if ((value >> 31u) != 0)
            value |= (~0ull) << 32u;
        break;
    case DR_EXTEND_UXTX:
    case DR_EXTEND_SXTX:
        /* Full-width index: no extension required. */
        value = index_val;
        break;
    default: CLIENT_ASSERT(false, "Unsupported extend type"); return 0;
    }
    if (!scaled)
        return (ptr_int_t)value;
    return (ptr_int_t)(value << shift);
}

/* Returns whether the element at index 'element' is marked active in the
 * predicate register value 'mask'. SVE predicate registers hold one bit per
 * byte of vector data, so each element's flag is the lowest bit of its
 * element_size_bytes-wide bit group.
 */
static bool
is_active_in_mask(size_t element, uint64 mask, size_t element_size_bytes)
{
    const uint64 bit = 1ull << (element * element_size_bytes);
    return (mask & bit) == bit;
}

/* Computes the memory address accessed by the addr_index-th *active* element
 * (0-based) of an SVE scatter/gather/predicated-contiguous memory operand.
 * On success sets *addr to the effective address and *have_addr to true;
 * *write is set to whether the instruction is a scatter (store).
 * Returns false when there is no further active element to report.
 */
bool
instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size,
                             dr_mcontext_flags_t mc_flags, opnd_t curop, uint addr_index,
                             DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr,
                             DR_PARAM_OUT bool *write)
{
    CLIENT_ASSERT(have_addr != NULL && addr != NULL && mc != NULL,
                  "SVE address computation: invalid args");
    CLIENT_ASSERT(TEST(DR_MC_MULTIMEDIA, mc_flags),
                  "dr_mcontext_t.flags must include DR_MC_MULTIMEDIA");
    /* The client's mcontext must be new enough to carry the SVE predicate
     * register state we read below.
     */
    CLIENT_ASSERT(mc_size >= offsetof(dr_mcontext_t, svep) + sizeof(mc->svep),
                  "Incompatible client, invalid dr_mcontext_t.size.");

    ASSERT(write != NULL);
    *write = instr_is_scatter(instr);
    ASSERT(*write || instr_is_gather(instr));

    const size_t vl_bytes = opnd_size_in_bytes(OPSZ_SVE_VL_BYTES);
    /* DynamoRIO currently supports up to 512-bit vector registers so a predicate
     * register value should be <= 64-bits.
     * If DynamoRIO is extended in the future to support larger vector lengths this
     * function will need to be updated to cope with larger predicate mask values.
     */
    /* i#5365 fix: use <=, not <. A 512-bit VL gives an 8-byte predicate which
     * exactly fills a uint64; the strict comparison wrongly rejected the
     * maximum supported vector length.
     */
    ASSERT(vl_bytes / 8 <= sizeof(uint64));

    /* The governing predicate is by convention source operand 1. */
    const reg_t governing_pred = opnd_get_reg(instr_get_src(instr, 1));
    ASSERT(governing_pred >= DR_REG_START_P && governing_pred <= DR_REG_STOP_P);
    uint64 mask = mc->svep[governing_pred - DR_REG_START_P].d;

    /* All elements inactive: the instruction accesses no memory at all. */
    if (mask == 0) {
        return false;
    }

    const size_t element_size_bytes =
        opnd_size_in_bytes(opnd_get_vector_element_size(curop));
    const size_t num_elements = vl_bytes / element_size_bytes;

    /* Walk the elements in order, counting only the active ones, until we
     * reach the addr_index-th active element.
     */
    size_t active_elements_found = 0;
    for (size_t element = 0; element < num_elements; element++) {
        if (is_active_in_mask(element, mask, element_size_bytes)) {
            active_elements_found++;
            if (active_elements_found == addr_index + 1) {
                const reg_t base_reg = opnd_get_base(curop);
                if (reg_is_z(base_reg)) {
                    /* Vector base (vector+immediate or vector+scalar form):
                     * the base address comes from this element's lane.
                     */
                    size_t base_reg_num = base_reg - DR_REG_START_Z;
                    if (element_size_bytes == 4) {
                        *addr = (app_pc)(reg_t)mc->simd[base_reg_num].u32[element];
                    } else {
                        ASSERT(element_size_bytes == 8);
                        *addr = (app_pc)mc->simd[base_reg_num].u64[element];
                    }
                } else {
                    /* Scalar base register. */
                    *addr = (app_pc)reg_get_value_priv(base_reg, mc);
                }

                const reg_t index_reg = opnd_get_index(curop);
                reg_t unscaled_index_val = 0;
                if (reg_is_z(index_reg)) {
                    /* Vector index, extract the current element */
                    size_t index_reg_num = index_reg - DR_REG_START_Z;
                    if (element_size_bytes == 4) {
                        unscaled_index_val = mc->simd[index_reg_num].u32[element];
                    } else {
                        ASSERT(element_size_bytes == 8);
                        unscaled_index_val = mc->simd[index_reg_num].u64[element];
                    }
                } else {
                    /* scalar index or no index */
                    unscaled_index_val = reg_get_value_priv(index_reg, mc);
                }

                *have_addr = true;
                /* Apply the extend/scale of the index plus any displacement. */
                *addr += compute_scaled_index_aarch64(curop, unscaled_index_val);
                *addr += opnd_get_disp(curop);

                /* True while further addresses may remain to be queried. */
                return addr_index < num_elements;
            }
        }
    }

    /* Fewer than addr_index + 1 active elements. */
    return false;
}
2 changes: 2 additions & 0 deletions core/ir/aarchxx/opnd.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ opnd_get_reg_dcontext_offs(reg_id_t reg)
return R0_OFFSET + (R1_OFFSET - R0_OFFSET) * (reg - DR_REG_W0);
if (reg == DR_REG_XSP || reg == DR_REG_WSP)
return XSP_OFFSET;
if (DR_REG_Z0 <= reg && reg <= DR_REG_Z31)
return Z_REG_OFFSET(reg);
CLIENT_ASSERT(false, "opnd_get_reg_dcontext_offs: invalid reg");
return -1;
#else
Expand Down
12 changes: 8 additions & 4 deletions core/ir/instr.h
Original file line number Diff line number Diff line change
Expand Up @@ -676,11 +676,15 @@ int
instr_length_arch(dcontext_t *dcontext, instr_t *instr);
bool
opc_is_not_a_real_memory_load(int opc);

#if defined(X86) || defined(AARCH64)
bool
instr_compute_address_VSIB(instr_t *instr, priv_mcontext_t *mc, size_t mc_size,
dr_mcontext_flags_t mc_flags, opnd_t curop, uint index,
DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr,
DR_PARAM_OUT bool *write);
instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size,
dr_mcontext_flags_t mc_flags, opnd_t curop, uint index,
DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr,
DR_PARAM_OUT bool *write);
#endif

uint
instr_branch_type(instr_t *cti_instr);
#ifdef AARCH64
Expand Down
31 changes: 17 additions & 14 deletions core/ir/instr_shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -2644,21 +2644,22 @@ instr_compute_address_helper(instr_t *instr, priv_mcontext_t *mc, size_t mc_size
for (i = 0; i < instr_num_dsts(instr); i++) {
curop = instr_get_dst(instr, i);
if (opnd_is_memory_reference(curop)) {
if (opnd_is_vsib(curop)) {
#ifdef X86
if (instr_compute_address_VSIB(instr, mc, mc_size, mc_flags, curop, index,
&have_addr, addr, &write)) {
CLIENT_ASSERT(
write,
"VSIB found in destination but instruction is not a scatter");
#if defined(X86) || defined(AARCH64)
if (opnd_is_vector_base_disp(curop)) {
if (instr_compute_vector_address(instr, mc, mc_size, mc_flags, curop,
index, &have_addr, addr, &write)) {
CLIENT_ASSERT(write,
"Vector address found in destination but instruction "
"is not a scatter");
break;
} else {
return false;
}
}
#else
CLIENT_ASSERT(false, "VSIB should be x86-only");
CLIENT_ASSERT(
false, "Vector address computation implemented for AArch64 and x86 only");
#endif
}
memcount++;
if (memcount == (int)index) {
write = true;
Expand All @@ -2672,15 +2673,17 @@ instr_compute_address_helper(instr_t *instr, priv_mcontext_t *mc, size_t mc_size
for (i = 0; i < instr_num_srcs(instr); i++) {
curop = instr_get_src(instr, i);
if (opnd_is_memory_reference(curop)) {
if (opnd_is_vsib(curop)) {
#ifdef X86
if (instr_compute_address_VSIB(instr, mc, mc_size, mc_flags, curop,
index, &have_addr, addr, &write))
#if defined(X86) || defined(AARCH64)
if (opnd_is_vector_base_disp(curop)) {
if (instr_compute_vector_address(instr, mc, mc_size, mc_flags, curop,
index, &have_addr, addr, &write))
break;
else
return false;
#else
CLIENT_ASSERT(false, "VSIB should be x86-only");
CLIENT_ASSERT(
false,
"Vector address computation implemented for AArch64 and x86 only");
#endif
}
memcount++;
Expand Down
24 changes: 24 additions & 0 deletions core/ir/opnd.h
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,12 @@ opnd_compute_address_helper(opnd_t opnd, priv_mcontext_t *mc, ptr_int_t scaled_i
bool
opnd_is_abs_base_disp(opnd_t opnd);

#if defined(AARCH64)
/* Internal function shared with vector address calculation */
ptr_int_t
compute_scaled_index_aarch64(opnd_t opnd, reg_t index_val);
#endif

#ifndef STANDALONE_DECODER
opnd_t
opnd_create_dcontext_field(dcontext_t *dcontext, int offs);
Expand Down Expand Up @@ -339,4 +345,22 @@ extern reg_id_t dr_reg_stolen;
extern reg_id_t dr_reg_stolen;
#endif

#ifdef AARCH64
#if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) && !defined(BUILD_TESTS)
# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes())
# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)
#else
/* SVE vector length for off-line decoder set using -vl option with drdisas,
* e.g.
* $ drdisas -vl 256 e58057a1 85865e6b
* e58057a1 str %z1 -> +0x05(%x29)[32byte]
* 85865e6b ldr +0x37(%x19)[32byte] -> %z11
* $
*/
# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8)
# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8)
#endif
#endif /*AARCH64*/


#endif /* _OPND_H_ */
15 changes: 14 additions & 1 deletion core/ir/opnd_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1119,6 +1119,10 @@ enum {
DR_REG_STOP_32 = DR_REG_WSP, /**< End of 32-bit general register enum values */
DR_REG_START_GPR = DR_REG_X0, /**< Start of full-size general-purpose registers */
DR_REG_STOP_GPR = DR_REG_XSP, /**< End of full-size general-purpose registers */
DR_REG_START_Z = DR_REG_Z0, /**< Start of Z scalable vector registers */
DR_REG_STOP_Z = DR_REG_Z31, /**< End of Z scalable vector registers */
DR_REG_START_P = DR_REG_P0, /**< Start of P scalable predicate registers */
DR_REG_STOP_P = DR_REG_P15, /**< End of P scalable predicate registers */
# else
DR_REG_START_32 = DR_REG_R0, /**< Start of 32-bit general register enum values */
DR_REG_STOP_32 = DR_REG_R15, /**< End of 32-bit general register enum values */
Expand All @@ -1128,7 +1132,8 @@ enum {

DR_NUM_GPR_REGS = DR_REG_STOP_GPR - DR_REG_START_GPR + 1, /**< Count of GPR regs. */
# ifdef AARCH64
DR_NUM_SIMD_VECTOR_REGS = DR_REG_Z31 - DR_REG_Z0 + 1, /**< Count of SIMD regs. */
DR_NUM_SIMD_VECTOR_REGS =
DR_REG_STOP_Z - DR_REG_START_Z + 1, /**< Count of SIMD regs. */
# else
/* XXX: maybe we want more distinct names that provide counts for 64-bit D or 32-bit
* S registers.
Expand Down Expand Up @@ -2604,6 +2609,14 @@ DR_API
bool
opnd_is_vsib(opnd_t opnd);

DR_API
/**
* Returns true iff \p opnd is a base+disp memory reference operand which uses vector
* registers.
*/
bool
opnd_is_vector_base_disp(opnd_t opnd);

DR_API
/**
* Returns true iff \p opnd is a (near or far) absolute address operand.
Expand Down
Loading

0 comments on commit 026b04e

Please sign in to comment.