From 0ddaa0fbaea9a47e9f0f247dea4f41e2990676d1 Mon Sep 17 00:00:00 2001
From: Jack Gallagher <jack.gallagher@arm.com>
Date: Thu, 18 Jan 2024 09:50:24 +0000
Subject: [PATCH] i#5365 AArch64: Fix 0 size read/write records in drmemtrace
 (#6544)

When debugging i#6499 we noticed that drcachesim was producing 0 byte
read/write records for some SVE load/store instructions:

```
 ifetch       4 byte(s) @ 0x0000000000405b3c a54a4681   ld1w   (%x20,%x10,lsl #2) %p1/z -> %z1.s
 read         0 byte(s) @ 0x0000000000954e80 by PC 0x0000000000405b3c
 read         0 byte(s) @ 0x0000000000954e84 by PC 0x0000000000405b3c
 read         0 byte(s) @ 0x0000000000954e88 by PC 0x0000000000405b3c
 read         0 byte(s) @ 0x0000000000954e8c by PC 0x0000000000405b3c
 read         0 byte(s) @ 0x0000000000954e90 by PC 0x0000000000405b3c
 read         0 byte(s) @ 0x0000000000954e94 by PC 0x0000000000405b3c
 read         0 byte(s) @ 0x0000000000954e98 by PC 0x0000000000405b3c
 read         0 byte(s) @ 0x0000000000954e9c by PC 0x0000000000405b3c
 ifetch       4 byte(s) @ 0x0000000000405b4
```

This turned out to be due to drdecode being linked into drcachesim
twice: once into the drcachesim executable, once into libdynamorio.
drdecode uses a global variable to store the SVE vector length to use
when decoding so we end up with two copies of that variable and only one
was being initialized.
To fix this properly we would need to refactor the libraries so that
there is only one copy of the sve_veclen global variable, or change the
way that the decoder gets the vector length so its no longer stored in a
global variable. In the mean time we have a workaround which makes sure
both copies of the variable get initialized and drcachesim produces
correct results.

With that workaround in place however, the results were still wrong. For
expanded scatter/gather instructions when you are using an offline
trace, raw2trace doesn't have access to the load/store instructions from
the expansion, only the original app scatter/gather instruction. It has
to create the read/write records using only information from the
original scatter/gather instruction and it uses the size of the memory
operand to determine the size of each read/write. This works for x86
because the x86 IR uses the per-element data size as for the memory
operand of scatter/gather instructions. This doesn't work for AArch64
because the AArch64 codec uses the maximum data transferred (per-element
data size * number of elements) like other SIMD load/store instructions.

We plan to make the AArch64 IR consistent with x86 by changing it to use
the same convention as x86 for scatter/gather instructions but in the
mean time we can work around the inconsistency by fixing the size in
raw2trace based on the instruction's opcode.

Issues: #6499, #5365, #5036
---
 clients/drcachesim/tracer/raw2trace.cpp | 132 +++++++++++++++++++++++-
 1 file changed, 130 insertions(+), 2 deletions(-)

diff --git a/clients/drcachesim/tracer/raw2trace.cpp b/clients/drcachesim/tracer/raw2trace.cpp
index 621ad4e9944..04c9b6f195f 100644
--- a/clients/drcachesim/tracer/raw2trace.cpp
+++ b/clients/drcachesim/tracer/raw2trace.cpp
@@ -2882,6 +2882,91 @@ raw2trace_t::set_instr_summary_flags(raw2trace_thread_data_t *tdata, uint64 modi
     return true;
 }
 
+#if defined(AARCH64)
+/* TODO i#5365, i#5036: append_bb_entries() takes the size of the scatter/gather memory
+ * operand to be the per-element value and uses that to create the read/write memref
+ * entries.
+ * The AArch64 IR currently uses the maximum amount of data transferred by the instruction
+ * (number of elements * per element size) instead so until we change the codec to use the
+ * per-element size we need to use this function to set the per-element size based on the
+ * instruction opcode.
+ * When we have made the codec/IR changes this function can be removed.
+
+ * An alternative solution @derekbruening suggested is to store the encoding during
+ * tracing like we do for vdso or JIT code (see
+ * offline_instru_t::record_instr_encodings()), but not the encoding of the emulated
+ * instr info's pointer to the original scatter/gather: rather, the unrolled single
+ * load/store. This would greatly simplify raw2trace and questions about what the IR
+ * should store. OTOH it adds some tracing overhead: which probably outweighs gains in
+ * reducing code complexity.
+ */
+opnd_size_t
+get_aarch64_scatter_gather_value_size(int opcode)
+{
+    switch (opcode) {
+    case OP_ld1b:
+    case OP_ld1sb:
+    case OP_ldff1b:
+    case OP_ldnf1b:
+    case OP_ldnt1b:
+    case OP_ld1rqb:
+    case OP_ld2b:
+    case OP_ld3b:
+    case OP_ld4b:
+    case OP_st1b:
+    case OP_stnt1b:
+    case OP_st2b:
+    case OP_st3b:
+    case OP_st4b: return OPSZ_1;
+    case OP_ld1h:
+    case OP_ld1sh:
+    case OP_ldff1h:
+    case OP_ldnf1h:
+    case OP_ldnt1h:
+    case OP_ld1rqh:
+    case OP_ld2h:
+    case OP_ld3h:
+    case OP_ld4h:
+    case OP_st1h:
+    case OP_stnt1h:
+    case OP_st2h:
+    case OP_st3h:
+    case OP_st4h: return OPSZ_2;
+    case OP_ld1w:
+    case OP_ld1sw:
+    case OP_ldff1w:
+    case OP_ldnf1w:
+    case OP_ldnt1w:
+    case OP_ld1rqw:
+    case OP_ld2w:
+    case OP_ld3w:
+    case OP_ld4w:
+    case OP_st1w:
+    case OP_stnt1w:
+    case OP_st2w:
+    case OP_st3w:
+    case OP_st4w: return OPSZ_4;
+    case OP_ld1d:
+    case OP_ldff1d:
+    case OP_ldnf1d:
+    case OP_ldnt1d:
+    case OP_ld1rqd:
+    case OP_ld2d:
+    case OP_ld3d:
+    case OP_ld4d:
+    case OP_st1d:
+    case OP_stnt1d:
+    case OP_st2d:
+    case OP_st3d:
+    case OP_st4d: return OPSZ_8;
+    }
+    DR_ASSERT_MSG(
+        false,
+        "Instruction is not a scatter/gather/predicated contiguous load/store operation");
+    return OPSZ_0;
+}
+#endif // defined(AARCH64)
+
 bool
 instr_summary_t::construct(void *dcontext, app_pc block_start, DR_PARAM_INOUT app_pc *pc,
                            app_pc orig_pc, DR_PARAM_OUT instr_summary_t *desc,
@@ -2962,15 +3047,47 @@ instr_summary_t::construct(void *dcontext, app_pc block_start, DR_PARAM_INOUT ap
     if (reads_memory || writes_memory) {
         for (int i = 0, e = instr_num_srcs(instr); i < e; ++i) {
             opnd_t op = instr_get_src(instr, i);
-            if (opnd_is_memory_reference(op))
+            if (opnd_is_memory_reference(op)) {
+#if defined(AARCH64)
+                /* TODO i#5365: append_bb_entries() takes the size of the scatter/gather
+                 * memory operand to be the per-element value and uses that to create the
+                 * read/write memref entries.
+                 * The AArch64 IR currently uses the maximum amount of data transferred
+                 * by the instruction (number of elements * per element size) instead so
+                 * until we change the codec to use the per-element size we need to fix
+                 * it up here.
+                 */
+                if (desc->is_scatter_or_gather()) {
+                    opnd_set_size(
+                        &op,
+                        get_aarch64_scatter_gather_value_size(instr_get_opcode(instr)));
+                }
+#endif
                 desc->mem_srcs_and_dests_.push_back(memref_summary_t(op));
+            }
         }
         desc->num_mem_srcs_ = static_cast<uint8_t>(desc->mem_srcs_and_dests_.size());
 
         for (int i = 0, e = instr_num_dsts(instr); i < e; ++i) {
             opnd_t op = instr_get_dst(instr, i);
-            if (opnd_is_memory_reference(op))
+            if (opnd_is_memory_reference(op)) {
+#if defined(AARCH64)
+                /* TODO i#5365: append_bb_entries() takes the size of the scatter/gather
+                 * memory operand to be the per-element value and uses that to create the
+                 * read/write memref entries.
+                 * The AArch64 IR currently uses the maximum amount of data transferred
+                 * by the instruction (number of elements * per element size) instead so
+                 * until we change the codec to use the per-element size we need to fix
+                 * it up here.
+                 */
+                if (desc->is_scatter_or_gather()) {
+                    opnd_set_size(
+                        &op,
+                        get_aarch64_scatter_gather_value_size(instr_get_opcode(instr)));
+                }
+#endif
                 desc->mem_srcs_and_dests_.push_back(memref_summary_t(op));
+            }
         }
     }
     return true;
@@ -3738,6 +3855,17 @@ raw2trace_t::raw2trace_t(
     decode_cache_.reserve(cache_count);
     for (int i = 0; i < cache_count; ++i)
         decode_cache_.emplace_back(cache_count);
+
+#if defined(AARCH64)
+    // TODO i#6556, i#1684: The decoder uses a global sve_veclen variable to store the
+    // vector length value it uses when decoding. drdecodelib ends up being linked into
+    // drcachesim twice: once into the drcachesim executable, and one into libdynamorio.
+    // When we call dr_standalone_init() above it will initialize the version of
+    // sve_veclen in libdynamorio, but not the one in drcachesim.
+    // Unfortunately it is the version of sve_veclen in drcachesim that gets used when
+    // decoding in raw2trace so we need to explicitly initialize its sve_veclen here.
+    dr_set_sve_vector_length(proc_get_vector_length_bytes() * 8);
+#endif
 }
 
 raw2trace_t::~raw2trace_t()