From 667033edbd03b2d5bf34b37a3bc40127b0085f80 Mon Sep 17 00:00:00 2001
From: Alex Richardson <Alexander.Richardson@cl.cam.ac.uk>
Date: Wed, 7 Jul 2021 13:02:02 +0100
Subject: [PATCH] Stop packing capreg_state into a single 64-bit integer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bitwise operations are more expensive than expanding the 64-bit value
to a 32 byte array. This results in a 1.21x speedup running the MFS_ROOT
kernel and a 1.08x speedup for the full purecap kernel+purecap userspace
boot. Benchmarks using a previous version of this patch indicated a 1.08
speedup for the MFS_ROOT case. My assumption is that removing the TCG
global accesses to cpu_capreg_state now has a larger impact after
0c09763123571243c586316b1149b6164d66b720 remove the incorrect NO_RWG flag.

MFS_ROOT boot:
```
hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri  '{qemu} -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' -w 1
Benchmark #1: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh
  Time (mean ± σ):      9.499 s ±  0.027 s    [User: 8.581 s, System: 0.136 s]
  Range (min … max):    9.448 s …  9.539 s    10 runs

Benchmark #2: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh
  Time (mean ± σ):      9.281 s ±  0.029 s    [User: 8.260 s, System: 0.137 s]
  Range (min … max):    9.234 s …  9.326 s    10 runs

Benchmark #3: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh
  Time (mean ± σ):      7.852 s ±  0.053 s    [User: 7.523 s, System: 0.182 s]
  Range (min … max):    7.793 s …  7.933 s    10 runs

Summary
  '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' ran
    1.18 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh'
    1.21 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh'
```
Full purecap+purecap boot:
```
hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd {qemu} --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' -m 3
Benchmark #1: /home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest
  Time (mean ± σ):     227.351 s ±  0.484 s    [User: 213.193 s, System: 3.544 s]
  Range (min … max):   226.792 s … 227.646 s    3 runs

  Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet PC without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options.

Benchmark #2: /home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest
  Time (mean ± σ):     210.156 s ±  3.601 s    [User: 197.448 s, System: 1.979 s]
  Range (min … max):   206.397 s … 213.575 s    3 runs

Summary
  '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' ran
    1.08 ± 0.02 times faster than '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest'
```
---
 include/tcg/tcg.h                             |  1 -
 .../cheri-common/cheri-lazy-capregs-types.h   |  2 +-
 target/cheri-common/cheri-lazy-capregs.h      | 33 ++++++-------------
 target/riscv/translate.c                      | 10 +++---
 tcg/tcg.c                                     |  1 -
 5 files changed, 16 insertions(+), 31 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index c1db419af14..579a3ffcb31 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -689,7 +689,6 @@ extern uintptr_t tcg_splitwx_diff;
 extern TCGv_env cpu_env;
 #ifdef TARGET_CHERI
 extern TCGv ddc_interposition;
-extern TCGv_i64 cpu_capreg_state; // 32 times 2 bits
 #endif
 #ifdef CONFIG_DEBUG_TCG
 extern TCGv _pc_is_current;
diff --git a/target/cheri-common/cheri-lazy-capregs-types.h b/target/cheri-common/cheri-lazy-capregs-types.h
index 81cccf06271..8c9dae1bd5e 100644
--- a/target/cheri-common/cheri-lazy-capregs-types.h
+++ b/target/cheri-common/cheri-lazy-capregs-types.h
@@ -83,6 +83,6 @@ typedef struct GPCapRegs {
     // We cache the decompressed capregs here (to avoid constantly decompressing
     // values such as $csp which are used frequently)
     cap_register_t decompressed[32];
-    uint64_t capreg_state; // 32 times CapRegState compressed to one uint64_t
+    uint8_t capreg_state[32] QEMU_ALIGNED(64); /* 32 times CapRegState */
 } GPCapRegs;
 #endif
diff --git a/target/cheri-common/cheri-lazy-capregs.h b/target/cheri-common/cheri-lazy-capregs.h
index 3f4eedfa815..7f154b81c9e 100644
--- a/target/cheri-common/cheri-lazy-capregs.h
+++ b/target/cheri-common/cheri-lazy-capregs.h
@@ -47,20 +47,11 @@
 
 static inline GPCapRegs *cheri_get_gpcrs(CPUArchState *env);
 
-static inline QEMU_ALWAYS_INLINE uint64_t
-capreg_state_set_to_integer_mask(unsigned reg)
-{
-    return ~(((uint64_t)CREG_STATE_MASK) << (reg * 2));
-}
-
-static inline CapRegState get_capreg_state(const GPCapRegs *gpcrs, unsigned reg)
+static inline QEMU_ALWAYS_INLINE CapRegState
+get_capreg_state(const GPCapRegs *gpcrs, unsigned reg)
 {
     cheri_debug_assert(reg < 32);
-    /*
-     * Note: QEMU's extract64 has assertions enabled (even in release mode).
-     * Since this is a hot path, we re-implement it without assertions here.
-     */
-    return (CapRegState)((gpcrs->capreg_state >> (reg * 2)) & CREG_STATE_MASK);
+    return (CapRegState)gpcrs->capreg_state[reg];
 }
 
 static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum)
@@ -106,7 +97,6 @@ static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum)
 #endif // CONFIG_DEBUG_TCG
 }
 
-/* Marked as always_inline to avoid the |= if called with CREG_INTEGER. */
 static inline QEMU_ALWAYS_INLINE void
 set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state)
 {
@@ -117,14 +107,7 @@ set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state)
     }
 
     cheri_debug_assert(regnum < 32);
-    /*
-     * Note: QEMU's deposit64 has assertions enabled (even in release mode).
-     * Since this is a hot path, we re-implement it without assertions here.
-     */
-    gpcrs->capreg_state &= capreg_state_set_to_integer_mask(regnum);
-    if (!__builtin_constant_p(new_state) || new_state != 0) {
-        gpcrs->capreg_state |= (((uint64_t)new_state) << (regnum * 2));
-    }
+    gpcrs->capreg_state[regnum] = new_state;
     // Check that the compressed and decompressed caps are in sync
     sanity_check_capreg(gpcrs, regnum);
 }
@@ -417,7 +400,9 @@ static inline void reset_capregs(CPUArchState *env)
 {
     // Reset all to NULL:
     GPCapRegs *gpcrs = cheri_get_gpcrs(env);
-    gpcrs->capreg_state = UINT64_MAX; // All decompressed values
+    for (size_t i = 0; i < ARRAY_SIZE(gpcrs->capreg_state); i++) {
+        gpcrs->capreg_state[i] = CREG_FULLY_DECOMPRESSED;
+    }
     for (size_t i = 0; i < ARRAY_SIZE(gpcrs->decompressed); i++) {
         const cap_register_t* newval = null_capability(&gpcrs->decompressed[i]);
         // Register should be fully decompressed
@@ -432,7 +417,9 @@ static inline void set_max_perms_capregs(CPUArchState *env)
 {
     // Reset all to max perms (except NULL of course):
     GPCapRegs *gpcrs = cheri_get_gpcrs(env);
-    gpcrs->capreg_state = UINT64_MAX; // All decompressed values
+    for (size_t i = 0; i < ARRAY_SIZE(gpcrs->capreg_state); i++) {
+        gpcrs->capreg_state[i] = CREG_FULLY_DECOMPRESSED;
+    }
     null_capability(&gpcrs->decompressed[NULL_CAPREG_INDEX]);
     sanity_check_capreg(gpcrs, NULL_CAPREG_INDEX);
     for (size_t i = 0; i < ARRAY_SIZE(gpcrs->decompressed); i++) {
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index 1cd8792f282..f58770c5781 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -225,8 +225,11 @@ static inline void gen_mark_gpr_as_integer(int reg_num_dst)
 {
     /* Currently, the integer flag is 0, so we can mask the 64-bit value holding
      * the capreg state appropriately to clear the bits for register N. */
-    tcg_gen_andi_i64(cpu_capreg_state, cpu_capreg_state,
-                     capreg_state_set_to_integer_mask(reg_num_dst));
+    TCGv_i32 integer_state = tcg_const_i32(CREG_INTEGER);
+    tcg_gen_st8_i32(
+        integer_state, cpu_env,
+        offsetof(CPURISCVState, gpcapregs.capreg_state[reg_num_dst]));
+    tcg_temp_free_i32(integer_state);
     tcg_gen_movi_tl(_cpu_pesbt_do_not_access_directly[reg_num_dst],
                     CAP_NULL_PESBT);
     /* TODO: maybe all ones is more efficient? We can just do an or and don't
@@ -1085,9 +1088,6 @@ void riscv_translate_init(void)
             offsetof(CPURISCVState, gpcapregs.decompressed[i].cached_pesbt),
             cheri_gp_regnames[i]);
     }
-    cpu_capreg_state = tcg_global_mem_new_i64(
-        cpu_env, offsetof(CPURISCVState, gpcapregs.capreg_state),
-        "capreg_state");
 #endif
 #ifdef CONFIG_RVFI_DII
     cpu_rvfi_available_fields = tcg_global_mem_new_i32(
diff --git a/tcg/tcg.c b/tcg/tcg.c
index acdd72e6d52..fe24485c970 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -168,7 +168,6 @@ TCGv _pc_is_current = 0;
 #endif
 #ifdef TARGET_CHERI
 TCGv ddc_interposition;
-TCGv_i64 cpu_capreg_state; // 32 times 2 bits
 #endif
 
 #ifndef CONFIG_TCG_INTERPRETER