Stop packing capreg_state into a single 64-bit integer

The bitwise operations are more expensive than expanding the 64-bit value to a 32-byte array. This results in a 1.21x speedup running the MFS_ROOT kernel and a 1.08x speedup for the full purecap kernel+purecap userspace boot. Benchmarks using a previous version of this patch indicated a 1.08x speedup for the MFS_ROOT case. My assumption is that removing the TCG global accesses to cpu_capreg_state now has a larger impact after 0c09763 removed the incorrect NO_RWG flag. MFS_ROOT boot: ``` hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri '{qemu} -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' -w 1 Benchmark #1: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh Time (mean ± σ): 9.499 s ± 0.027 s [User: 8.581 s, System: 0.136 s] Range (min … max): 9.448 s … 9.539 s 10 runs Benchmark #2: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh Time (mean ± σ): 9.281 s ± 0.029 s [User: 8.260 s, System: 0.137 s] Range (min … max): 9.234 s … 9.326 s 10 runs Benchmark #3: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel 
/local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh Time (mean ± σ): 7.852 s ± 0.053 s [User: 7.523 s, System: 0.182 s] Range (min … max): 7.793 s … 7.933 s 10 runs Summary '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' ran 1.18 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' 1.21 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' ``` Full purecap+purecap boot: ``` hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd {qemu} --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' -m 3 Benchmark #1: /home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel 
/local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest Time (mean ± σ): 227.351 s ± 0.484 s [User: 213.193 s, System: 3.544 s] Range (min … max): 226.792 s … 227.646 s 3 runs Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet PC without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options. Benchmark #2: /home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest Time (mean ± σ): 210.156 s ± 3.601 s [User: 197.448 s, System: 1.979 s] Range (min … max): 206.397 s … 213.575 s 3 runs Summary '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' ran 1.08 ± 0.02 times faster than '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 
--disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' ```
CTSRD-CHERI · Jul 7, 2021 · 667033e · 667033e
1 parent 2d7de03
commit 667033e
Show file tree

Hide file tree

Showing 5 changed files with 16 additions and 31 deletions.
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
@@ -689,7 +689,6 @@ extern uintptr_t tcg_splitwx_diff;
 extern TCGv_env cpu_env;
 #ifdef TARGET_CHERI
 extern TCGv ddc_interposition;
-extern TCGv_i64 cpu_capreg_state; // 32 times 2 bits
 #endif
 #ifdef CONFIG_DEBUG_TCG
 extern TCGv _pc_is_current;

diff --git a/target/cheri-common/cheri-lazy-capregs-types.h b/target/cheri-common/cheri-lazy-capregs-types.h
@@ -83,6 +83,6 @@ typedef struct GPCapRegs {
     // We cache the decompressed capregs here (to avoid constantly decompressing
     // values such as $csp which are used frequently)
     cap_register_t decompressed[32];
-    uint64_t capreg_state; // 32 times CapRegState compressed to one uint64_t
+    uint8_t capreg_state[32] QEMU_ALIGNED(64); /* 32 times CapRegState */
 } GPCapRegs;
 #endif
diff --git a/target/cheri-common/cheri-lazy-capregs.h b/target/cheri-common/cheri-lazy-capregs.h
@@ -47,20 +47,11 @@
 
 static inline GPCapRegs *cheri_get_gpcrs(CPUArchState *env);
 
-static inline QEMU_ALWAYS_INLINE uint64_t
-capreg_state_set_to_integer_mask(unsigned reg)
-{
-    return ~(((uint64_t)CREG_STATE_MASK) << (reg * 2));
-}
-
-static inline CapRegState get_capreg_state(const GPCapRegs *gpcrs, unsigned reg)
+static inline QEMU_ALWAYS_INLINE CapRegState
+get_capreg_state(const GPCapRegs *gpcrs, unsigned reg)
 {
     cheri_debug_assert(reg < 32);
-    /*
-     * Note: QEMU's extract64 has assertions enabled (even in release mode).
-     * Since this is a hot path, we re-implement it without assertions here.
-     */
-    return (CapRegState)((gpcrs->capreg_state >> (reg * 2)) & CREG_STATE_MASK);
+    return (CapRegState)gpcrs->capreg_state[reg];
 }
 
 static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum)
@@ -106,7 +97,6 @@ static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum)
 #endif // CONFIG_DEBUG_TCG
 }
 
-/* Marked as always_inline to avoid the |= if called with CREG_INTEGER. */
 static inline QEMU_ALWAYS_INLINE void
 set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state)
 {
@@ -117,14 +107,7 @@ set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state)
     }
 
     cheri_debug_assert(regnum < 32);
-    /*
-     * Note: QEMU's deposit64 has assertions enabled (even in release mode).
-     * Since this is a hot path, we re-implement it without assertions here.
-     */
-    gpcrs->capreg_state &= capreg_state_set_to_integer_mask(regnum);
-    if (!__builtin_constant_p(new_state) || new_state != 0) {
-        gpcrs->capreg_state |= (((uint64_t)new_state) << (regnum * 2));
-    }
+    gpcrs->capreg_state[regnum] = new_state;
     // Check that the compressed and decompressed caps are in sync
     sanity_check_capreg(gpcrs, regnum);
 }
@@ -417,7 +400,9 @@ static inline void reset_capregs(CPUArchState *env)
 {
     // Reset all to NULL:
     GPCapRegs *gpcrs = cheri_get_gpcrs(env);
-    gpcrs->capreg_state = UINT64_MAX; // All decompressed values
+    for (size_t i = 0; i < ARRAY_SIZE(gpcrs->capreg_state); i++) {
+        gpcrs->capreg_state[i] = CREG_FULLY_DECOMPRESSED;
+    }
     for (size_t i = 0; i < ARRAY_SIZE(gpcrs->decompressed); i++) {
         const cap_register_t* newval = null_capability(&gpcrs->decompressed[i]);
         // Register should be fully decompressed
@@ -432,7 +417,9 @@ static inline void set_max_perms_capregs(CPUArchState *env)
 {
     // Reset all to max perms (except NULL of course):
     GPCapRegs *gpcrs = cheri_get_gpcrs(env);
-    gpcrs->capreg_state = UINT64_MAX; // All decompressed values
+    for (size_t i = 0; i < ARRAY_SIZE(gpcrs->capreg_state); i++) {
+        gpcrs->capreg_state[i] = CREG_FULLY_DECOMPRESSED;
+    }
     null_capability(&gpcrs->decompressed[NULL_CAPREG_INDEX]);
     sanity_check_capreg(gpcrs, NULL_CAPREG_INDEX);
     for (size_t i = 0; i < ARRAY_SIZE(gpcrs->decompressed); i++) {

diff --git a/target/riscv/translate.c b/target/riscv/translate.c
@@ -225,8 +225,11 @@ static inline void gen_mark_gpr_as_integer(int reg_num_dst)
 {
     /* The capreg state is tracked with one byte per register, so a single
      * byte store of CREG_INTEGER into capreg_state[N] marks register N. */
-    tcg_gen_andi_i64(cpu_capreg_state, cpu_capreg_state,
-                     capreg_state_set_to_integer_mask(reg_num_dst));
+    TCGv_i32 integer_state = tcg_const_i32(CREG_INTEGER);
+    tcg_gen_st8_i32(
+        integer_state, cpu_env,
+        offsetof(CPURISCVState, gpcapregs.capreg_state[reg_num_dst]));
+    tcg_temp_free_i32(integer_state);
     tcg_gen_movi_tl(_cpu_pesbt_do_not_access_directly[reg_num_dst],
                     CAP_NULL_PESBT);
     /* TODO: maybe all ones is more efficient? We can just do an or and don't
@@ -1085,9 +1088,6 @@ void riscv_translate_init(void)
             offsetof(CPURISCVState, gpcapregs.decompressed[i].cached_pesbt),
             cheri_gp_regnames[i]);
     }
-    cpu_capreg_state = tcg_global_mem_new_i64(
-        cpu_env, offsetof(CPURISCVState, gpcapregs.capreg_state),
-        "capreg_state");
 #endif
 #ifdef CONFIG_RVFI_DII
     cpu_rvfi_available_fields = tcg_global_mem_new_i32(

diff --git a/tcg/tcg.c b/tcg/tcg.c
@@ -168,7 +168,6 @@ TCGv _pc_is_current = 0;
 #endif
 #ifdef TARGET_CHERI
 TCGv ddc_interposition;
-TCGv_i64 cpu_capreg_state; // 32 times 2 bits
 #endif
 
 #ifndef CONFIG_TCG_INTERPRETER