From 667033edbd03b2d5bf34b37a3bc40127b0085f80 Mon Sep 17 00:00:00 2001 From: Alex Richardson Date: Wed, 7 Jul 2021 13:02:02 +0100 Subject: [PATCH] Stop packing capreg_state into a single 64-bit integer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bitwise operations are more expensive than expanding the 64-bit value to a 32 byte array. This results in a 1.21x speedup running the MFS_ROOT kernel and a 1.08x speedup for the full purecap kernel+purecap userspace boot. Benchmarks using a previous version of this patch indicated a 1.08 speedup for the MFS_ROOT case. My assumption is that removing the TCG global accesses to cpu_capreg_state now has a larger impact after 0c09763123571243c586316b1149b6164d66b720 remove the incorrect NO_RWG flag. MFS_ROOT boot: ``` hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri '{qemu} -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' -w 1 Benchmark #1: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh Time (mean ± σ): 9.499 s ± 0.027 s [User: 8.581 s, System: 0.136 s] Range (min … max): 9.448 s … 9.539 s 10 runs Benchmark #2: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh Time (mean ± σ): 9.281 s ± 0.029 s [User: 8.260 s, System: 0.137 s] Range (min … max): 9.234 s … 9.326 s 10 runs Benchmark #3: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh Time (mean ± σ): 7.852 s ± 0.053 s [User: 7.523 s, System: 0.182 s] Range (min … max): 7.793 s … 7.933 s 10 runs Summary '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' ran 1.18 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' 1.21 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' ``` Full purecap+purecap boot: ``` hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd {qemu} --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' -m 3 Benchmark #1: /home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest Time (mean ± σ): 227.351 s ± 0.484 s [User: 213.193 s, System: 3.544 s] Range (min … max): 226.792 s … 227.646 s 3 runs Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet PC without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options. Benchmark #2: /home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest Time (mean ± σ): 210.156 s ± 3.601 s [User: 197.448 s, System: 1.979 s] Range (min … max): 206.397 s … 213.575 s 3 runs Summary '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' ran 1.08 ± 0.02 times faster than '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' ``` --- include/tcg/tcg.h | 1 - .../cheri-common/cheri-lazy-capregs-types.h | 2 +- target/cheri-common/cheri-lazy-capregs.h | 33 ++++++------------- target/riscv/translate.c | 10 +++--- tcg/tcg.c | 1 - 5 files changed, 16 insertions(+), 31 deletions(-) diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index c1db419af14..579a3ffcb31 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -689,7 +689,6 @@ extern uintptr_t tcg_splitwx_diff; extern TCGv_env cpu_env; #ifdef TARGET_CHERI extern TCGv ddc_interposition; -extern TCGv_i64 cpu_capreg_state; // 32 times 2 bits #endif #ifdef CONFIG_DEBUG_TCG extern TCGv _pc_is_current; diff --git a/target/cheri-common/cheri-lazy-capregs-types.h b/target/cheri-common/cheri-lazy-capregs-types.h index 81cccf06271..8c9dae1bd5e 100644 --- a/target/cheri-common/cheri-lazy-capregs-types.h +++ b/target/cheri-common/cheri-lazy-capregs-types.h @@ -83,6 +83,6 @@ typedef struct GPCapRegs { // We cache the decompressed capregs here (to avoid constantly decompressing // values such as $csp which are used frequently) cap_register_t decompressed[32]; - uint64_t capreg_state; // 32 times CapRegState compressed to one uint64_t + uint8_t capreg_state[32] QEMU_ALIGNED(64); /* 32 times CapRegState */ } GPCapRegs; #endif diff --git a/target/cheri-common/cheri-lazy-capregs.h b/target/cheri-common/cheri-lazy-capregs.h index 3f4eedfa815..7f154b81c9e 100644 --- a/target/cheri-common/cheri-lazy-capregs.h +++ b/target/cheri-common/cheri-lazy-capregs.h @@ -47,20 +47,11 @@ static inline GPCapRegs *cheri_get_gpcrs(CPUArchState *env); -static inline QEMU_ALWAYS_INLINE uint64_t -capreg_state_set_to_integer_mask(unsigned reg) -{ - return ~(((uint64_t)CREG_STATE_MASK) << (reg * 2)); -} - -static inline CapRegState get_capreg_state(const GPCapRegs *gpcrs, unsigned reg) +static inline QEMU_ALWAYS_INLINE CapRegState +get_capreg_state(const GPCapRegs *gpcrs, unsigned reg) { cheri_debug_assert(reg < 32); - /* - * Note: QEMU's extract64 has assertions enabled (even in release mode). - * Since this is a hot path, we re-implement it without assertions here. - */ - return (CapRegState)((gpcrs->capreg_state >> (reg * 2)) & CREG_STATE_MASK); + return (CapRegState)gpcrs->capreg_state[reg]; } static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum) @@ -106,7 +97,6 @@ static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum) #endif // CONFIG_DEBUG_TCG } -/* Marked as always_inline to avoid the |= if called with CREG_INTEGER. */ static inline QEMU_ALWAYS_INLINE void set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state) { @@ -117,14 +107,7 @@ set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state) } cheri_debug_assert(regnum < 32); - /* - * Note: QEMU's deposit64 has assertions enabled (even in release mode). - * Since this is a hot path, we re-implement it without assertions here. - */ - gpcrs->capreg_state &= capreg_state_set_to_integer_mask(regnum); - if (!__builtin_constant_p(new_state) || new_state != 0) { - gpcrs->capreg_state |= (((uint64_t)new_state) << (regnum * 2)); - } + gpcrs->capreg_state[regnum] = new_state; // Check that the compressed and decompressed caps are in sync sanity_check_capreg(gpcrs, regnum); } @@ -417,7 +400,9 @@ static inline void reset_capregs(CPUArchState *env) { // Reset all to NULL: GPCapRegs *gpcrs = cheri_get_gpcrs(env); - gpcrs->capreg_state = UINT64_MAX; // All decompressed values + for (size_t i = 0; i < ARRAY_SIZE(gpcrs->capreg_state); i++) { + gpcrs->capreg_state[i] = CREG_FULLY_DECOMPRESSED; + } for (size_t i = 0; i < ARRAY_SIZE(gpcrs->decompressed); i++) { const cap_register_t* newval = null_capability(&gpcrs->decompressed[i]); // Register should be fully decompressed @@ -432,7 +417,9 @@ static inline void set_max_perms_capregs(CPUArchState *env) { // Reset all to max perms (except NULL of course): GPCapRegs *gpcrs = cheri_get_gpcrs(env); - gpcrs->capreg_state = UINT64_MAX; // All decompressed values + for (size_t i = 0; i < ARRAY_SIZE(gpcrs->capreg_state); i++) { + gpcrs->capreg_state[i] = CREG_FULLY_DECOMPRESSED; + } null_capability(&gpcrs->decompressed[NULL_CAPREG_INDEX]); sanity_check_capreg(gpcrs, NULL_CAPREG_INDEX); for (size_t i = 0; i < ARRAY_SIZE(gpcrs->decompressed); i++) { diff --git a/target/riscv/translate.c b/target/riscv/translate.c index 1cd8792f282..f58770c5781 100644 --- a/target/riscv/translate.c +++ b/target/riscv/translate.c @@ -225,8 +225,11 @@ static inline void gen_mark_gpr_as_integer(int reg_num_dst) { /* Currently, the integer flag is 0, so we can mask the 64-bit value holding * the capreg state appropriately to clear the bits for register N. */ - tcg_gen_andi_i64(cpu_capreg_state, cpu_capreg_state, - capreg_state_set_to_integer_mask(reg_num_dst)); + TCGv_i32 integer_state = tcg_const_i32(CREG_INTEGER); + tcg_gen_st8_i32( + integer_state, cpu_env, + offsetof(CPURISCVState, gpcapregs.capreg_state[reg_num_dst])); + tcg_temp_free_i32(integer_state); tcg_gen_movi_tl(_cpu_pesbt_do_not_access_directly[reg_num_dst], CAP_NULL_PESBT); /* TODO: maybe all ones is more efficient? We can just do an or and don't @@ -1085,9 +1088,6 @@ void riscv_translate_init(void) offsetof(CPURISCVState, gpcapregs.decompressed[i].cached_pesbt), cheri_gp_regnames[i]); } - cpu_capreg_state = tcg_global_mem_new_i64( - cpu_env, offsetof(CPURISCVState, gpcapregs.capreg_state), - "capreg_state"); #endif #ifdef CONFIG_RVFI_DII cpu_rvfi_available_fields = tcg_global_mem_new_i32( diff --git a/tcg/tcg.c b/tcg/tcg.c index acdd72e6d52..fe24485c970 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -168,7 +168,6 @@ TCGv _pc_is_current = 0; #endif #ifdef TARGET_CHERI TCGv ddc_interposition; -TCGv_i64 cpu_capreg_state; // 32 times 2 bits #endif #ifndef CONFIG_TCG_INTERPRETER