From 4d32a7a889b8ce47d7e3293395b4635b33a505a4 Mon Sep 17 00:00:00 2001 From: Alex Richardson Date: Wed, 7 Jul 2021 12:45:42 +0100 Subject: [PATCH] Avoid QEMU's deposit64/extract64 in hot code paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions have assertions that are enabled even in release mode and those assertions show up while profiling QEMU booting CheriBSD. Re-implementing them without assertions gives a small but measurable speedup: ``` hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri '{qemu} -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' Benchmark #1: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh Time (mean ± σ): 9.494 s ± 0.054 s [User: 8.519 s, System: 0.178 s] Range (min … max): 9.443 s … 9.600 s 10 runs Benchmark #2: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh Time (mean ± σ): 9.284 s ± 0.043 s [User: 8.249 s, System: 0.135 s] Range (min … max): 9.234 s … 9.381 s 10 runs Summary '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' 
ran 1.02 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' ``` --- .../cheri-common/cheri-lazy-capregs-types.h | 3 ++- target/cheri-common/cheri-lazy-capregs.h | 26 ++++++++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/target/cheri-common/cheri-lazy-capregs-types.h b/target/cheri-common/cheri-lazy-capregs-types.h index da903bbbc6b..81cccf06271 100644 --- a/target/cheri-common/cheri-lazy-capregs-types.h +++ b/target/cheri-common/cheri-lazy-capregs-types.h @@ -63,7 +63,8 @@ typedef enum CapRegState { CREG_TAGGED_CAP = 0b10, /// This capability register holds a fully decompressed capability. /// The tag bit can be read from the cap_register_t structure. - CREG_FULLY_DECOMPRESSED = 0b11 + CREG_FULLY_DECOMPRESSED = 0b11, + CREG_STATE_MASK = 0b11, } CapRegState; // Cap registers should be padded so they are easier to move. diff --git a/target/cheri-common/cheri-lazy-capregs.h b/target/cheri-common/cheri-lazy-capregs.h index 5ae871e5fd8..3f4eedfa815 100644 --- a/target/cheri-common/cheri-lazy-capregs.h +++ b/target/cheri-common/cheri-lazy-capregs.h @@ -47,15 +47,20 @@ static inline GPCapRegs *cheri_get_gpcrs(CPUArchState *env); -static inline uint64_t capreg_state_set_to_integer_mask(unsigned reg) +static inline QEMU_ALWAYS_INLINE uint64_t +capreg_state_set_to_integer_mask(unsigned reg) { - return ~(UINT64_C(3) << (reg * 2)); + return ~(((uint64_t)CREG_STATE_MASK) << (reg * 2)); } static inline CapRegState get_capreg_state(const GPCapRegs *gpcrs, unsigned reg) { cheri_debug_assert(reg < 32); - return (CapRegState)extract64(gpcrs->capreg_state, reg * 2, 2); + /* + * Note: QEMU's extract64 has assertions enabled (even in release mode). 
+ * Since this is a hot path, we re-implement it without assertions here. + */ + return (CapRegState)((gpcrs->capreg_state >> (reg * 2)) & CREG_STATE_MASK); } static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum) @@ -101,8 +106,9 @@ static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum) #endif // CONFIG_DEBUG_TCG } -static inline void set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, - CapRegState new_state) +/* Marked as always_inline to avoid the |= if called with CREG_INTEGER. */ +static inline QEMU_ALWAYS_INLINE void +set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state) { if (regnum == NULL_CAPREG_INDEX) { cheri_debug_assert(new_state == CREG_FULLY_DECOMPRESSED && @@ -111,8 +117,14 @@ static inline void set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, } cheri_debug_assert(regnum < 32); - gpcrs->capreg_state = - deposit64(gpcrs->capreg_state, regnum * 2, 2, new_state); + /* + * Note: QEMU's deposit64 has assertions enabled (even in release mode). + * Since this is a hot path, we re-implement it without assertions here. + */ + gpcrs->capreg_state &= capreg_state_set_to_integer_mask(regnum); + if (!__builtin_constant_p(new_state) || new_state != 0) { + gpcrs->capreg_state |= (((uint64_t)new_state) << (regnum * 2)); + } // Check that the compressed and decompressed caps are in sync sanity_check_capreg(gpcrs, regnum); }