Skip to content

Commit

Permalink
Stop packing capreg_state into a single 64-bit integer
Browse files Browse the repository at this point in the history
The bitwise operations are more expensive than expanding the 64-bit value
to a 32-byte array. This results in a 1.21x speedup running the MFS_ROOT
kernel and a 1.08x speedup for the full purecap kernel+purecap userspace
boot. Benchmarks using a previous version of this patch indicated a 1.08x
speedup for the MFS_ROOT case. My assumption is that removing the TCG
global accesses to cpu_capreg_state now has a larger impact after
0c09763 removed the incorrect NO_RWG flag.

MFS_ROOT boot:
```
hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri  '{qemu} -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' -w 1
Benchmark #1: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh
  Time (mean ± σ):      9.499 s ±  0.027 s    [User: 8.581 s, System: 0.136 s]
  Range (min … max):    9.448 s …  9.539 s    10 runs

Benchmark #2: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh
  Time (mean ± σ):      9.281 s ±  0.029 s    [User: 8.260 s, System: 0.137 s]
  Range (min … max):    9.234 s …  9.326 s    10 runs

Benchmark #3: /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh
  Time (mean ± σ):      7.852 s ±  0.053 s    [User: 7.523 s, System: 0.182 s]
  Range (min … max):    7.793 s …  7.933 s    10 runs

Summary
  '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh' ran
    1.18 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.no-deposit-assertions -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh'
    1.21 ± 0.01 times faster than '/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 -M virt -m 2048 -nographic -bios bbl-riscv64cheri-virt-fw_jump.bin -kernel /local/scratch/alr48/cheri/output/kernel-riscv64-purecap.CHERI-PURECAP-QEMU-MFS-ROOT -append init_path=/sbin/startup-benchmark.sh'
```
Full purecap+purecap boot:
```
hyperfine -L qemu /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123,/local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd {qemu} --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' -m 3
Benchmark #1: /home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest
  Time (mean ± σ):     227.351 s ±  0.484 s    [User: 213.193 s, System: 3.544 s]
  Range (min … max):   226.792 s … 227.646 s    3 runs

  Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet PC without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options.

Benchmark #2: /home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest
  Time (mean ± σ):     210.156 s ±  3.601 s    [User: 197.448 s, System: 1.979 s]
  Range (min … max):   206.397 s … 213.575 s    3 runs

Summary
  '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest' ran
    1.08 ± 0.02 times faster than '/home/alr48/devel/cheribuild/test-scripts/run_cheribsd_tests.py --ssh-key /home/alr48/.ssh/insecure_id_ed25519.pub --architecture riscv64-purecap --kernel /local/scratch/alr48/cheri/output/rootfs-riscv64-purecap/boot/kernel.CHERI-PURECAP-QEMU/kernel --qemu-cmd /local/scratch/alr48/cheri/output/sdk/bin/qemu-system-riscv64cheri.v5.2.0-933-g0c09763123 --disk-image /local/scratch/alr48/cheri/output/cheribsd-riscv64-purecap.img --no-run-cheribsdtest'
```
  • Loading branch information
arichardson committed Jul 7, 2021
1 parent 2d7de03 commit 667033e
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 31 deletions.
1 change: 0 additions & 1 deletion include/tcg/tcg.h
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,6 @@ extern uintptr_t tcg_splitwx_diff;
extern TCGv_env cpu_env;
#ifdef TARGET_CHERI
extern TCGv ddc_interposition;
extern TCGv_i64 cpu_capreg_state; // 32 times 2 bits
#endif
#ifdef CONFIG_DEBUG_TCG
extern TCGv _pc_is_current;
Expand Down
2 changes: 1 addition & 1 deletion target/cheri-common/cheri-lazy-capregs-types.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,6 @@ typedef struct GPCapRegs {
// We cache the decompressed capregs here (to avoid constantly decompressing
// values such as $csp which are used frequently)
cap_register_t decompressed[32];
uint64_t capreg_state; // 32 times CapRegState compressed to one uint64_t
uint8_t capreg_state[32] QEMU_ALIGNED(64); /* 32 times CapRegState */
} GPCapRegs;
#endif
33 changes: 10 additions & 23 deletions target/cheri-common/cheri-lazy-capregs.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,11 @@

static inline GPCapRegs *cheri_get_gpcrs(CPUArchState *env);

static inline QEMU_ALWAYS_INLINE uint64_t
capreg_state_set_to_integer_mask(unsigned reg)
{
return ~(((uint64_t)CREG_STATE_MASK) << (reg * 2));
}

static inline CapRegState get_capreg_state(const GPCapRegs *gpcrs, unsigned reg)
static inline QEMU_ALWAYS_INLINE CapRegState
get_capreg_state(const GPCapRegs *gpcrs, unsigned reg)
{
cheri_debug_assert(reg < 32);
/*
* Note: QEMU's extract64 has assertions enabled (even in release mode).
* Since this is a hot path, we re-implement it without assertions here.
*/
return (CapRegState)((gpcrs->capreg_state >> (reg * 2)) & CREG_STATE_MASK);
return (CapRegState)gpcrs->capreg_state[reg];
}

static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum)
Expand Down Expand Up @@ -106,7 +97,6 @@ static inline void sanity_check_capreg(GPCapRegs *gpcrs, unsigned regnum)
#endif // CONFIG_DEBUG_TCG
}

/* Marked as always_inline to avoid the |= if called with CREG_INTEGER. */
static inline QEMU_ALWAYS_INLINE void
set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state)
{
Expand All @@ -117,14 +107,7 @@ set_capreg_state(GPCapRegs *gpcrs, unsigned regnum, CapRegState new_state)
}

cheri_debug_assert(regnum < 32);
/*
* Note: QEMU's deposit64 has assertions enabled (even in release mode).
* Since this is a hot path, we re-implement it without assertions here.
*/
gpcrs->capreg_state &= capreg_state_set_to_integer_mask(regnum);
if (!__builtin_constant_p(new_state) || new_state != 0) {
gpcrs->capreg_state |= (((uint64_t)new_state) << (regnum * 2));
}
gpcrs->capreg_state[regnum] = new_state;
// Check that the compressed and decompressed caps are in sync
sanity_check_capreg(gpcrs, regnum);
}
Expand Down Expand Up @@ -417,7 +400,9 @@ static inline void reset_capregs(CPUArchState *env)
{
// Reset all to NULL:
GPCapRegs *gpcrs = cheri_get_gpcrs(env);
gpcrs->capreg_state = UINT64_MAX; // All decompressed values
for (size_t i = 0; i < ARRAY_SIZE(gpcrs->capreg_state); i++) {
gpcrs->capreg_state[i] = CREG_FULLY_DECOMPRESSED;
}
for (size_t i = 0; i < ARRAY_SIZE(gpcrs->decompressed); i++) {
const cap_register_t* newval = null_capability(&gpcrs->decompressed[i]);
// Register should be fully decompressed
Expand All @@ -432,7 +417,9 @@ static inline void set_max_perms_capregs(CPUArchState *env)
{
// Reset all to max perms (except NULL of course):
GPCapRegs *gpcrs = cheri_get_gpcrs(env);
gpcrs->capreg_state = UINT64_MAX; // All decompressed values
for (size_t i = 0; i < ARRAY_SIZE(gpcrs->capreg_state); i++) {
gpcrs->capreg_state[i] = CREG_FULLY_DECOMPRESSED;
}
null_capability(&gpcrs->decompressed[NULL_CAPREG_INDEX]);
sanity_check_capreg(gpcrs, NULL_CAPREG_INDEX);
for (size_t i = 0; i < ARRAY_SIZE(gpcrs->decompressed); i++) {
Expand Down
10 changes: 5 additions & 5 deletions target/riscv/translate.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,11 @@ static inline void gen_mark_gpr_as_integer(int reg_num_dst)
{
/* Currently, the integer flag is 0, so we can mask the 64-bit value holding
* the capreg state appropriately to clear the bits for register N. */
tcg_gen_andi_i64(cpu_capreg_state, cpu_capreg_state,
capreg_state_set_to_integer_mask(reg_num_dst));
TCGv_i32 integer_state = tcg_const_i32(CREG_INTEGER);
tcg_gen_st8_i32(
integer_state, cpu_env,
offsetof(CPURISCVState, gpcapregs.capreg_state[reg_num_dst]));
tcg_temp_free_i32(integer_state);
tcg_gen_movi_tl(_cpu_pesbt_do_not_access_directly[reg_num_dst],
CAP_NULL_PESBT);
/* TODO: maybe all ones is more efficient? We can just do an or and don't
Expand Down Expand Up @@ -1085,9 +1088,6 @@ void riscv_translate_init(void)
offsetof(CPURISCVState, gpcapregs.decompressed[i].cached_pesbt),
cheri_gp_regnames[i]);
}
cpu_capreg_state = tcg_global_mem_new_i64(
cpu_env, offsetof(CPURISCVState, gpcapregs.capreg_state),
"capreg_state");
#endif
#ifdef CONFIG_RVFI_DII
cpu_rvfi_available_fields = tcg_global_mem_new_i32(
Expand Down
1 change: 0 additions & 1 deletion tcg/tcg.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,6 @@ TCGv _pc_is_current = 0;
#endif
#ifdef TARGET_CHERI
TCGv ddc_interposition;
TCGv_i64 cpu_capreg_state; // 32 times 2 bits
#endif

#ifndef CONFIG_TCG_INTERPRETER
Expand Down

0 comments on commit 667033e

Please sign in to comment.