diff --git a/benchmarks/lockhammer/Makefile b/benchmarks/lockhammer/Makefile index ad1e730..600c8c4 100644 --- a/benchmarks/lockhammer/Makefile +++ b/benchmarks/lockhammer/Makefile @@ -31,7 +31,8 @@ TEST_TARGETS=lh_swap_mutex \ lh_queued_spinlock \ lh_empty \ lh_jvm_objectmonitor \ - lh_tbb_spin_rw_mutex + lh_tbb_spin_rw_mutex \ + lh_osq_lock ifeq ($(TARGET_ARCH),aarch64) TEST_TARGETS+=lh_hybrid_spinlock \ @@ -55,6 +56,9 @@ lh_hybrid_spinlock: ../../ext/linux/hybrid_spinlock.h include/atomics.h ../../ex lh_hybrid_spinlock_fastdequeue: ../../ext/linux/hybrid_spinlock_fastdequeue.h include/atomics.h ../../ext/linux/include/lk_atomics.h src/lockhammer.c ${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS} +lh_osq_lock: ../../ext/linux/osq_lock.h ../../ext/linux/include/lk_atomics.h ../../ext/linux/include/lk_barrier.h ../../ext/linux/include/lk_cmpxchg.h include/atomics.h src/lockhammer.c + ${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS} + lh_queued_spinlock: ../../ext/linux/queued_spinlock.h include/atomics.h ../../ext/linux/include/lk_atomics.h src/lockhammer.c ${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS} diff --git a/ext/linux/include/lk_atomics.h b/ext/linux/include/lk_atomics.h index 95b0cbb..86e9b06 100644 --- a/ext/linux/include/lk_atomics.h +++ b/ext/linux/include/lk_atomics.h @@ -392,3 +392,7 @@ do { \ #define arch_mcs_spin_unlock_contended(l) \ smp_store_release((l), 1) + +#define ATOMIC_INIT(i) { (i) } +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) diff --git a/ext/linux/include/lk_barrier.h b/ext/linux/include/lk_barrier.h new file mode 100644 index 0000000..057ac57 --- /dev/null +++ b/ext/linux/include/lk_barrier.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Based on Linux kernel 4.16.10 + * arch/arm64/include/asm/barrier.h + * arch/x86/include/asm/barrier.h + * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115 + */ + +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +#include "lk_cmpxchg.h" + +#if defined(__x86_64__) + +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#define dma_rmb() barrier() +#define dma_wmb() barrier() +#define smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc") +#define smp_rmb() dma_rmb() +#define smp_wmb() barrier() +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) + + +/* Atomic operations are already serializing on x86 */ +#define __smp_mb__before_atomic() barrier() +#define __smp_mb__after_atomic() barrier() + + +#elif defined(__aarch64__) + +#define isb() asm volatile("isb" : : : "memory") +#define dmb(opt) asm volatile("dmb " #opt : : : "memory") +#define dsb(opt) asm volatile("dsb " #opt : : : "memory") +#define psb_csync() asm volatile("hint #17" : : : "memory") +#define csdb() asm volatile("hint #20" : : : "memory") +#define mb() dsb(sy) +#define rmb() dsb(ld) +#define wmb() dsb(st) +#define dma_rmb() dmb(oshld) +#define dma_wmb() dmb(oshst) +#define smp_mb() dmb(ish) +#define smp_rmb() dmb(ishld) +#define smp_wmb() dmb(ishst) + +#else /* No Arch */ + /* TODO: No Arch Default */ +#endif /* __x86_64__ */ + +#endif /* __ASM_BARRIER_H */ diff --git a/ext/linux/include/lk_cmpxchg.h b/ext/linux/include/lk_cmpxchg.h new file mode 100644 index 0000000..85178b8 --- 
/dev/null +++ b/ext/linux/include/lk_cmpxchg.h @@ -0,0 +1,463 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Based on Linux kernel 4.16.10 + * arch/arm64/include/asm/cmpxchg.h + * arch/x86/include/asm/cmpxchg.h + * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115 + */ + +#ifndef __ASM_CMPXCHG_H +#define __ASM_CMPXCHG_H + +#if defined(__x86_64__) +#define LOCK_PREFIX_HERE \ + ".pushsection .smp_locks,\"a\"\n" \ + ".balign 4\n" \ + ".long 671f - .\n" /* offset */ \ + ".popsection\n" \ + "671:" + +#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " + +/* + * Constants for operation sizes. On 32-bit, the 64-bit size it set to + * -1 because sizeof will never return -1, thereby making those switch + * case statements guaranteeed dead code which the compiler will + * eliminate, and allowing the "missing symbol in the default case" to + * indicate a usage error. + */ +#define __X86_CASE_B 1 +#define __X86_CASE_W 2 +#define __X86_CASE_L 4 +#define __X86_CASE_Q 8 + +/* + * An exchange-type operation, which takes a value and a pointer, and + * returns the old value. + */ +#define __xchg_op(ptr, arg, op, lock) \ + ({ \ + __typeof__ (*(ptr)) __ret = (arg); \ + switch (sizeof(*(ptr))) { \ + case __X86_CASE_B: \ + asm volatile (lock #op "b %b0, %1\n" \ + : "+q" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_W: \ + asm volatile (lock #op "w %w0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_L: \ + asm volatile (lock #op "l %0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_Q: \ + asm volatile (lock #op "q %q0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + } \ + __ret; \ + }) + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. + */ +#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") + + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. 
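The x86 xchg() defined above always returns the previous memory contents and is fully serializing even without an explicit lock prefix. A small self-contained sketch of that contract, using a GCC builtin purely as a stand-in for the macro (not the macro itself):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t flag = 0;
    /* atomically store 1 and get back whatever was there before */
    uint64_t old = __atomic_exchange_n(&flag, 1, __ATOMIC_SEQ_CST);
    assert(old == 0 && flag == 1);
    return 0;
}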
+ */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "q" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + } \ + __ret; \ +}) + +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) + +#define __sync_cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + +#define __cmpxchg_local(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "") + +#define cmpxchg(ptr, old, new) \ + __cmpxchg(ptr, old, new, sizeof(*(ptr))) + +#define sync_cmpxchg(ptr, old, new) \ + __sync_cmpxchg(ptr, old, new, sizeof(*(ptr))) + +#define cmpxchg_local(ptr, old, new) \ + __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) + +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} + +#define atomic_cmpxchg_relaxed atomic_cmpxchg +#define atomic_cmpxchg_acquire atomic_cmpxchg +#define atomic_cmpxchg_release atomic_cmpxchg +#define atomic_xchg_relaxed atomic_xchg +#define atomic_xchg_acquire atomic_xchg +#define atomic_xchg_release atomic_xchg + + +#elif defined(__aarch64__) + +#define unreachable() \ + do { \ + asm volatile(""); \ + __builtin_unreachable(); \ + } while (0) + +#define notrace __attribute__((no_instrument_function)) + +#define __nops(n) ".rept " #n "\nnop\n.endr\n" +#define nops(n) asm volatile(__nops(n)) + +/* Move the ll/sc atomics out-of-line */ +#define __LL_SC_INLINE notrace +#define __LL_SC_PREFIX(x) __ll_sc_##x + +#define __CMPXCHG_CASE(w, sz, name, mb, acq, rel, cl) \ +__LL_SC_INLINE unsigned long \ +__LL_SC_PREFIX(__cmpxchg_case_##name(volatile void *ptr, \ + unsigned long old, \ + unsigned long new)) \ +{ \ + unsigned long tmp, oldval; \ + \ + asm volatile( \ + /* LL/SC */ \ + " prfm pstl1strm, %[v]\n" \ + "1: ld" #acq "xr" #sz "\t%" #w "[oldval], %[v]\n" \ + " eor %" #w "[tmp], %" #w "[oldval], %" #w "[old]\n" \ + " cbnz %" #w "[tmp], 2f\n" \ + " st" #rel "xr" #sz "\t%w[tmp], %" #w "[new], %[v]\n" \ + " cbnz %w[tmp], 1b\n" \ + " " #mb "\n" \ + "2:" \ + : [tmp] "=&r" (tmp), [oldval] "=&r" (oldval), \ + [v] "+Q" (*(unsigned long *)ptr) \ + : [old] "Lr" (old), [new] "r" (new) \ + : cl); \ + \ + return oldval; \ +} \ + +__CMPXCHG_CASE(w, b, 1, , , , ) +__CMPXCHG_CASE(w, h, 2, , , , ) +__CMPXCHG_CASE(w, , 4, , , , ) +__CMPXCHG_CASE( , , 8, , , , ) +__CMPXCHG_CASE(w, b, acq_1, , a, , "memory") +__CMPXCHG_CASE(w, h, acq_2, , a, , "memory") +__CMPXCHG_CASE(w, , acq_4, , a, , "memory") +__CMPXCHG_CASE( , , acq_8, , a, , "memory") 
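Each __CMPXCHG_CASE() instantiation above expands to a helper such as __ll_sc___cmpxchg_case_acq_4() that returns the value previously held in memory; the caller detects success by comparing that return value with the expected old value. A standalone sketch of the same contract, with a GCC builtin standing in for the generated assembly:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* returns the prior memory contents, exactly like the cmpxchg helpers */
static uint32_t cas_like(uint32_t *p, uint32_t old, uint32_t new)
{
    uint32_t expected = old;
    __atomic_compare_exchange_n(p, &expected, new, 0,
                                __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
    return expected;
}

int main(void)
{
    uint32_t v = 5;
    printf("%" PRIu32 "\n", cas_like(&v, 5, 7));  /* prints 5, v becomes 7 */
    printf("%" PRIu32 "\n", cas_like(&v, 5, 9));  /* prints 7, v unchanged */
    return 0;
}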
+__CMPXCHG_CASE(w, b, rel_1, , , l, "memory") +__CMPXCHG_CASE(w, h, rel_2, , , l, "memory") +__CMPXCHG_CASE(w, , rel_4, , , l, "memory") +__CMPXCHG_CASE( , , rel_8, , , l, "memory") +__CMPXCHG_CASE(w, b, mb_1, dmb ish, , l, "memory") +__CMPXCHG_CASE(w, h, mb_2, dmb ish, , l, "memory") +__CMPXCHG_CASE(w, , mb_4, dmb ish, , l, "memory") +__CMPXCHG_CASE( , , mb_8, dmb ish, , l, "memory") + +#undef __CMPXCHG_CASE + +#define __LSE_CMPXCHG_CASE(w, sz, name, mb, cl...) \ +static inline unsigned long __cmpxchg_case_##name(volatile void *ptr, \ + unsigned long old, \ + unsigned long new) \ +{ \ + register unsigned long x0 asm ("x0") = (unsigned long)ptr; \ + register unsigned long x1 asm ("x1") = old; \ + register unsigned long x2 asm ("x2") = new; \ + \ + asm volatile( \ + /* LSE atomics */ \ + " mov " #w "30, %" #w "[old]\n" \ + " cas" #mb #sz "\t" #w "30, %" #w "[new], %[v]\n" \ + " mov %" #w "[ret], " #w "30" \ + : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr) \ + : [old] "r" (x1), [new] "r" (x2) \ + : cl); \ + \ + return x0; \ +} + +__LSE_CMPXCHG_CASE(w, b, 1, ) +__LSE_CMPXCHG_CASE(w, h, 2, ) +__LSE_CMPXCHG_CASE(w, , 4, ) +__LSE_CMPXCHG_CASE(x, , 8, ) +__LSE_CMPXCHG_CASE(w, b, acq_1, a, "memory") +__LSE_CMPXCHG_CASE(w, h, acq_2, a, "memory") +__LSE_CMPXCHG_CASE(w, , acq_4, a, "memory") +__LSE_CMPXCHG_CASE(x, , acq_8, a, "memory") +__LSE_CMPXCHG_CASE(w, b, rel_1, l, "memory") +__LSE_CMPXCHG_CASE(w, h, rel_2, l, "memory") +__LSE_CMPXCHG_CASE(w, , rel_4, l, "memory") +__LSE_CMPXCHG_CASE(x, , rel_8, l, "memory") +__LSE_CMPXCHG_CASE(w, b, mb_1, al, "memory") +__LSE_CMPXCHG_CASE(w, h, mb_2, al, "memory") +__LSE_CMPXCHG_CASE(w, , mb_4, al, "memory") +__LSE_CMPXCHG_CASE(x, , mb_8, al, "memory") + +#undef __LSE_CMPXCHG_CASE + +/* + * aarch64 cmpxchg implementation has been modified to disable runtime + * binary patching and use LL/SC assemblies directly without hard branch + * and link inside LSE CMPXCHG_GEN. This resolved a bug which was related + * to missing CFLAGS_atomic_ll_sc.o in user space. The special CFLAGS in + * arch/arm64/lib/Makefile tells the compiler to treat all general + * purpose registers (with the exception of the IP registers, which are + * already handled by the caller in case of a PLT) as callee-saved, which + * allows for efficient runtime patching of the bl instruction in the + * caller with an atomic instruction when supported by the CPU. Result + * and argument registers are handled correctly, based on the function + * prototype. 
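With the runtime binary patching removed, the LL/SC or LSE variant is chosen once at build time. A hedged sketch of what a direct caller would resolve to under each configuration (pick_cmpxchg() is hypothetical; the two helper names come from the macros defined above and assume this header is included on aarch64):

static inline unsigned long pick_cmpxchg(volatile void *p,
                                         unsigned long old, unsigned long new)
{
#if defined(USE_LSE)
    return __cmpxchg_case_mb_8(p, old, new);          /* CASAL-based helper */
#else
    return __ll_sc___cmpxchg_case_mb_8(p, old, new);  /* LDXR/STLXR loop plus dmb ish */
#endif
}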
(__LL_SC_CLOBBERS and original __CMPXCHG_CASE) + */ + +#if defined(USE_LSE) /* ARMv8.1 with LSE */ +#define __CMPXCHG_GEN(sfx) \ +static inline unsigned long __cmpxchg##sfx(volatile void *ptr, \ + unsigned long old, \ + unsigned long new, \ + int size) \ +{ \ + switch (size) { \ + case 1: \ + return __cmpxchg_case##sfx##_1(ptr, (u8)old, new); \ + case 2: \ + return __cmpxchg_case##sfx##_2(ptr, (u16)old, new); \ + case 4: \ + return __cmpxchg_case##sfx##_4(ptr, old, new); \ + case 8: \ + return __cmpxchg_case##sfx##_8(ptr, old, new); \ + } \ + \ + unreachable(); \ +} +#else /* ARMv8.0 without LSE */ +#define __CMPXCHG_GEN(sfx) \ +static inline unsigned long __cmpxchg##sfx(volatile void *ptr, \ + unsigned long old, \ + unsigned long new, \ + int size) \ +{ \ + switch (size) { \ + case 1: \ + return __ll_sc___cmpxchg_case##sfx##_1(ptr, (u8)old, new); \ + case 2: \ + return __ll_sc___cmpxchg_case##sfx##_2(ptr, (u16)old, new); \ + case 4: \ + return __ll_sc___cmpxchg_case##sfx##_4(ptr, old, new); \ + case 8: \ + return __ll_sc___cmpxchg_case##sfx##_8(ptr, old, new); \ + } \ + \ + unreachable(); \ +} +#endif /* ARMv8 with or without LSE */ + +__CMPXCHG_GEN() +__CMPXCHG_GEN(_acq) +__CMPXCHG_GEN(_rel) +__CMPXCHG_GEN(_mb) + +#undef __CMPXCHG_GEN + +#define __cmpxchg_wrapper(sfx, ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __ret = (__typeof__(*(ptr))) \ + __cmpxchg##sfx((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + __ret; \ +}) + +/* cmpxchg */ +#define cmpxchg_relaxed(...) __cmpxchg_wrapper( , __VA_ARGS__) +#define cmpxchg_acquire(...) __cmpxchg_wrapper(_acq, __VA_ARGS__) +#define cmpxchg_release(...) __cmpxchg_wrapper(_rel, __VA_ARGS__) +#define cmpxchg(...) __cmpxchg_wrapper( _mb, __VA_ARGS__) +#define cmpxchg_local cmpxchg_relaxed + + +/* + * Original ARM64_LSE_ATOMIC_INSN is defined as ALTERNATIVE and would + * check runtime CPU capability and dynamically patch kernel binary. + * New ARM64_LSE_ATOMIC_INSN has been modified to use the first or second + * argument as output string depending on external USE_LSE define. + */ + +#if defined(USE_LSE) /* ARMv8.1 with LSE */ +#define ARM64_LSE_ATOMIC_INSN(llsc, lse) lse +__asm__(".arch_extension lse"); +#else /* ARMv8.0 without LSE */ +#define ARM64_LSE_ATOMIC_INSN(llsc, lse) llsc +#endif + +/* + * We need separate acquire parameters for ll/sc and lse, since the full + * barrier case is generated as release+dmb for the former and + * acquire+release for the latter. 
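The cmpxchg_relaxed/acquire/release/cmpxchg() wrappers defined above preserve the pointee's type, so they can be applied directly to plain scalar fields. A hedged usage sketch (claim_slot() is hypothetical and assumes the aarch64 path of this header is included):

/* returns 1 if we moved the slot from 0 (free) to our identifier */
static inline int claim_slot(int *slot, int me)
{
    return cmpxchg_acquire(slot, 0, me) == 0;
}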
+ */ +#define __XCHG_CASE(w, sz, name, mb, nop_lse, acq, acq_lse, rel, cl) \ +static inline unsigned long __xchg_case_##name(unsigned long x, \ + volatile void *ptr) \ +{ \ + unsigned long ret, tmp; \ + \ + asm volatile(ARM64_LSE_ATOMIC_INSN( \ + /* LL/SC */ \ + " prfm pstl1strm, %2\n" \ + "1: ld" #acq "xr" #sz "\t%" #w "0, %2\n" \ + " st" #rel "xr" #sz "\t%w1, %" #w "3, %2\n" \ + " cbnz %w1, 1b\n" \ + " " #mb, \ + /* LSE atomics */ \ + " swp" #acq_lse #rel #sz "\t%" #w "3, %" #w "0, %2\n" \ + __nops(3) \ + " " #nop_lse) \ + : "=&r" (ret), "=&r" (tmp), "+Q" (*(unsigned long *)ptr) \ + : "r" (x) \ + : cl); \ + \ + return ret; \ +} + +__XCHG_CASE(w, b, 1, , , , , , ) +__XCHG_CASE(w, h, 2, , , , , , ) +__XCHG_CASE(w, , 4, , , , , , ) +__XCHG_CASE( , , 8, , , , , , ) +__XCHG_CASE(w, b, acq_1, , , a, a, , "memory") +__XCHG_CASE(w, h, acq_2, , , a, a, , "memory") +__XCHG_CASE(w, , acq_4, , , a, a, , "memory") +__XCHG_CASE( , , acq_8, , , a, a, , "memory") +__XCHG_CASE(w, b, rel_1, , , , , l, "memory") +__XCHG_CASE(w, h, rel_2, , , , , l, "memory") +__XCHG_CASE(w, , rel_4, , , , , l, "memory") +__XCHG_CASE( , , rel_8, , , , , l, "memory") +__XCHG_CASE(w, b, mb_1, dmb ish, nop, , a, l, "memory") +__XCHG_CASE(w, h, mb_2, dmb ish, nop, , a, l, "memory") +__XCHG_CASE(w, , mb_4, dmb ish, nop, , a, l, "memory") +__XCHG_CASE( , , mb_8, dmb ish, nop, , a, l, "memory") + +#undef __XCHG_CASE + +#define __XCHG_GEN(sfx) \ +static inline unsigned long __xchg##sfx(unsigned long x, \ + volatile void *ptr, \ + int size) \ +{ \ + switch (size) { \ + case 1: \ + return __xchg_case##sfx##_1(x, ptr); \ + case 2: \ + return __xchg_case##sfx##_2(x, ptr); \ + case 4: \ + return __xchg_case##sfx##_4(x, ptr); \ + case 8: \ + return __xchg_case##sfx##_8(x, ptr); \ + } \ + \ + unreachable(); \ +} + +__XCHG_GEN() +__XCHG_GEN(_acq) +__XCHG_GEN(_rel) +__XCHG_GEN(_mb) + +#undef __XCHG_GEN + +#define __xchg_wrapper(sfx, ptr, x) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __ret = (__typeof__(*(ptr))) \ + __xchg##sfx((unsigned long)(x), (ptr), sizeof(*(ptr))); \ + __ret; \ +}) + +/* xchg */ +#define xchg_relaxed(...) __xchg_wrapper( , __VA_ARGS__) +#define xchg_acquire(...) __xchg_wrapper(_acq, __VA_ARGS__) +#define xchg_release(...) __xchg_wrapper(_rel, __VA_ARGS__) +#define xchg(...) 
__xchg_wrapper( _mb, __VA_ARGS__) + +#define atomic_cmpxchg_relaxed(v, old, new) \ + cmpxchg_relaxed(&((v)->counter), (old), (new)) +#define atomic_cmpxchg_acquire(v, old, new) \ + cmpxchg_acquire(&((v)->counter), (old), (new)) +#define atomic_cmpxchg_release(v, old, new) \ + cmpxchg_release(&((v)->counter), (old), (new)) +#define atomic_cmpxchg(v, old, new) cmpxchg(&((v)->counter), (old), (new)) + +#define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new)) +#define atomic_xchg_acquire(v, new) xchg_acquire(&((v)->counter), (new)) +#define atomic_xchg_release(v, new) xchg_release(&((v)->counter), (new)) +#define atomic_xchg(v, new) xchg(&((v)->counter), (new)) + +#else /* Unknown Arch */ + /* TODO: No Arch Default */ +#endif /* __x86_64__ */ + +#endif /* __ASM_CMPXCHG_H */ diff --git a/ext/linux/osq_lock.h b/ext/linux/osq_lock.h new file mode 100644 index 0000000..d576a20 --- /dev/null +++ b/ext/linux/osq_lock.h @@ -0,0 +1,526 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Based on Linux kernel 4.16.10 + * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git + * /commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115 + * + * Description: + * + * This workload implements kernel 'optimistic spin queue' derived from mcs + * lock. Tunable unqueue_retry times and max_backoff_sleep duration have + * also been added to simulate need_resched() condition and unqueue current + * cpu node from spinning queue and put to sleep. + * + * Changes from Linux kernel osq_lock.c + * + * The original DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, + * osq_node) was modified to 128 byte aligned optimistic_spin_node C array + * allocated in heap during osq_lock_init() in main thread. It was pointed + * by global_osq_nodepool_ptr pointer. The osq lock queue struct itself was + * declared as a global variable too, which would substitute upper level + * mutex lock struct indicated by lock pointer. Therefore we don't need to + * get the lock pointer from lock_acquire() and lock_release() interface. + * The spinning node structure can be linearly located by osq_nodepool_ptr + * with threadnum/coreid as offset. The tail of osq_lock can be accessed + * by global_osq directly. + * + * We haven't changed the algorithm except adding unqueue_retry and max_ + * sleep_us as optional backoff sleep to mimic kernel rescheduling events. + * By default we essentially disable unqueue_retry and backoff sleep so + * that osq_lock performance is more stable and similar to mcs queue spin + * lock. + * + * Internals: + * + * In order to port osq_lock from kernel space to user space, we added + * lk_barrier.h and lk_cmpxchg.h to synchronization-benchmarks/ext/linux/ + * include. Because there are some special gcc options to restrict compiler + * from allocating x16/x17 registers in arch/arm64/lib/Makefile for + * atomic_ll_sc.o, and our osq_lock.h included from lockhammer.c will not + * generate any other separate object file, we have to modify cmpxchg.h + * and change cmpxchg LLSC/LSE implementation for aarch64. + * + * Kernel arm64 cmpxchg.h supports both LLSC (load-link/store-conditional) + * and LSE (Armv8.1 large system extension) via dynamic binary patching. + * If CONFIG_AS_LSE and CONFIG_ARM64_LSE_ATOMICS have been enabled, kernel + * will use Armv8.1 new atomic instructions CAS to implement the compare + * and swap function. This inline function has 3 instructions mov/cas/mov, + * which will be overwritten during system boot up if the CPU doesn't + * support Armv8.1 LSE. 
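With the atomic_cmpxchg_*()/atomic_xchg_*() wrappers now in place, an atomic_t tail can be manipulated the same way the osq code further below does it. A hedged sketch (try_become_tail() is hypothetical; atomic_t with an int counter field is assumed to come from lk_atomics.h):

/* succeed only if the queue was empty, i.e. the tail still held 0 */
static inline int try_become_tail(atomic_t *tail, int me)
{
    return atomic_cmpxchg_acquire(tail, 0, me) == 0;
}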
The 3 new instructions are bl/nop/nop. The branch + * and link instruction will redirect program flow to Armv8.0 LLSC function + * without saving any of the caller's local registers. These registers are + * guaranteed to be safe because LLSC function in atomic_ll_sc.o only uses + * x16/x17 and LSE caller doesn't use x16/x17. + * + * Since lockhammer doesn't have runtime cpu detection, whether to use LLSC + * or LSE is manually defined in lockhammer Makefile. Therefore our new + * cmpxchg is also statically defined without branch and link or binary + * patching. LLSC and LSE cmpxchg will share the same interface but use + * different assembly codes and functions. + * + * Workings: + * + * osq_lock works similar to mcs spinlock except the optional unqueue path. + * Linux kernel qspinlock is slightly different than original mcs spinlock. + * + * Tuning Parameters + * + * Optional unqueue and backoff sleep feature like kernel mutex + * + * [-- [-u unqueue_retry]]: how many spin retries before jumping to unqueue + * path and stop spinning. + * + * [-- [-s max_sleep_us]]: how long to sleep after unqueue from osq before + * another osq_lock() acquisition attempt. This + * parameter only defines the maximum sleep time in + * microseconds, each thread will sleep for random + * time less than this max_sleep_us. The actual + * sleep time is predetermined during main thread + * initialization phase with uniform distribution + * random function rand(). + * + */ + +#ifndef __LINUX_OSQ_LOCK_H +#define __LINUX_OSQ_LOCK_H + +/* redefine initialize_lock and parse_test_args with local functions */ +#ifdef initialize_lock +#undef initialize_lock +#endif + +#ifdef parse_test_args +#undef parse_test_args +#endif + +#define initialize_lock(lock, threads) osq_lock_init(lock, threads) +#define parse_test_args(args, argc, argv) osq_parse_args(args, argc, argv) + +#include +#include "atomics.h" +#include "lk_atomics.h" +#include "lk_cmpxchg.h" +#include "lk_barrier.h" + +#define ATOMIC_INIT(i) { (i) } + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + * + * Using 128 bytes alignment to eliminate false sharing for various Armv8 core + * cache line size + */ +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ + int random_sleep; /* random sleep in us */ +} __attribute__ ((aligned (128))); + +struct optimistic_spin_queue { + /* + * Stores an encoded value of the CPU # of the tail node in the queue. + * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL. + */ + atomic_t tail; +}; + +/* 0 means thread unlocked, 1~N represents each individual thread on core 1~N */ +#define OSQ_UNLOCKED_VAL (0) + +/* + * maximum backoff sleep time in microseconds (default 0us, no sleep) + * linux kernel scheduling intrinsic delay is less than 7us, however + * we need to tune this parameter for different machines. + * http://www.brendangregg.com/blog/2017-03-16/perf-sched.html + */ +#define MAX_BACKOFF_SLEEP_US 0 + +/* + * Default unqueue retry times, most system spins at least 500~1000 times + * before unqueue from optimistic_spin_queue. Default large value simply + * disables unqueue path and make osq_lock more like mcs_queue_spinlock. 
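The 128-byte alignment on struct optimistic_spin_node is what keeps each per-CPU node in its own cache-line-sized slot of the heap pool. A self-contained check of that layout intent, using a local copy of the struct so the snippet compiles on its own:

#include <assert.h>
#include <stdalign.h>

struct osq_node_example {
    struct osq_node_example *next, *prev;
    int locked;
    int cpu;
    int random_sleep;
} __attribute__ ((aligned (128)));

/* one node occupies exactly one 128-byte slot and never shares it */
static_assert(sizeof(struct osq_node_example) == 128, "node fills its slot");
static_assert(alignof(struct osq_node_example) == 128, "node starts a slot");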
+ */ +#define DEFAULT_UNQUEUE_RETRY 2000000000 + +/* Init macro and function. */ +#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } + +/* Newly added global variables used by osq_lock algorithm */ +static long long unqueue_retry; +static long long max_sleep_us; +static struct optimistic_spin_queue global_osq; +static struct optimistic_spin_node *global_osq_nodepool_ptr; + +/* Newly added additional tuning parameters for optional backoff sleep */ +static void osq_parse_args(test_args unused, int argc, char** argv) { + int i = 0; + char *endptr; + unqueue_retry = DEFAULT_UNQUEUE_RETRY; + max_sleep_us = MAX_BACKOFF_SLEEP_US; + + /* extended options retrieved after '--' operator */ + while ((i = getopt(argc, argv, "u:s:")) != -1) + { + switch (i) { + case 'u': + errno = 0; + unqueue_retry = strtoll(optarg, &endptr, 10); + if ((errno == ERANGE && (unqueue_retry == LONG_LONG_MAX)) + || (errno != 0 && unqueue_retry == 0) || endptr == optarg) { + fprintf(stderr, "unqueue_retry: value unsuitable " + "for 'long long int'\n"); + exit(1); + } + break; + + case 's': + errno = 0; + max_sleep_us = strtoll(optarg, &endptr, 10); + if ((errno == ERANGE && (max_sleep_us == LONG_LONG_MAX)) + || (errno != 0 && max_sleep_us == 0) || endptr == optarg) { + fprintf(stderr, "max_sleep_us: value unsuitable " + "for 'long long int'\n"); + exit(1); + } else if (max_sleep_us < 0) { + fprintf(stderr, "max_sleep_us must be a positive integer.\n"); + exit(1); + } + break; + + default: + fprintf(stderr, + "osq_lock additional options after --:\n" + "\t[-h print this msg]\n" + "\t[-u max spin retries before unqueue, default 2 billions]\n" + "\t[-s max unqueue sleep in microseconds, default 0]\n"); + exit(2); + } + } +} + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static inline void osq_lock_init(uint64_t *lock, unsigned long cores) +{ + /* + * Allocate optimistic_spin_node from heap during main thread initialization. + * Each cpu core will have its own spinning node, aligned to 128 bytes maximum + * cache line, calloc will set memory to zero automatically, therefore no need + * to bzero the nodepool. + */ + global_osq_nodepool_ptr = calloc(cores + 1, sizeof(struct optimistic_spin_node)); + if (global_osq_nodepool_ptr == NULL) exit(errno); + + /* + * If osq spins more than unqueue_retry times, the spinning cpu may backoff + * and sleep for 1 ~ 10 microseconds (on average 5 microseconds). Each spinning + * thread uses a different backoff sleep time, and we can adjust the maximum + * sleep time by redefine MAX_BACKOFF_SLEEP_US or tuning via parameter '-s' + * By default, we disable this sleep (MAX_BACKOFF_SLEEP_US = 0) + * + * Note: Avoid assigning random_sleep a negative value, otherwise usleep would + * have a very large sleep time after implicit casting negative to uint32_t. 
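The per-node backoff described above is precomputed in osq_lock_init(): each node gets a fixed sleep in the range 1..max_sleep_us, so usleep() never receives a negative (and therefore huge unsigned) argument. A standalone sketch of that computation:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
    long long max_sleep_us = 10;           /* e.g. passed as "-s 10" */
    srand(time(NULL));
    for (int cpu = 0; cpu < 4; cpu++) {
        long long sleep_us = max_sleep_us > 0 ? rand() % max_sleep_us + 1 : 0;
        printf("cpu %d backoff sleep: %lld us\n", cpu, sleep_us);
    }
    return 0;
}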
+ */ + srand(time(0)); + for (int i = 0; i < cores; i++) { + if (max_sleep_us > 0) + (global_osq_nodepool_ptr + i)->random_sleep = rand() % max_sleep_us + 1; + } + + /* Initialize global osq tail indicater to OSQ_UNLOCKED_VAL (0: unlocked) */ + atomic_set(&global_osq.tail, OSQ_UNLOCKED_VAL); +} + +static inline bool osq_is_locked(struct optimistic_spin_queue *lock) +{ + return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL; +} + +/* + * Value 0 represents "no CPU" or "unlocked", thus the encoded value will be + * the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline int node_to_cpu(struct optimistic_spin_node *node) +{ + return node->cpu - 1; +} + +/* + * optimistic_spin_node for each cpu is stored linearly in main heap starting + * from global_osq_nodepool_ptr + */ +static inline struct optimistic_spin_node * cpu_to_node(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + return global_osq_nodepool_ptr + cpu_nr; +} + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_queue *lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev, + unsigned long cpu_number) +{ + struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(cpu_number); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; + + for (;;) { + + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + cpu_relax(); + } + + return next; +} + +/* uint64_t *osq is ignored because we use &global_osq instead */ +static bool osq_lock(uint64_t *osq, unsigned long cpu_number) +{ + /* each cpu core has only one thread spinning on one optimistic_spin_node */ + struct optimistic_spin_node *node = global_osq_nodepool_ptr + cpu_number; + /* optimistic_spin_queue stores the current osq tail globally */ + struct optimistic_spin_queue *lock = &global_osq; + struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(cpu_number); + int old; + long long back_off = 0; + + node->locked = 0; + node->next = NULL; + node->cpu = curr; + + /* + * We need both ACQUIRE (pairs with corresponding RELEASE in + * unlock() uncontended, or fastpath) and RELEASE (to publish + * the node fields we just initialised) semantics when updating + * the lock tail. 
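The tail encoding used above reserves 0 for "unlocked" and stores CPU n as n + 1, which also makes the node-pool index trivially recoverable. A standalone sketch with local copies of the two helpers:

#include <assert.h>

static int encode_cpu_example(int cpu_nr)  { return cpu_nr + 1; }
static int decode_cpu_example(int encoded) { return encoded - 1; }

int main(void)
{
    assert(encode_cpu_example(0) == 1);  /* CPU 0 never collides with "unlocked" (0) */
    assert(decode_cpu_example(encode_cpu_example(7)) == 7);
    return 0;
}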
+ */ + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) + return true; + + prev = cpu_to_node(old); + node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + + WRITE_ONCE(prev->next, node); + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!READ_ONCE(node->locked)) { + /* + * TODO: Need to better emulate kernel rescheduling in user space. + * Because we cannot use need_resched() in user space, we simply + * add a upper limit named unqueue_retry to mimic need_resched(). + * If this limit has been exceeded by back_off times, we will jump + * to unqueue path and remove the spinning node from global osq. + */ + /* + * If we need to reschedule bail... so we can block. + * Use vcpu_is_preempted() to avoid waiting for a preempted + * lock holder. + */ + //if (need_resched() || vcpu_is_preempted(node_to_cpu(node->prev))) + if (++back_off > unqueue_retry) /* DEFAULT_UNQUEUE_RETRY 2 billions */ + goto unqueue; + + cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = READ_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev, cpu_number); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + WRITE_ONCE(next->prev, prev); + WRITE_ONCE(prev->next, next); + + return false; +} + +/* uint64_t *osq is ignored because we use &global_osq instead */ +static void osq_unlock(uint64_t *osq, unsigned long cpu_number) +{ + /* optimistic_spin_queue stores the current osq tail globally */ + struct optimistic_spin_queue *lock = &global_osq; + struct optimistic_spin_node *node, *next; + int curr = encode_cpu(cpu_number); + + /* + * Fast path for the uncontended case. + */ + if (atomic_cmpxchg_release(&lock->tail, curr, + OSQ_UNLOCKED_VAL) == curr) + return; + + /* + * Second most likely case. + * If there is a next node, notify it. + */ + node = global_osq_nodepool_ptr + cpu_number; + next = xchg(&node->next, NULL); + if (next) { + WRITE_ONCE(next->locked, 1); + return; + } + + /* + * Wait for another stable next, or get NULL if the queue is empty. 
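The handoff in osq_unlock() works by the owner writing 1 into the waiter's own node->locked, while the waiter spins only on that field. A minimal standalone illustration of that pattern with two threads, using C11 atomics as a stand-in for WRITE_ONCE()/READ_ONCE() and the implied ordering:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int waiter_locked;            /* plays the role of node->locked */

static void *waiter(void *arg)
{
    (void)arg;
    while (!atomic_load_explicit(&waiter_locked, memory_order_acquire))
        ;                                    /* cpu_relax() in the real code */
    puts("waiter: lock handed off");
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, waiter, NULL);
    /* owner releases: the equivalent of WRITE_ONCE(next->locked, 1) */
    atomic_store_explicit(&waiter_locked, 1, memory_order_release);
    pthread_join(t, NULL);
    return 0;
}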
+ */ + next = osq_wait_next(lock, node, NULL, cpu_number); + if (next) + WRITE_ONCE(next->locked, 1); +} + + +/* standard lockhammer lock_acquire and lock_release interfaces */ +static unsigned long __attribute__((noinline)) +lock_acquire (uint64_t *lock, unsigned long threadnum) +{ + /* + * Note: The linux kernel implements additional mutex slow path in mutex.c + * __mutex_lock_common() function. We will create another workload which + * combines osq_lock and mutex_lock_common. This workload only benchmarks + * osq_lock itself. The osq_lock is different from mcs_queue_spinlock + * because of tunable unqueue path and backoff sleep time. + */ + while (!osq_lock(lock, threadnum)) { + /* + * If still cannot acquire the lock after spinning for unqueue_retry + * times, try to backoff and sleep for random microseconds specified + * by parameter '-s', by default the maximum sleep time is 0us. Then + * reacquire the lock again infinitely until success. + * + * This behaves similar to kernel mutex with fine tuning sleep time. + */ + usleep((global_osq_nodepool_ptr + threadnum)->random_sleep); + } + return 1; +} + + +static inline void lock_release (uint64_t *lock, unsigned long threadnum) +{ + osq_unlock(lock, threadnum); +} + +#endif /* __LINUX_OSQ_LOCK_H */ diff --git a/ext/tbb/tbb_spin_rw_mutex.h b/ext/tbb/tbb_spin_rw_mutex.h index 71ae06b..cfdeeef 100644 --- a/ext/tbb/tbb_spin_rw_mutex.h +++ b/ext/tbb/tbb_spin_rw_mutex.h @@ -123,6 +123,14 @@ #ifndef __TBB_spin_mutex_H #define __TBB_spin_mutex_H +#ifdef initialize_lock +#undef initialize_lock +#endif + +#ifdef parse_test_args +#undef parse_test_args +#endif + #define initialize_lock(lock, threads) tbb_init_locks(lock, threads) #define parse_test_args(args, argc, argv) tbb_parse_args(args, argc, argv)
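For reference, the pattern that osq_lock.h and tbb_spin_rw_mutex.h both follow is: undefine the default initialize_lock/parse_test_args hooks, point them at workload-local functions, and provide static lock_acquire()/lock_release() entry points that lockhammer.c calls. A rough, hypothetical sketch of a new workload header built on that inferred pattern (my_lock.h, my_lock_init(), and the test-and-set spin are not part of this patch):

#include <stdint.h>

#ifdef initialize_lock
#undef initialize_lock
#endif
#define initialize_lock(lock, threads) my_lock_init(lock, threads)

static inline void my_lock_init(uint64_t *lock, unsigned long threads)
{
    (void)threads;
    *lock = 0;                               /* 0 == unlocked */
}

static unsigned long __attribute__((noinline))
lock_acquire(uint64_t *lock, unsigned long threadnum)
{
    (void)threadnum;
    while (__atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE))
        ;                                    /* simple test-and-set spin */
    return 1;
}

static inline void lock_release(uint64_t *lock, unsigned long threadnum)
{
    (void)threadnum;
    __atomic_store_n(lock, 0, __ATOMIC_RELEASE);
}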