diff --git a/benchmarks/lockhammer/Makefile b/benchmarks/lockhammer/Makefile index ad1e730..600c8c4 100644 --- a/benchmarks/lockhammer/Makefile +++ b/benchmarks/lockhammer/Makefile @@ -31,7 +31,8 @@ TEST_TARGETS=lh_swap_mutex \ lh_queued_spinlock \ lh_empty \ lh_jvm_objectmonitor \ - lh_tbb_spin_rw_mutex + lh_tbb_spin_rw_mutex \ + lh_osq_lock ifeq ($(TARGET_ARCH),aarch64) TEST_TARGETS+=lh_hybrid_spinlock \ @@ -55,6 +56,9 @@ lh_hybrid_spinlock: ../../ext/linux/hybrid_spinlock.h include/atomics.h ../../ex lh_hybrid_spinlock_fastdequeue: ../../ext/linux/hybrid_spinlock_fastdequeue.h include/atomics.h ../../ext/linux/include/lk_atomics.h src/lockhammer.c ${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS} +lh_osq_lock: ../../ext/linux/osq_lock.h ../../ext/linux/include/lk_atomics.h ../../ext/linux/include/lk_barrier.h ../../ext/linux/include/lk_cmpxchg.h include/atomics.h src/lockhammer.c + ${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS} + lh_queued_spinlock: ../../ext/linux/queued_spinlock.h include/atomics.h ../../ext/linux/include/lk_atomics.h src/lockhammer.c ${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS} diff --git a/ext/linux/include/lk_atomics.h b/ext/linux/include/lk_atomics.h index 95b0cbb..86e9b06 100644 --- a/ext/linux/include/lk_atomics.h +++ b/ext/linux/include/lk_atomics.h @@ -392,3 +392,7 @@ do { \ #define arch_mcs_spin_unlock_contended(l) \ smp_store_release((l), 1) + +#define ATOMIC_INIT(i) { (i) } +#define atomic_read(v) READ_ONCE((v)->counter) +#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) diff --git a/ext/linux/include/lk_barrier.h b/ext/linux/include/lk_barrier.h new file mode 100644 index 0000000..057ac57 --- /dev/null +++ b/ext/linux/include/lk_barrier.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Based on Linux kernel 4.16.10 + * arch/arm64/include/asm/barrier.h + * arch/x86/include/asm/barrier.h + * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115 + */ + +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +#include "lk_cmpxchg.h" + +#if defined(__x86_64__) + +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#define dma_rmb() barrier() +#define dma_wmb() barrier() +#define smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc") +#define smp_rmb() dma_rmb() +#define smp_wmb() barrier() +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) + + +/* Atomic operations are already serializing on x86 */ +#define __smp_mb__before_atomic() barrier() +#define __smp_mb__after_atomic() barrier() + + +#elif defined(__aarch64__) + +#define isb() asm volatile("isb" : : : "memory") +#define dmb(opt) asm volatile("dmb " #opt : : : "memory") +#define dsb(opt) asm volatile("dsb " #opt : : : "memory") +#define psb_csync() asm volatile("hint #17" : : : "memory") +#define csdb() asm volatile("hint #20" : : : "memory") +#define mb() dsb(sy) +#define rmb() dsb(ld) +#define wmb() dsb(st) +#define dma_rmb() dmb(oshld) +#define dma_wmb() dmb(oshst) +#define smp_mb() dmb(ish) +#define smp_rmb() dmb(ishld) +#define smp_wmb() dmb(ishst) + +#else /* No Arch */ + /* TODO: No Arch Default */ +#endif /* __x86_64__ */ + +#endif /* __ASM_BARRIER_H */ diff --git a/ext/linux/include/lk_cmpxchg.h b/ext/linux/include/lk_cmpxchg.h new file mode 100644 index 0000000..85178b8 --- 
/dev/null +++ b/ext/linux/include/lk_cmpxchg.h @@ -0,0 +1,463 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Based on Linux kernel 4.16.10 + * arch/arm64/include/asm/cmpxchg.h + * arch/x86/include/asm/cmpxchg.h + * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115 + */ + +#ifndef __ASM_CMPXCHG_H +#define __ASM_CMPXCHG_H + +#if defined(__x86_64__) +#define LOCK_PREFIX_HERE \ + ".pushsection .smp_locks,\"a\"\n" \ + ".balign 4\n" \ + ".long 671f - .\n" /* offset */ \ + ".popsection\n" \ + "671:" + +#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " + +/* + * Constants for operation sizes. On 32-bit, the 64-bit size it set to + * -1 because sizeof will never return -1, thereby making those switch + * case statements guaranteeed dead code which the compiler will + * eliminate, and allowing the "missing symbol in the default case" to + * indicate a usage error. + */ +#define __X86_CASE_B 1 +#define __X86_CASE_W 2 +#define __X86_CASE_L 4 +#define __X86_CASE_Q 8 + +/* + * An exchange-type operation, which takes a value and a pointer, and + * returns the old value. + */ +#define __xchg_op(ptr, arg, op, lock) \ + ({ \ + __typeof__ (*(ptr)) __ret = (arg); \ + switch (sizeof(*(ptr))) { \ + case __X86_CASE_B: \ + asm volatile (lock #op "b %b0, %1\n" \ + : "+q" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_W: \ + asm volatile (lock #op "w %w0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_L: \ + asm volatile (lock #op "l %0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_Q: \ + asm volatile (lock #op "q %q0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + } \ + __ret; \ + }) + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. + */ +#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") + + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. 
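The x86 xchg() defined above always returns the previous memory contents and is fully serializing even without an explicit lock prefix. A small self-contained sketch of that contract, using a GCC builtin purely as a stand-in for the macro (not the macro itself):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t flag = 0;
    /* atomically store 1 and get back whatever was there before */
    uint64_t old = __atomic_exchange_n(&flag, 1, __ATOMIC_SEQ_CST);
    assert(old == 0 && flag == 1);
    return 0;
}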
+ */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "q" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + } \ + __ret; \ +}) + +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) + +#define __sync_cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + +#define __cmpxchg_local(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "") + +#define cmpxchg(ptr, old, new) \ + __cmpxchg(ptr, old, new, sizeof(*(ptr))) + +#define sync_cmpxchg(ptr, old, new) \ + __sync_cmpxchg(ptr, old, new, sizeof(*(ptr))) + +#define cmpxchg_local(ptr, old, new) \ + __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) + +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} + +#define atomic_cmpxchg_relaxed atomic_cmpxchg +#define atomic_cmpxchg_acquire atomic_cmpxchg +#define atomic_cmpxchg_release atomic_cmpxchg +#define atomic_xchg_relaxed atomic_xchg +#define atomic_xchg_acquire atomic_xchg +#define atomic_xchg_release atomic_xchg + + +#elif defined(__aarch64__) + +#define unreachable() \ + do { \ + asm volatile(""); \ + __builtin_unreachable(); \ + } while (0) + +#define notrace __attribute__((no_instrument_function)) + +#define __nops(n) ".rept " #n "\nnop\n.endr\n" +#define nops(n) asm volatile(__nops(n)) + +/* Move the ll/sc atomics out-of-line */ +#define __LL_SC_INLINE notrace +#define __LL_SC_PREFIX(x) __ll_sc_##x + +#define __CMPXCHG_CASE(w, sz, name, mb, acq, rel, cl) \ +__LL_SC_INLINE unsigned long \ +__LL_SC_PREFIX(__cmpxchg_case_##name(volatile void *ptr, \ + unsigned long old, \ + unsigned long new)) \ +{ \ + unsigned long tmp, oldval; \ + \ + asm volatile( \ + /* LL/SC */ \ + " prfm pstl1strm, %[v]\n" \ + "1: ld" #acq "xr" #sz "\t%" #w "[oldval], %[v]\n" \ + " eor %" #w "[tmp], %" #w "[oldval], %" #w "[old]\n" \ + " cbnz %" #w "[tmp], 2f\n" \ + " st" #rel "xr" #sz "\t%w[tmp], %" #w "[new], %[v]\n" \ + " cbnz %w[tmp], 1b\n" \ + " " #mb "\n" \ + "2:" \ + : [tmp] "=&r" (tmp), [oldval] "=&r" (oldval), \ + [v] "+Q" (*(unsigned long *)ptr) \ + : [old] "Lr" (old), [new] "r" (new) \ + : cl); \ + \ + return oldval; \ +} \ + +__CMPXCHG_CASE(w, b, 1, , , , ) +__CMPXCHG_CASE(w, h, 2, , , , ) +__CMPXCHG_CASE(w, , 4, , , , ) +__CMPXCHG_CASE( , , 8, , , , ) +__CMPXCHG_CASE(w, b, acq_1, , a, , "memory") +__CMPXCHG_CASE(w, h, acq_2, , a, , "memory") +__CMPXCHG_CASE(w, , acq_4, , a, , "memory") +__CMPXCHG_CASE( , , acq_8, , a, , "memory") 
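Each __CMPXCHG_CASE() instantiation above expands to a helper such as __ll_sc___cmpxchg_case_acq_4() that returns the value previously held in memory; the caller detects success by comparing that return value with the expected old value. A standalone sketch of the same contract, with a GCC builtin standing in for the generated assembly:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* returns the prior memory contents, exactly like the cmpxchg helpers */
static uint32_t cas_like(uint32_t *p, uint32_t old, uint32_t new)
{
    uint32_t expected = old;
    __atomic_compare_exchange_n(p, &expected, new, 0,
                                __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
    return expected;
}

int main(void)
{
    uint32_t v = 5;
    printf("%" PRIu32 "\n", cas_like(&v, 5, 7));  /* prints 5, v becomes 7 */
    printf("%" PRIu32 "\n", cas_like(&v, 5, 9));  /* prints 7, v unchanged */
    return 0;
}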
+__CMPXCHG_CASE(w, b, rel_1, , , l, "memory") +__CMPXCHG_CASE(w, h, rel_2, , , l, "memory") +__CMPXCHG_CASE(w, , rel_4, , , l, "memory") +__CMPXCHG_CASE( , , rel_8, , , l, "memory") +__CMPXCHG_CASE(w, b, mb_1, dmb ish, , l, "memory") +__CMPXCHG_CASE(w, h, mb_2, dmb ish, , l, "memory") +__CMPXCHG_CASE(w, , mb_4, dmb ish, , l, "memory") +__CMPXCHG_CASE( , , mb_8, dmb ish, , l, "memory") + +#undef __CMPXCHG_CASE + +#define __LSE_CMPXCHG_CASE(w, sz, name, mb, cl...) \ +static inline unsigned long __cmpxchg_case_##name(volatile void *ptr, \ + unsigned long old, \ + unsigned long new) \ +{ \ + register unsigned long x0 asm ("x0") = (unsigned long)ptr; \ + register unsigned long x1 asm ("x1") = old; \ + register unsigned long x2 asm ("x2") = new; \ + \ + asm volatile( \ + /* LSE atomics */ \ + " mov " #w "30, %" #w "[old]\n" \ + " cas" #mb #sz "\t" #w "30, %" #w "[new], %[v]\n" \ + " mov %" #w "[ret], " #w "30" \ + : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr) \ + : [old] "r" (x1), [new] "r" (x2) \ + : cl); \ + \ + return x0; \ +} + +__LSE_CMPXCHG_CASE(w, b, 1, ) +__LSE_CMPXCHG_CASE(w, h, 2, ) +__LSE_CMPXCHG_CASE(w, , 4, ) +__LSE_CMPXCHG_CASE(x, , 8, ) +__LSE_CMPXCHG_CASE(w, b, acq_1, a, "memory") +__LSE_CMPXCHG_CASE(w, h, acq_2, a, "memory") +__LSE_CMPXCHG_CASE(w, , acq_4, a, "memory") +__LSE_CMPXCHG_CASE(x, , acq_8, a, "memory") +__LSE_CMPXCHG_CASE(w, b, rel_1, l, "memory") +__LSE_CMPXCHG_CASE(w, h, rel_2, l, "memory") +__LSE_CMPXCHG_CASE(w, , rel_4, l, "memory") +__LSE_CMPXCHG_CASE(x, , rel_8, l, "memory") +__LSE_CMPXCHG_CASE(w, b, mb_1, al, "memory") +__LSE_CMPXCHG_CASE(w, h, mb_2, al, "memory") +__LSE_CMPXCHG_CASE(w, , mb_4, al, "memory") +__LSE_CMPXCHG_CASE(x, , mb_8, al, "memory") + +#undef __LSE_CMPXCHG_CASE + +/* + * aarch64 cmpxchg implementation has been modified to disable runtime + * binary patching and use LL/SC assemblies directly without hard branch + * and link inside LSE CMPXCHG_GEN. This resolved a bug which was related + * to missing CFLAGS_atomic_ll_sc.o in user space. The special CFLAGS in + * arch/arm64/lib/Makefile tells the compiler to treat all general + * purpose registers (with the exception of the IP registers, which are + * already handled by the caller in case of a PLT) as callee-saved, which + * allows for efficient runtime patching of the bl instruction in the + * caller with an atomic instruction when supported by the CPU. Result + * and argument registers are handled correctly, based on the function + * prototype. 
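With the runtime binary patching removed, the LL/SC or LSE variant is chosen once at build time. A hedged sketch of what a direct caller would resolve to under each configuration (pick_cmpxchg() is hypothetical; the two helper names come from the macros defined above and assume this header is included on aarch64):

static inline unsigned long pick_cmpxchg(volatile void *p,
                                         unsigned long old, unsigned long new)
{
#if defined(USE_LSE)
    return __cmpxchg_case_mb_8(p, old, new);          /* CASAL-based helper */
#else
    return __ll_sc___cmpxchg_case_mb_8(p, old, new);  /* LDXR/STLXR loop plus dmb ish */
#endif
}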
(__LL_SC_CLOBBERS and original __CMPXCHG_CASE) + */ + +#if defined(USE_LSE) /* ARMv8.1 with LSE */ +#define __CMPXCHG_GEN(sfx) \ +static inline unsigned long __cmpxchg##sfx(volatile void *ptr, \ + unsigned long old, \ + unsigned long new, \ + int size) \ +{ \ + switch (size) { \ + case 1: \ + return __cmpxchg_case##sfx##_1(ptr, (u8)old, new); \ + case 2: \ + return __cmpxchg_case##sfx##_2(ptr, (u16)old, new); \ + case 4: \ + return __cmpxchg_case##sfx##_4(ptr, old, new); \ + case 8: \ + return __cmpxchg_case##sfx##_8(ptr, old, new); \ + } \ + \ + unreachable(); \ +} +#else /* ARMv8.0 without LSE */ +#define __CMPXCHG_GEN(sfx) \ +static inline unsigned long __cmpxchg##sfx(volatile void *ptr, \ + unsigned long old, \ + unsigned long new, \ + int size) \ +{ \ + switch (size) { \ + case 1: \ + return __ll_sc___cmpxchg_case##sfx##_1(ptr, (u8)old, new); \ + case 2: \ + return __ll_sc___cmpxchg_case##sfx##_2(ptr, (u16)old, new); \ + case 4: \ + return __ll_sc___cmpxchg_case##sfx##_4(ptr, old, new); \ + case 8: \ + return __ll_sc___cmpxchg_case##sfx##_8(ptr, old, new); \ + } \ + \ + unreachable(); \ +} +#endif /* ARMv8 with or without LSE */ + +__CMPXCHG_GEN() +__CMPXCHG_GEN(_acq) +__CMPXCHG_GEN(_rel) +__CMPXCHG_GEN(_mb) + +#undef __CMPXCHG_GEN + +#define __cmpxchg_wrapper(sfx, ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __ret = (__typeof__(*(ptr))) \ + __cmpxchg##sfx((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + __ret; \ +}) + +/* cmpxchg */ +#define cmpxchg_relaxed(...) __cmpxchg_wrapper( , __VA_ARGS__) +#define cmpxchg_acquire(...) __cmpxchg_wrapper(_acq, __VA_ARGS__) +#define cmpxchg_release(...) __cmpxchg_wrapper(_rel, __VA_ARGS__) +#define cmpxchg(...) __cmpxchg_wrapper( _mb, __VA_ARGS__) +#define cmpxchg_local cmpxchg_relaxed + + +/* + * Original ARM64_LSE_ATOMIC_INSN is defined as ALTERNATIVE and would + * check runtime CPU capability and dynamically patch kernel binary. + * New ARM64_LSE_ATOMIC_INSN has been modified to use the first or second + * argument as output string depending on external USE_LSE define. + */ + +#if defined(USE_LSE) /* ARMv8.1 with LSE */ +#define ARM64_LSE_ATOMIC_INSN(llsc, lse) lse +__asm__(".arch_extension lse"); +#else /* ARMv8.0 without LSE */ +#define ARM64_LSE_ATOMIC_INSN(llsc, lse) llsc +#endif + +/* + * We need separate acquire parameters for ll/sc and lse, since the full + * barrier case is generated as release+dmb for the former and + * acquire+release for the latter. 
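The cmpxchg_relaxed/acquire/release/cmpxchg() wrappers defined above preserve the pointee's type, so they can be applied directly to plain scalar fields. A hedged usage sketch (claim_slot() is hypothetical and assumes the aarch64 path of this header is included):

/* returns 1 if we moved the slot from 0 (free) to our identifier */
static inline int claim_slot(int *slot, int me)
{
    return cmpxchg_acquire(slot, 0, me) == 0;
}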
+ */ +#define __XCHG_CASE(w, sz, name, mb, nop_lse, acq, acq_lse, rel, cl) \ +static inline unsigned long __xchg_case_##name(unsigned long x, \ + volatile void *ptr) \ +{ \ + unsigned long ret, tmp; \ + \ + asm volatile(ARM64_LSE_ATOMIC_INSN( \ + /* LL/SC */ \ + " prfm pstl1strm, %2\n" \ + "1: ld" #acq "xr" #sz "\t%" #w "0, %2\n" \ + " st" #rel "xr" #sz "\t%w1, %" #w "3, %2\n" \ + " cbnz %w1, 1b\n" \ + " " #mb, \ + /* LSE atomics */ \ + " swp" #acq_lse #rel #sz "\t%" #w "3, %" #w "0, %2\n" \ + __nops(3) \ + " " #nop_lse) \ + : "=&r" (ret), "=&r" (tmp), "+Q" (*(unsigned long *)ptr) \ + : "r" (x) \ + : cl); \ + \ + return ret; \ +} + +__XCHG_CASE(w, b, 1, , , , , , ) +__XCHG_CASE(w, h, 2, , , , , , ) +__XCHG_CASE(w, , 4, , , , , , ) +__XCHG_CASE( , , 8, , , , , , ) +__XCHG_CASE(w, b, acq_1, , , a, a, , "memory") +__XCHG_CASE(w, h, acq_2, , , a, a, , "memory") +__XCHG_CASE(w, , acq_4, , , a, a, , "memory") +__XCHG_CASE( , , acq_8, , , a, a, , "memory") +__XCHG_CASE(w, b, rel_1, , , , , l, "memory") +__XCHG_CASE(w, h, rel_2, , , , , l, "memory") +__XCHG_CASE(w, , rel_4, , , , , l, "memory") +__XCHG_CASE( , , rel_8, , , , , l, "memory") +__XCHG_CASE(w, b, mb_1, dmb ish, nop, , a, l, "memory") +__XCHG_CASE(w, h, mb_2, dmb ish, nop, , a, l, "memory") +__XCHG_CASE(w, , mb_4, dmb ish, nop, , a, l, "memory") +__XCHG_CASE( , , mb_8, dmb ish, nop, , a, l, "memory") + +#undef __XCHG_CASE + +#define __XCHG_GEN(sfx) \ +static inline unsigned long __xchg##sfx(unsigned long x, \ + volatile void *ptr, \ + int size) \ +{ \ + switch (size) { \ + case 1: \ + return __xchg_case##sfx##_1(x, ptr); \ + case 2: \ + return __xchg_case##sfx##_2(x, ptr); \ + case 4: \ + return __xchg_case##sfx##_4(x, ptr); \ + case 8: \ + return __xchg_case##sfx##_8(x, ptr); \ + } \ + \ + unreachable(); \ +} + +__XCHG_GEN() +__XCHG_GEN(_acq) +__XCHG_GEN(_rel) +__XCHG_GEN(_mb) + +#undef __XCHG_GEN + +#define __xchg_wrapper(sfx, ptr, x) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __ret = (__typeof__(*(ptr))) \ + __xchg##sfx((unsigned long)(x), (ptr), sizeof(*(ptr))); \ + __ret; \ +}) + +/* xchg */ +#define xchg_relaxed(...) __xchg_wrapper( , __VA_ARGS__) +#define xchg_acquire(...) __xchg_wrapper(_acq, __VA_ARGS__) +#define xchg_release(...) __xchg_wrapper(_rel, __VA_ARGS__) +#define xchg(...) 
__xchg_wrapper( _mb, __VA_ARGS__) + +#define atomic_cmpxchg_relaxed(v, old, new) \ + cmpxchg_relaxed(&((v)->counter), (old), (new)) +#define atomic_cmpxchg_acquire(v, old, new) \ + cmpxchg_acquire(&((v)->counter), (old), (new)) +#define atomic_cmpxchg_release(v, old, new) \ + cmpxchg_release(&((v)->counter), (old), (new)) +#define atomic_cmpxchg(v, old, new) cmpxchg(&((v)->counter), (old), (new)) + +#define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new)) +#define atomic_xchg_acquire(v, new) xchg_acquire(&((v)->counter), (new)) +#define atomic_xchg_release(v, new) xchg_release(&((v)->counter), (new)) +#define atomic_xchg(v, new) xchg(&((v)->counter), (new)) + +#else /* Unknown Arch */ + /* TODO: No Arch Default */ +#endif /* __x86_64__ */ + +#endif /* __ASM_CMPXCHG_H */ diff --git a/ext/linux/osq_lock.h b/ext/linux/osq_lock.h new file mode 100644 index 0000000..d576a20 --- /dev/null +++ b/ext/linux/osq_lock.h @@ -0,0 +1,526 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Based on Linux kernel 4.16.10 + * https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git + * /commit/?h=v4.16.10&id=b3fdf8284efbc5020dfbd0a28150637189076115 + * + * Description: + * + * This workload implements kernel 'optimistic spin queue' derived from mcs + * lock. Tunable unqueue_retry times and max_backoff_sleep duration have + * also been added to simulate need_resched() condition and unqueue current + * cpu node from spinning queue and put to sleep. + * + * Changes from Linux kernel osq_lock.c + * + * The original DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, + * osq_node) was modified to 128 byte aligned optimistic_spin_node C array + * allocated in heap during osq_lock_init() in main thread. It was pointed + * by global_osq_nodepool_ptr pointer. The osq lock queue struct itself was + * declared as a global variable too, which would substitute upper level + * mutex lock struct indicated by lock pointer. Therefore we don't need to + * get the lock pointer from lock_acquire() and lock_release() interface. + * The spinning node structure can be linearly located by osq_nodepool_ptr + * with threadnum/coreid as offset. The tail of osq_lock can be accessed + * by global_osq directly. + * + * We haven't changed the algorithm except adding unqueue_retry and max_ + * sleep_us as optional backoff sleep to mimic kernel rescheduling events. + * By default we essentially disable unqueue_retry and backoff sleep so + * that osq_lock performance is more stable and similar to mcs queue spin + * lock. + * + * Internals: + * + * In order to port osq_lock from kernel space to user space, we added + * lk_barrier.h and lk_cmpxchg.h to synchronization-benchmarks/ext/linux/ + * include. Because there are some special gcc options to restrict compiler + * from allocating x16/x17 registers in arch/arm64/lib/Makefile for + * atomic_ll_sc.o, and our osq_lock.h included from lockhammer.c will not + * generate any other separate object file, we have to modify cmpxchg.h + * and change cmpxchg LLSC/LSE implementation for aarch64. + * + * Kernel arm64 cmpxchg.h supports both LLSC (load-link/store-conditional) + * and LSE (Armv8.1 large system extension) via dynamic binary patching. + * If CONFIG_AS_LSE and CONFIG_ARM64_LSE_ATOMICS have been enabled, kernel + * will use Armv8.1 new atomic instructions CAS to implement the compare + * and swap function. This inline function has 3 instructions mov/cas/mov, + * which will be overwritten during system boot up if the CPU doesn't + * support Armv8.1 LSE. 
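With the atomic_cmpxchg_*()/atomic_xchg_*() wrappers now in place, an atomic_t tail can be manipulated the same way the osq code further below does it. A hedged sketch (try_become_tail() is hypothetical; atomic_t with an int counter field is assumed to come from lk_atomics.h):

/* succeed only if the queue was empty, i.e. the tail still held 0 */
static inline int try_become_tail(atomic_t *tail, int me)
{
    return atomic_cmpxchg_acquire(tail, 0, me) == 0;
}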
The 3 new instructions are bl/nop/nop. The branch + * and link instruction will redirect program flow to Armv8.0 LLSC function + * without saving any of the caller's local registers. These registers are + * guaranteed to be safe because LLSC function in atomic_ll_sc.o only uses + * x16/x17 and LSE caller doesn't use x16/x17. + * + * Since lockhammer doesn't have runtime cpu detection, whether to use LLSC + * or LSE is manually defined in lockhammer Makefile. Therefore our new + * cmpxchg is also statically defined without branch and link or binary + * patching. LLSC and LSE cmpxchg will share the same interface but use + * different assembly codes and functions. + * + * Workings: + * + * osq_lock works similar to mcs spinlock except the optional unqueue path. + * Linux kernel qspinlock is slightly different than original mcs spinlock. + * + * Tuning Parameters + * + * Optional unqueue and backoff sleep feature like kernel mutex + * + * [-- [-u unqueue_retry]]: how many spin retries before jumping to unqueue + * path and stop spinning. + * + * [-- [-s max_sleep_us]]: how long to sleep after unqueue from osq before + * another osq_lock() acquisition attempt. This + * parameter only defines the maximum sleep time in + * microseconds, each thread will sleep for random + * time less than this max_sleep_us. The actual + * sleep time is predetermined during main thread + * initialization phase with uniform distribution + * random function rand(). + * + */ + +#ifndef __LINUX_OSQ_LOCK_H +#define __LINUX_OSQ_LOCK_H + +/* redefine initialize_lock and parse_test_args with local functions */ +#ifdef initialize_lock +#undef initialize_lock +#endif + +#ifdef parse_test_args +#undef parse_test_args +#endif + +#define initialize_lock(lock, threads) osq_lock_init(lock, threads) +#define parse_test_args(args, argc, argv) osq_parse_args(args, argc, argv) + +#include +#include "atomics.h" +#include "lk_atomics.h" +#include "lk_cmpxchg.h" +#include "lk_barrier.h" + +#define ATOMIC_INIT(i) { (i) } + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + * + * Using 128 bytes alignment to eliminate false sharing for various Armv8 core + * cache line size + */ +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ + int random_sleep; /* random sleep in us */ +} __attribute__ ((aligned (128))); + +struct optimistic_spin_queue { + /* + * Stores an encoded value of the CPU # of the tail node in the queue. + * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL. + */ + atomic_t tail; +}; + +/* 0 means thread unlocked, 1~N represents each individual thread on core 1~N */ +#define OSQ_UNLOCKED_VAL (0) + +/* + * maximum backoff sleep time in microseconds (default 0us, no sleep) + * linux kernel scheduling intrinsic delay is less than 7us, however + * we need to tune this parameter for different machines. + * http://www.brendangregg.com/blog/2017-03-16/perf-sched.html + */ +#define MAX_BACKOFF_SLEEP_US 0 + +/* + * Default unqueue retry times, most system spins at least 500~1000 times + * before unqueue from optimistic_spin_queue. Default large value simply + * disables unqueue path and make osq_lock more like mcs_queue_spinlock. 
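The 128-byte alignment on struct optimistic_spin_node is what keeps each per-CPU node in its own cache-line-sized slot of the heap pool. A self-contained check of that layout intent, using a local copy of the struct so the snippet compiles on its own:

#include <assert.h>
#include <stdalign.h>

struct osq_node_example {
    struct osq_node_example *next, *prev;
    int locked;
    int cpu;
    int random_sleep;
} __attribute__ ((aligned (128)));

/* one node occupies exactly one 128-byte slot and never shares it */
static_assert(sizeof(struct osq_node_example) == 128, "node fills its slot");
static_assert(alignof(struct osq_node_example) == 128, "node starts a slot");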
+ */ +#define DEFAULT_UNQUEUE_RETRY 2000000000 + +/* Init macro and function. */ +#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } + +/* Newly added global variables used by osq_lock algorithm */ +static long long unqueue_retry; +static long long max_sleep_us; +static struct optimistic_spin_queue global_osq; +static struct optimistic_spin_node *global_osq_nodepool_ptr; + +/* Newly added additional tuning parameters for optional backoff sleep */ +static void osq_parse_args(test_args unused, int argc, char** argv) { + int i = 0; + char *endptr; + unqueue_retry = DEFAULT_UNQUEUE_RETRY; + max_sleep_us = MAX_BACKOFF_SLEEP_US; + + /* extended options retrieved after '--' operator */ + while ((i = getopt(argc, argv, "u:s:")) != -1) + { + switch (i) { + case 'u': + errno = 0; + unqueue_retry = strtoll(optarg, &endptr, 10); + if ((errno == ERANGE && (unqueue_retry == LONG_LONG_MAX)) + || (errno != 0 && unqueue_retry == 0) || endptr == optarg) { + fprintf(stderr, "unqueue_retry: value unsuitable " + "for 'long long int'\n"); + exit(1); + } + break; + + case 's': + errno = 0; + max_sleep_us = strtoll(optarg, &endptr, 10); + if ((errno == ERANGE && (max_sleep_us == LONG_LONG_MAX)) + || (errno != 0 && max_sleep_us == 0) || endptr == optarg) { + fprintf(stderr, "max_sleep_us: value unsuitable " + "for 'long long int'\n"); + exit(1); + } else if (max_sleep_us < 0) { + fprintf(stderr, "max_sleep_us must be a positive integer.\n"); + exit(1); + } + break; + + default: + fprintf(stderr, + "osq_lock additional options after --:\n" + "\t[-h print this msg]\n" + "\t[-u max spin retries before unqueue, default 2 billions]\n" + "\t[-s max unqueue sleep in microseconds, default 0]\n"); + exit(2); + } + } +} + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static inline void osq_lock_init(uint64_t *lock, unsigned long cores) +{ + /* + * Allocate optimistic_spin_node from heap during main thread initialization. + * Each cpu core will have its own spinning node, aligned to 128 bytes maximum + * cache line, calloc will set memory to zero automatically, therefore no need + * to bzero the nodepool. + */ + global_osq_nodepool_ptr = calloc(cores + 1, sizeof(struct optimistic_spin_node)); + if (global_osq_nodepool_ptr == NULL) exit(errno); + + /* + * If osq spins more than unqueue_retry times, the spinning cpu may backoff + * and sleep for 1 ~ 10 microseconds (on average 5 microseconds). Each spinning + * thread uses a different backoff sleep time, and we can adjust the maximum + * sleep time by redefine MAX_BACKOFF_SLEEP_US or tuning via parameter '-s' + * By default, we disable this sleep (MAX_BACKOFF_SLEEP_US = 0) + * + * Note: Avoid assigning random_sleep a negative value, otherwise usleep would + * have a very large sleep time after implicit casting negative to uint32_t. 
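The per-node backoff described above is precomputed in osq_lock_init(): each node gets a fixed sleep in the range 1..max_sleep_us, so usleep() never receives a negative (and therefore huge unsigned) argument. A standalone sketch of that computation:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
    long long max_sleep_us = 10;           /* e.g. passed as "-s 10" */
    srand(time(NULL));
    for (int cpu = 0; cpu < 4; cpu++) {
        long long sleep_us = max_sleep_us > 0 ? rand() % max_sleep_us + 1 : 0;
        printf("cpu %d backoff sleep: %lld us\n", cpu, sleep_us);
    }
    return 0;
}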
+ */ + srand(time(0)); + for (int i = 0; i < cores; i++) { + if (max_sleep_us > 0) + (global_osq_nodepool_ptr + i)->random_sleep = rand() % max_sleep_us + 1; + } + + /* Initialize global osq tail indicater to OSQ_UNLOCKED_VAL (0: unlocked) */ + atomic_set(&global_osq.tail, OSQ_UNLOCKED_VAL); +} + +static inline bool osq_is_locked(struct optimistic_spin_queue *lock) +{ + return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL; +} + +/* + * Value 0 represents "no CPU" or "unlocked", thus the encoded value will be + * the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline int node_to_cpu(struct optimistic_spin_node *node) +{ + return node->cpu - 1; +} + +/* + * optimistic_spin_node for each cpu is stored linearly in main heap starting + * from global_osq_nodepool_ptr + */ +static inline struct optimistic_spin_node * cpu_to_node(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + return global_osq_nodepool_ptr + cpu_nr; +} + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_queue *lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev, + unsigned long cpu_number) +{ + struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(cpu_number); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; + + for (;;) { + + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + cpu_relax(); + } + + return next; +} + +/* uint64_t *osq is ignored because we use &global_osq instead */ +static bool osq_lock(uint64_t *osq, unsigned long cpu_number) +{ + /* each cpu core has only one thread spinning on one optimistic_spin_node */ + struct optimistic_spin_node *node = global_osq_nodepool_ptr + cpu_number; + /* optimistic_spin_queue stores the current osq tail globally */ + struct optimistic_spin_queue *lock = &global_osq; + struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(cpu_number); + int old; + long long back_off = 0; + + node->locked = 0; + node->next = NULL; + node->cpu = curr; + + /* + * We need both ACQUIRE (pairs with corresponding RELEASE in + * unlock() uncontended, or fastpath) and RELEASE (to publish + * the node fields we just initialised) semantics when updating + * the lock tail. 
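The tail encoding used above reserves 0 for "unlocked" and stores CPU n as n + 1, which also makes the node-pool index trivially recoverable. A standalone sketch with local copies of the two helpers:

#include <assert.h>

static int encode_cpu_example(int cpu_nr)  { return cpu_nr + 1; }
static int decode_cpu_example(int encoded) { return encoded - 1; }

int main(void)
{
    assert(encode_cpu_example(0) == 1);  /* CPU 0 never collides with "unlocked" (0) */
    assert(decode_cpu_example(encode_cpu_example(7)) == 7);
    return 0;
}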
+ */ + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) + return true; + + prev = cpu_to_node(old); + node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + + WRITE_ONCE(prev->next, node); + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!READ_ONCE(node->locked)) { + /* + * TODO: Need to better emulate kernel rescheduling in user space. + * Because we cannot use need_resched() in user space, we simply + * add a upper limit named unqueue_retry to mimic need_resched(). + * If this limit has been exceeded by back_off times, we will jump + * to unqueue path and remove the spinning node from global osq. + */ + /* + * If we need to reschedule bail... so we can block. + * Use vcpu_is_preempted() to avoid waiting for a preempted + * lock holder. + */ + //if (need_resched() || vcpu_is_preempted(node_to_cpu(node->prev))) + if (++back_off > unqueue_retry) /* DEFAULT_UNQUEUE_RETRY 2 billions */ + goto unqueue; + + cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = READ_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev, cpu_number); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + WRITE_ONCE(next->prev, prev); + WRITE_ONCE(prev->next, next); + + return false; +} + +/* uint64_t *osq is ignored because we use &global_osq instead */ +static void osq_unlock(uint64_t *osq, unsigned long cpu_number) +{ + /* optimistic_spin_queue stores the current osq tail globally */ + struct optimistic_spin_queue *lock = &global_osq; + struct optimistic_spin_node *node, *next; + int curr = encode_cpu(cpu_number); + + /* + * Fast path for the uncontended case. + */ + if (atomic_cmpxchg_release(&lock->tail, curr, + OSQ_UNLOCKED_VAL) == curr) + return; + + /* + * Second most likely case. + * If there is a next node, notify it. + */ + node = global_osq_nodepool_ptr + cpu_number; + next = xchg(&node->next, NULL); + if (next) { + WRITE_ONCE(next->locked, 1); + return; + } + + /* + * Wait for another stable next, or get NULL if the queue is empty. 
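The handoff in osq_unlock() works by the owner writing 1 into the waiter's own node->locked, while the waiter spins only on that field. A minimal standalone illustration of that pattern with two threads, using C11 atomics as a stand-in for WRITE_ONCE()/READ_ONCE() and the implied ordering:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int waiter_locked;            /* plays the role of node->locked */

static void *waiter(void *arg)
{
    (void)arg;
    while (!atomic_load_explicit(&waiter_locked, memory_order_acquire))
        ;                                    /* cpu_relax() in the real code */
    puts("waiter: lock handed off");
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, waiter, NULL);
    /* owner releases: the equivalent of WRITE_ONCE(next->locked, 1) */
    atomic_store_explicit(&waiter_locked, 1, memory_order_release);
    pthread_join(t, NULL);
    return 0;
}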
+ */ + next = osq_wait_next(lock, node, NULL, cpu_number); + if (next) + WRITE_ONCE(next->locked, 1); +} + + +/* standard lockhammer lock_acquire and lock_release interfaces */ +static unsigned long __attribute__((noinline)) +lock_acquire (uint64_t *lock, unsigned long threadnum) +{ + /* + * Note: The linux kernel implements additional mutex slow path in mutex.c + * __mutex_lock_common() function. We will create another workload which + * combines osq_lock and mutex_lock_common. This workload only benchmarks + * osq_lock itself. The osq_lock is different from mcs_queue_spinlock + * because of tunable unqueue path and backoff sleep time. + */ + while (!osq_lock(lock, threadnum)) { + /* + * If still cannot acquire the lock after spinning for unqueue_retry + * times, try to backoff and sleep for random microseconds specified + * by parameter '-s', by default the maximum sleep time is 0us. Then + * reacquire the lock again infinitely until success. + * + * This behaves similar to kernel mutex with fine tuning sleep time. + */ + usleep((global_osq_nodepool_ptr + threadnum)->random_sleep); + } + return 1; +} + + +static inline void lock_release (uint64_t *lock, unsigned long threadnum) +{ + osq_unlock(lock, threadnum); +} + +#endif /* __LINUX_OSQ_LOCK_H */ diff --git a/ext/tbb/tbb_spin_rw_mutex.h b/ext/tbb/tbb_spin_rw_mutex.h index 71ae06b..cfdeeef 100644 --- a/ext/tbb/tbb_spin_rw_mutex.h +++ b/ext/tbb/tbb_spin_rw_mutex.h @@ -123,6 +123,14 @@ #ifndef __TBB_spin_mutex_H #define __TBB_spin_mutex_H +#ifdef initialize_lock +#undef initialize_lock +#endif + +#ifdef parse_test_args +#undef parse_test_args +#endif + #define initialize_lock(lock, threads) tbb_init_locks(lock, threads) #define parse_test_args(args, argc, argv) tbb_parse_args(args, argc, argv)
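For reference, the pattern that osq_lock.h and tbb_spin_rw_mutex.h both follow is: undefine the default initialize_lock/parse_test_args hooks, point them at workload-local functions, and provide static lock_acquire()/lock_release() entry points that lockhammer.c calls. A rough, hypothetical sketch of a new workload header built on that inferred pattern (my_lock.h, my_lock_init(), and the test-and-set spin are not part of this patch):

#include <stdint.h>

#ifdef initialize_lock
#undef initialize_lock
#endif
#define initialize_lock(lock, threads) my_lock_init(lock, threads)

static inline void my_lock_init(uint64_t *lock, unsigned long threads)
{
    (void)threads;
    *lock = 0;                               /* 0 == unlocked */
}

static unsigned long __attribute__((noinline))
lock_acquire(uint64_t *lock, unsigned long threadnum)
{
    (void)threadnum;
    while (__atomic_exchange_n(lock, 1, __ATOMIC_ACQUIRE))
        ;                                    /* simple test-and-set spin */
    return 1;
}

static inline void lock_release(uint64_t *lock, unsigned long threadnum)
{
    (void)threadnum;
    __atomic_store_n(lock, 0, __ATOMIC_RELEASE);
}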