diff --git a/benchmarks/lockhammer/Makefile b/benchmarks/lockhammer/Makefile
index 7db286a..0dda8d0 100644
--- a/benchmarks/lockhammer/Makefile
+++ b/benchmarks/lockhammer/Makefile
@@ -1,4 +1,5 @@
-override CFLAGS += -g -O3 -I. -I./include -I../../ext/mysql/include -I../../ext/linux/include -I../../ext/tbb/include
+# The 'override' keyword defeats the LSE_ENABLE=y make command-line option, so it has been removed
+CFLAGS += -g -O3 -I. -I./include -I../../ext/mysql/include -I../../ext/linux/include -I../../ext/tbb/include -I../../ext/sms/base
 
 ifneq ($(DEBUG_LEVEL),)
 ifeq ($(shell test $(DEBUG_LEVEL) -gt 0; echo $$?),0)
@@ -32,7 +33,8 @@ TEST_TARGETS=lh_swap_mutex \
 	lh_empty \
 	lh_jvm_objectmonitor \
 	lh_tbb_spin_rw_mutex \
-	lh_osq_lock
+	lh_osq_lock \
+	lh_clh_spinlock
 
 ifeq ($(TARGET_ARCH),aarch64)
 TEST_TARGETS+=lh_hybrid_spinlock \
@@ -59,6 +61,9 @@ lh_hybrid_spinlock_fastdequeue: ../../ext/linux/hybrid_spinlock_fastdequeue.h in
 lh_osq_lock: ../../ext/linux/osq_lock.h ../../ext/linux/include/lk_atomics.h ../../ext/linux/include/lk_barrier.h ../../ext/linux/include/lk_cmpxchg.h include/atomics.h src/lockhammer.c
 	${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS}
 
+lh_clh_spinlock: ../../ext/sms/clh_spinlock.h ../../ext/sms/base/build_config.h ../../ext/sms/base/cpu.h ../../ext/sms/base/llsc.h src/lockhammer.c
+	${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS}
+
 lh_queued_spinlock: ../../ext/linux/queued_spinlock.h include/atomics.h ../../ext/linux/include/lk_atomics.h src/lockhammer.c
 	${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS}
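For reference, a minimal sketch (not part of the patch) of how the ATOMIC_TEST define set in the rule above is presumably consumed by the harness; the include line is an assumption based on the recipe, since src/lockhammer.c itself is only touched briefly later in this patch:

    /* Sketch only: each lh_* target rebuilds the same harness source with
     * ATOMIC_TEST pointing at one lock header ($< is the first prerequisite),
     * so for lh_clh_spinlock the effective compile is
     *   cc ... -DATOMIC_TEST=\"../../ext/sms/clh_spinlock.h\" src/lockhammer.c
     * and the harness selects the lock implementation with a single include. */
    #include ATOMIC_TEST   /* brings in lock_acquire(), lock_release() and the hook macros */
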
diff --git a/benchmarks/lockhammer/include/atomics.h b/benchmarks/lockhammer/include/atomics.h
index 2520202..6dda573 100644
--- a/benchmarks/lockhammer/include/atomics.h
+++ b/benchmarks/lockhammer/include/atomics.h
@@ -34,12 +34,6 @@
 #ifndef __LH_ATOMICS_H_
 #define __LH_ATOMICS_H_
 
-#ifndef initialize_lock
-    #define initialize_lock(lock, thread)
-#endif
-#ifndef parse_test_args
-    #define parse_test_args(args, argc, argv)
-#endif
 
 static inline void spin_wait (unsigned long wait_iter) {
 #if defined(__aarch64__)
diff --git a/benchmarks/lockhammer/include/lockhammer.h b/benchmarks/lockhammer/include/lockhammer.h
index aa8cac5..427cbd2 100644
--- a/benchmarks/lockhammer/include/lockhammer.h
+++ b/benchmarks/lockhammer/include/lockhammer.h
@@ -32,6 +32,17 @@
 #ifndef __LOCKHAMMER_H__
 #define __LOCKHAMMER_H__
 
+
+#ifndef initialize_lock
+    #define initialize_lock(lock, thread)
+#endif
+#ifndef parse_test_args
+    #define parse_test_args(args, argc, argv)
+#endif
+#ifndef thread_local_init
+    #define thread_local_init(smtid)
+#endif
+
 enum units { NS, INSTS };
 typedef enum units Units;
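Moving the no-op defaults from atomics.h into lockhammer.h, together with the new thread_local_init hook, means a lock header only defines the hooks it needs and inherits empty expansions for the rest. A minimal sketch of the override pattern (the my_lock_thread_init name is illustrative, not from this patch); clh_spinlock.h later in this patch uses exactly this shape:

    /* Sketch only. */
    #ifdef thread_local_init
    #undef thread_local_init
    #endif
    #define thread_local_init(smtid) my_lock_thread_init(smtid)  /* per-thread setup before the hammer loop */

    /* initialize_lock and parse_test_args are left alone here, so the
     * empty defaults from lockhammer.h apply. */
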
diff --git a/benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml b/benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml
index defa3af..5bb0e84 100644
--- a/benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml
+++ b/benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml
@@ -66,6 +66,7 @@ sweeptest:
     - lh_swap_mutex
     - lh_tbb_spin_rw_mutex
     - lh_ticket_spinlock
+    - lh_clh_spinlock
   cmd_aarch64: [lh_hybrid_spinlock, lh_hybrid_spinlock_fastdequeue]
   cmd_x86_64:
   repeat: 9
@@ -78,6 +79,10 @@ sweeptest:
       c: 0ns
      p: 0ns
       o: lstopo
+    - a: 5000
+      c: 200ns
+      p: 0ns
+      o: lstopo
     - a: 5000
       c: 1000ns
       p: 0ns
diff --git a/benchmarks/lockhammer/scripts/runall_obsolete.sh b/benchmarks/lockhammer/scripts/runall_obsolete.sh
deleted file mode 100755
index 0ec8e27..0000000
--- a/benchmarks/lockhammer/scripts/runall_obsolete.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2017, The Linux Foundation. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#   * Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   * Redistributions in binary form must reproduce the above
-#     copyright notice, this list of conditions and the following
-#     disclaimer in the documentation and/or other materials provided
-#     with the distribution.
-#   * Neither the name of The Linux Foundation nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
-# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-./sweep.sh incdec_refcount 0 0 > incdec_refcount_0_0_$HOSTNAME.csv
-./sweep.sh cas_lockref 0 0 > cas_lockref_0_0_$HOSTNAME.csv
-./sweep.sh cas_lockref 2000 1000 > cas_lockref_2000_1000_$HOSTNAME.csv
-./sweep.sh ticket_spinlock 0 0 > ticket_spinlock_0_0_$HOSTNAME.csv
-./sweep.sh ticket_spinlock 1000 5000 > ticket_spinlock_1000_5000_$HOSTNAME.csv
-./sweep.sh queued_spinlock 0 0 > queued_spinlock_0_0_$HOSTNAME.csv
-./sweep.sh queued_spinlock 1000 5000 > queued_spinlock_1000_5000_$HOSTNAME.csv
-./sweep.sh event_mutex 0 0 > event_mutex_0_0_$HOSTNAME.csv
-./sweep.sh event_mutex 1000 5000 > event_mutex_1000_5000_$HOSTNAME.csv
-./sweep.sh cas_event_mutex 0 0 > cas_event_mutex_0_0_$HOSTNAME.csv
-./sweep.sh cas_event_mutex 1000 5000 > cas_event_mutex_1000_5000_$HOSTNAME.csv
-./sweep.sh cas_rw_lock 0 0 > cas_rw_lock_0_0_$HOSTNAME.csv
-./sweep.sh cas_rw_lock 2000 1000 > cas_rw_lock_2000_1000_$HOSTNAME.csv
-./sweep.sh hybrid_spinlock 0 0 > hybrid_spinlock_0_0_$HOSTNAME.csv
-./sweep.sh hybrid_spinlock 1000 5000 > hybrid_spinlock_1000_5000_$HOSTNAME.csv
-./sweep.sh hybrid_spinlock_fastdequeue 0 0 > hybrid_spinlock_fastdequeue_0_0_$HOSTNAME.csv
-./sweep.sh hybrid_spinlock_fastdequeue 1000 5000 > hybrid_spinlock_fastdequeue_1000_5000_$HOSTNAME.csv
-./sweep.sh empty 0 0 > empty_0_0_$HOSTNAME.csv
-./sweep.sh jvm_objectmonitor 0 0 > jvm_objectmonitor_0_0_$HOSTNAME.csv
-./sweep.sh jvm_objectmonitor 1000 5000 > jvm_objectmonitor_1000_5000_$HOSTNAME.csv
-./sweep.sh swap_mutex 0 0 > swap_mutex_0_0_$HOSTNAME.csv
-./sweep.sh swap_mutex 1000 5000 > swap_mutex_1000_5000_$HOSTNAME.csv
-./sweep.sh spin_rw_mutex 0 0 > spin_rw_mutex_0_0_$HOSTNAME.csv
-./sweep.sh spin_rw_mutex 1000 5000 > spin_rw_mutex_1000_5000_$HOSTNAME.csv
diff --git a/benchmarks/lockhammer/src/lockhammer.c b/benchmarks/lockhammer/src/lockhammer.c
index 98448f5..60d9362 100644
--- a/benchmarks/lockhammer/src/lockhammer.c
+++ b/benchmarks/lockhammer/src/lockhammer.c
@@ -448,6 +448,8 @@ void* hmr(void *ptr)
         synchronize_threads(&calibrate_lock, nthrds);
     }
 
+    thread_local_init(mycore);
+
 #ifdef DDEBUG
     printf("%ld %ld\n", hold_count, post_count);
 #endif
diff --git a/ext/sms/base/build_config.h b/ext/sms/base/build_config.h
new file mode 100644
index 0000000..c97e028
--- /dev/null
+++ b/ext/sms/base/build_config.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2017 ARM Limited. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+// Architecture detection is inferred from the toolchain. This relies on
+// the C compiler's system-specific macros.
+#if defined(__aarch64__)
+#define CONFIG_ARCH_ARM_V8
+#define CONFIG_ARCH_64BIT
+#elif defined(__arm__)
+#define CONFIG_ARCH_ARM_V7
+#define CONFIG_ARCH_32BIT
+#elif defined(__x86_64__)
+#define CONFIG_ARCH_X86_64
+#define CONFIG_ARCH_64BIT
+#elif defined(__i386__)
+#define CONFIG_ARCH_X86
+#define CONFIG_ARCH_32BIT
+#endif
+
+#if !defined(CONFIG_ARCH_64BIT) && !defined(CONFIG_ARCH_32BIT)
+#error Please add support for N-bit computing to build_config.h
+// If you hit this C pre-processor error, take a look at the place in this
+// file where CONFIG_ARCH_64/32BIT are defined. If there are no issues there
+// and you need to add support for a new N-bit processor, please search the
+// source code for all occurrences of CONFIG_ARCH_64BIT and CONFIG_ARCH_32BIT
+// to check whether further modification is necessary. These places will not
+// necessarily #error for unsupported N-bit computing.
+#endif
+
+// OS detection is also inferred from the toolchain.
+#if defined(__APPLE__)
+#define OS_MACOSX 1
+#elif defined(__linux__)
+#define OS_LINUX 1
+#elif defined(__FreeBSD__)
+#define OS_FREEBSD 1
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_LINUX) || defined(OS_FREEBSD)
+#define OS_POSIX 1
+#endif
+
+#define MAX_THREADS 32
+
+// Use LL/SC atomic primitives instead of the __atomic_compare_exchange built-ins.
+// This seems to be the most performant option on ARM but may violate
+// recommendations of the ARM architecture (e.g. no memory accesses between
+// LL and SC).
+// USE_LLSC overrides the use of __atomic_compare_exchange.
+#ifdef __ARM_ARCH
+#define USE_LLSC
+#endif
+
+// Use barrier + relaxed store (DMB;STR) instead of store-release (STLR).
+// This is more performant on Cortex-A57 and possibly also on Cortex-A53.
+#if defined(__aarch64__)
+#define USE_DMB
+#endif
+
+#if defined(USE_DMB) && defined(__arm__)
+#error USE_DMB optimization only applies to select ARMv8 processors
+#endif
+
+// Use the ARM wait-for-event mechanism when busy polling.
+// This minimises interconnect transactions and often increases system-wide
+// performance.
+#if defined __ARM_ARCH
+#define USE_WFE
+#if defined(__arm__)
+// TODO: WFE on ARMv7
+#undef USE_WFE
+#endif
+#endif
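These switches are consumed at compile time by cpu.h, llsc.h and the lock headers. A small sketch (not part of the patch) of typical consumption under the defaults above; the struct name is illustrative:

    /* Sketch only. */
    #include "build_config.h"
    #include "cpu.h"   /* CACHE_LINE may be pre-set with -DCACHE_LINE=64, since cpu.h only supplies a default */

    /* Align shared state to a cache line so unrelated writers do not false-share. */
    struct aligned_flag {
        unsigned long value;
    } __attribute__((aligned(CACHE_LINE)));

    #if defined(USE_WFE)
    /* waiters park with the SEVL()/WFE()/LDXR() macros from llsc.h */
    #else
    /* waiters poll with __atomic_load_n() and back off via DOZE() -> doze() */
    #endif
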
diff --git a/ext/sms/base/cpu.h b/ext/sms/base/cpu.h
new file mode 100644
index 0000000..6b93d98
--- /dev/null
+++ b/ext/sms/base/cpu.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2017 ARM Limited. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#ifndef CACHE_LINE
+// Default CPU cache line size
+#define CACHE_LINE 128
+#endif
+
+static inline void doze(void)
+{
+#if defined(__ARM_ARCH)
+    // YIELD hints the CPU to switch to another thread if available,
+    // but otherwise executes as a NOP.
+    // ISB flushes the pipeline, then restarts it. This is guaranteed to stall
+    // the CPU for a number of cycles.
+    __asm__ volatile("isb" : : : "memory");
+#elif defined(__x86_64__)
+    __asm__ volatile("pause" : : : "memory");
+#else
+#error Please add support for your CPU in cpu.h
+#endif
+}
+
+int num_cpus(void);
+
+unsigned long cpu_hz(void);
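doze() is the "be polite while spinning" primitive (ISB on Arm, PAUSE on x86-64) used when WFE is not in play. A minimal sketch (not part of the patch) of the kind of loop it is intended for, written against C11 atomics rather than anything else in this patch:

    /* Sketch only. */
    #include <stdatomic.h>
    #include "cpu.h"

    /* Spin until *flag reads zero, stalling the pipeline briefly between polls. */
    static inline void wait_for_clear(const _Atomic unsigned long *flag)
    {
        while (atomic_load_explicit(flag, memory_order_acquire) != 0)
            doze();
    }
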
diff --git a/ext/sms/base/llsc.h b/ext/sms/base/llsc.h
new file mode 100644
index 0000000..5ef4207
--- /dev/null
+++ b/ext/sms/base/llsc.h
@@ -0,0 +1,359 @@
+// Copyright (c) 2017 ARM Limited. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include "build_config.h"
+#include "cpu.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/******************************************************************************
+ * LL/SC primitives
+ *****************************************************************************/
+
+#if __ARM_ARCH == 7 || (__ARM_ARCH == 8 && __ARM_64BIT_STATE == 0)
+
+static inline void dmb()
+{
+    __asm volatile("dmb" : : : "memory");
+}
+
+static inline uint8_t ll8(uint8_t *var, int mm)
+{
+    uint8_t old;
+    __asm volatile("ldrexb %0, [%1]"
+                   : "=&r" (old)
+                   : "r" (var)
+                   : );
+    if (mm == __ATOMIC_ACQUIRE)
+        dmb();
+    return old;
+}
+
+static inline uint32_t ll(uint32_t *var, int mm)
+{
+    uint32_t old;
+    __asm volatile("ldrex %0, [%1]"
+                   : "=&r" (old)
+                   : "r" (var)
+                   : );
+    // Barrier after an acquiring load
+    if (mm == __ATOMIC_ACQUIRE)
+        dmb();
+    return old;
+}
+#define ll32(a, b) ll((a), (b))
+
+// Return 0 on success, 1 on failure
+static inline uint32_t sc(uint32_t *var, uint32_t neu, int mm)
+{
+    uint32_t ret;
+    // Barrier before a releasing store
+    if (mm == __ATOMIC_RELEASE)
+        dmb();
+    __asm volatile("strex %0, %1, [%2]"
+                   : "=&r" (ret)
+                   : "r" (neu), "r" (var)
+                   : );
+    return ret;
+}
+#define sc32(a, b, c) sc((a), (b), (c))
+
+static inline uint64_t lld(uint64_t *var, int mm)
+{
+    uint64_t old;
+    __asm volatile("ldrexd %0, %H0, [%1]"
+                   : "=&r" (old)
+                   : "r" (var)
+                   : );
+    // Barrier after an acquiring load
+    if (mm == __ATOMIC_ACQUIRE)
+        dmb();
+    return old;
+}
+#define ll64(a, b) lld((a), (b))
+
+// Return 0 on success, 1 on failure
+static inline uint32_t scd(uint64_t *var, uint64_t neu, int mm)
+{
+    uint32_t ret;
+    // Barrier before a releasing store
+    if (mm == __ATOMIC_RELEASE)
+        dmb();
+    __asm volatile("strexd %0, %1, %H1, [%2]"
+                   : "=&r" (ret)
+                   : "r" (neu), "r" (var)
+                   : );
+    return ret;
+}
+#define sc64(a, b, c) scd((a), (b), (c))
+
+#endif
+
+#if __ARM_ARCH == 8 && __ARM_64BIT_STATE == 1
+
+static inline uint8_t ll8(uint8_t *var, int mm)
+{
+    uint8_t old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxrb %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxrb %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+}
+
+static inline uint16_t ll16(uint16_t *var, int mm)
+{
+    uint16_t old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxrh %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxrh %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+}
+
+static inline uint32_t ll32(uint32_t *var, int mm)
+{
+    uint32_t old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxr %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxr %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+}
+
+// Return 0 on success, 1 on failure
+static inline uint8_t sc8(uint8_t *var, uint8_t neu, int mm)
+{
+    uint8_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxrb %w0, %w1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxrb %w0, %w1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+}
+
+// Return 0 on success, 1 on failure
+static inline uint32_t sc32(uint32_t *var, uint32_t neu, int mm)
+{
+    uint32_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxr %w0, %w1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxr %w0, %w1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+}
+
+static inline uint64_t ll(uint64_t *var, int mm)
+{
+    uint64_t old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxr %0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxr %0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+}
+#define ll64(a, b) ll((a), (b))
+
+// Return 0 on success, 1 on failure
+static inline uint32_t sc(uint64_t *var, uint64_t neu, int mm)
+{
+    uint32_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxr %w0, %1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxr %w0, %1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+}
+#define sc64(a, b, c) sc((a), (b), (c))
+
+#if defined(__clang__)
+union i128
+{
+    __int128 i128;
+    int64_t i64[2];
+};
+#endif
+
+static inline __int128 lld(__int128 *var, int mm)
+{
+#if defined(__clang__)
+    union i128 old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxp %0, %1, [%2]"
+                       : "=&r" (old.i64[0]), "=&r" (old.i64[1])
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxp %0, %1, [%2]"
+                       : "=&r" (old.i64[0]), "=&r" (old.i64[1])
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old.i128;
+#else
+    __int128 old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxp %0, %H0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxp %0, %H0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+#endif
+}
+
+// Return 0 on success, 1 on failure
+static inline uint32_t scd(__int128 *var, __int128 neu, int mm)
+{
+#if defined(__clang__)
+    uint32_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxp %w0, %1, %2, [%3]"
+                       : "=&r" (ret)
+                       : "r" (((union i128)neu).i64[0]),
+                         "r" (((union i128)neu).i64[1]),
+                         "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxp %w0, %1, %2, [%3]"
+                       : "=&r" (ret)
+                       : "r" (((union i128)neu).i64[0]),
+                         "r" (((union i128)neu).i64[1]),
+                         "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+#else
+    uint32_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxp %w0, %1, %H1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxp %w0, %1, %H1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+#endif
+}
+#endif
+
+static inline void sevl(void)
+{
+#if defined __ARM_ARCH
+    __asm volatile("sevl" : : : );
+#endif
+}
+
+static inline void sev(void)
+{
+#if defined __ARM_ARCH
+    __asm volatile("sev" : : : "memory");
+#endif
+}
+
+static inline int wfe(void)
+{
+#if defined __ARM_ARCH
+    __asm volatile("wfe" : : : "memory");
+#endif
+    return 1;
+}
+
+#ifdef USE_WFE
+#define SEVL() sevl()
+#define WFE() wfe()
+#define SEV() do { __asm volatile ("dsb ish" ::: "memory"); sev(); } while(0)
+#if __ARM_ARCH == 8 && __ARM_64BIT_STATE == 1
+#define LDXR128(addr, mo) lld((addr), (mo))
+#endif
+#define LDXR64(addr, mo) ll64((addr), (mo))
+#define LDXR32(addr, mo) ll32((addr), (mo))
+#define LDXR16(addr, mo) ll16((addr), (mo))
+#define LDXR8(addr, mo) ll8((addr), (mo))
+#define LDXR(addr, mo) ll((addr), (mo))
+// When using WFE we should not stall the pipeline by other means
+#define DOZE() (void)0
+#else
+#define SEVL() (void)0
+#define WFE() 1
+#define SEV() (void)0
+#define LDXR128(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR64(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR32(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR16(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR8(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR(addr, mo) __atomic_load_n((addr), (mo))
+#define DOZE() doze()
+#endif
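The ll*/sc* pairs above implement load-exclusive/store-exclusive: the store returns 0 only if nothing intervened since the matching load. A minimal sketch (not part of the patch) of the usual retry loop built on them, a 64-bit compare-and-swap with a fallback for builds where USE_LLSC is not defined; the cas64 name is illustrative:

    /* Sketch only. */
    #include <stdbool.h>
    #include <stdint.h>
    #include "llsc.h"

    static inline bool cas64(uint64_t *loc, uint64_t expected, uint64_t desired)
    {
    #ifdef USE_LLSC
        uint64_t old;
        do {
            old = ll64(loc, __ATOMIC_ACQUIRE);           /* load-exclusive, acquire */
            if (old != expected)
                return false;                            /* value mismatch; a real implementation might CLREX here */
        } while (sc64(loc, desired, __ATOMIC_RELEASE));  /* store-exclusive: 0 = success, 1 = retry */
        return true;
    #else
        return __atomic_compare_exchange_n(loc, &expected, desired, false,
                                           __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
    #endif
    }
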
diff --git a/ext/sms/clh_spinlock.h b/ext/sms/clh_spinlock.h
new file mode 100644
index 0000000..8cf334b
--- /dev/null
+++ b/ext/sms/clh_spinlock.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2017 ARM Limited. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice, this
+ * list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * Neither the name of ARM Limited nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Arm Shared Memory Synchronization Benchmark (SMS)
+ * commit: 85a4b2456f1c84e2235a527d8b2b69be99621e94
+ * August 6 2018
+ *
+ * Description:
+ * The CLH (Craig, Landin and Hagersten) spinlock is a queue-based spinlock in
+ * which each node spins on the previous node's wait status. The CLH spinlock
+ * is starvation-free and grants the lock in FCFS (first come, first served)
+ * order. Because each thread spins on a node created by another thread, CLH
+ * performance may be worse than that of the MCS spinlock, which spins only on
+ * local memory. In practice this should not be a problem, because modern
+ * server architectures implement ccNUMA (cache coherent non-uniform memory
+ * architecture), which coherently caches remote memory in a local cache line.
+ * The remote memory may not be updated at all, and the changed status is
+ * transferred implicitly to the spinning core by the interconnect cache
+ * coherence protocol. The CLH data structure is an implicit linked list; the
+ * global lock (global_clh_lock) only contains a cache-line-aligned tail
+ * pointer and an initial dummy clh_node. The main disadvantages of the CLH
+ * spinlock compared to the MCS spinlock are: 1) it is slower than MCS on
+ * cacheless NUMA, and 2) it is hard to extend to wait-free back-off /
+ * time-out / abortable / hierarchical spinlocks.
+ *
+ * Changes compared to the official CLH spinlock:
+ * The official CLH spinlock reuses the previously released queue node. Here,
+ * per-thread pointers track each thread's current node, which is itself a
+ * per-thread structure. As a result a thread may spin on another thread's
+ * queue node, and the ccNUMA coherence protocol caches the remote memory in
+ * the local cache. Overall performance should therefore be similar to the MCS
+ * spinlock.
+ *
+ * Internals:
+ * The only LSE instruction used is SWPAL, which exchanges the current node
+ * with the lock tail. A tunable parameter -w can be used to disable WFE. All
+ * variables are cache-line aligned. Each thread has its own queue node,
+ * allocated from a shared node pool (see the notes at clh_nodepool below).
+ * The new clh_thread_local_init() function initializes each thread's queue
+ * node. clh_lock() and clh_unlock() strictly follow the original CLH
+ * algorithm. The global uint64_t lock pointer passed in by the harness is not
+ * used.
+ *
+ * Workings:
+ * clh_spinlock works similarly to osq_lock and queued_spinlock.
+ *
+ * Tuning Parameters:
+ *
+ * An optional without_wfe flag disables the WFE instruction and uses empty
+ * spin loops instead.
+ *
+ * [-- [-w]]: disable sevl and wfe
+ *
+ */
+
+#pragma once
+
+#include "llsc.h"
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifdef initialize_lock
+#undef initialize_lock
+#endif
+
+#ifdef parse_test_args
+#undef parse_test_args
+#endif
+
+#ifdef thread_local_init
+#undef thread_local_init
+#endif
+
+#define initialize_lock(lock, threads) clh_lock_init(lock, threads)
+#define parse_test_args(args, argc, argv) clh_parse_args(args, argc, argv)
+#define thread_local_init(smtid) clh_thread_local_init(smtid)
+
+
+struct clh_node
+{
+    struct clh_node *prev;
+    unsigned long wait;
+} __attribute__ ((aligned (CACHE_LINE)));
+
+struct clh_node_pointer
+{
+    struct clh_node *ptr;
+} __attribute__ ((aligned (CACHE_LINE)));
+
+struct clh_lock
+{
+    struct clh_node node;
+    unsigned long num_cores;
+    struct clh_node *tail __attribute__ ((aligned(CACHE_LINE)));
+};
+
+static bool without_wfe;
+static struct clh_lock global_clh_lock; // clh lock queue
+
+/*
+ * We cannot use __thread thread-local storage because some threads may be
+ * joined early while their node is still referenced by other threads, which
+ * would cause a memory access violation. Instead we allocate from the main
+ * thread's heap and share plain C arrays. Two arrays are used: one is a
+ * pointer array with a fixed slot per thread; the other is a node pool whose
+ * nodes are initially assigned to threads by thread id. Following the CLH
+ * algorithm, the current node then reuses its predecessor as the next
+ * available node, and the fixed pointer array is updated to reflect this.
+ * That is, each thread retrieves its next available node from the pointer
+ * array at its thread-id offset, but that pointer may refer to any node in
+ * the CLH node pool.
+ */
+static struct clh_node_pointer *clh_nodeptr; // clh node pointer array
+static struct clh_node *clh_nodepool; // clh node struct array
+
+/* additional parameter to enable WFE (default) or disable WFE */
+static void clh_parse_args(test_args unused, int argc, char** argv) {
+    int i = 0;
+#if defined(__aarch64__)
+    without_wfe = false;
+#else
+    /* only aarch64 supports WFE */
+    without_wfe = true;
+#endif
+
+    /* extended options retrieved after the '--' operator */
+    while ((i = getopt(argc, argv, "w")) != -1)
+    {
+        switch (i) {
+          case 'w':
+            without_wfe = true;
+            break;
+
+          default:
+            fprintf(stderr,
+                    "clh_spinlock additional options after --:\n"
+                    "\t[-h print this msg]\n"
+                    "\t[-w without_wfe, aarch64 default is false, non-aarch64 default is true]\n");
+            exit(2);
+        }
+    }
+}
+
+static inline void clh_lock_init(uint64_t *u64_lock, unsigned long num_cores)
+{
+    /* the default (dummy) tail node's wait status must be 0 */
+    global_clh_lock.node.prev = NULL;
+    global_clh_lock.node.wait = 0;
+    global_clh_lock.num_cores = num_cores;
+    global_clh_lock.tail = &global_clh_lock.node;
+
+    /* save the clh_lock pointer to the harness's global uint64_t */
+    *u64_lock = (uint64_t)&global_clh_lock;
+
+    /* calloc initializes all memory to zero automatically */
+    clh_nodeptr = calloc(num_cores, sizeof(struct clh_node_pointer));
+    if (clh_nodeptr == NULL) exit(errno);
+    clh_nodepool = calloc(num_cores, sizeof(struct clh_node));
+    if (clh_nodepool == NULL) exit(errno);
+
+#ifdef DDEBUG
+    printf("CLH: global_clh_lock=%llx\n", (long long unsigned int) &global_clh_lock);
+#endif
+}
+
+static inline void clh_thread_local_init(unsigned long smtid)
+{
+    /* initialize this thread's slot in the clh node pointer array */
+    clh_nodepool[smtid].wait = 1;
+    clh_nodeptr[smtid].ptr = &clh_nodepool[smtid];
+}
+
+static inline void clh_lock(struct clh_lock *lock, struct clh_node *node, bool use_wfe, unsigned long tid)
+{
+    /* wait must be set to 1 first, otherwise the node queued after the new tail will not spin */
+    node->wait = 1;
+    struct clh_node *prev = node->prev = __atomic_exchange_n(&lock->tail, node, __ATOMIC_ACQ_REL);
+#ifdef DDEBUG
+    printf("T%lu LOCK: prev<-node: %llx<-%llx\n", tid, (long long unsigned int)prev, (long long unsigned int)node);
+#endif
+
+    /* CLH spinlock: spin on the previous node's wait status */
+    if (use_wfe)
+    {
+        if (__atomic_load_n(&prev->wait, __ATOMIC_ACQUIRE))
+        {
+            SEVL();
+            while (WFE() && LDXR(&prev->wait, __ATOMIC_ACQUIRE))
+            {
+                DOZE();
+            }
+        }
+    }
+    else
+    {
+        while (__atomic_load_n(&prev->wait, __ATOMIC_ACQUIRE))
+        {
+            ;
+        }
+    }
+}
+
+/* after unlock, the caller takes over the previous node for its next clh_lock() */
+static inline void clh_unlock(struct clh_node *node, unsigned long tid)
+{
+#ifdef DDEBUG
+    printf("T%lu UNLOCK: node: %llx\n", tid, (long long unsigned int)node);
+#endif
+    /* CLH spinlock: release the lock by resetting the current node's wait status */
+#ifdef USE_DMB
+    __atomic_thread_fence(__ATOMIC_RELEASE);
+    __atomic_store_n(&node->wait, 0, __ATOMIC_RELAXED);
+#else
+    __atomic_store_n(&node->wait, 0, __ATOMIC_RELEASE);
+#endif
+}
+
+/* standard lockhammer lock_acquire and lock_release interfaces */
+static unsigned long __attribute__((noinline))
+lock_acquire (uint64_t *lock, unsigned long threadnum)
+{
+    clh_lock(&global_clh_lock, clh_nodeptr[threadnum].ptr, !without_wfe, threadnum);
+    return 1;
+}
+
+static inline void lock_release (uint64_t *lock, unsigned long threadnum)
+{
+    /*
+     * prev has to be saved first: once clh_unlock() has been called,
+     * node->prev might be overwritten by another thread, causing two threads
+     * to use the same nodepool clh_node and producing a circular linked list
+     * after another round of lock acquisition.
+     */
+    struct clh_node* prev = clh_nodeptr[threadnum].ptr->prev;
+    clh_unlock(clh_nodeptr[threadnum].ptr, threadnum);
+    clh_nodeptr[threadnum].ptr = prev;
+}
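To make the node-recycling scheme concrete, here is a single-threaded sketch (not part of the patch) that drives the entry points above and shows a thread's clh_nodeptr slot ending up on the node inherited from its predecessor (initially the dummy node inside global_clh_lock). It assumes the harness header that defines test_args (lockhammer.h) is available on the include path, as it is when lockhammer.c includes this file:

    /* Sketch only. */
    #include <stdio.h>
    #include <stdint.h>
    #include "lockhammer.h"      /* assumed to provide test_args, as in the harness build */
    #include "clh_spinlock.h"

    int main(void)
    {
        uint64_t lock_word;
        initialize_lock(&lock_word, 1);   /* clh_lock_init(): dummy tail node + pools for one thread */
        thread_local_init(0);             /* thread 0 claims clh_nodepool[0] via clh_nodeptr[0] */

        struct clh_node *before = clh_nodeptr[0].ptr;
        lock_acquire(&lock_word, 0);      /* swap our node into the tail, spin on the dummy's wait flag (already 0) */
        lock_release(&lock_word, 0);      /* clear our wait flag and adopt the predecessor (the dummy) for reuse */
        struct clh_node *after = clh_nodeptr[0].ptr;

        printf("node before=%p after=%p (predecessor recycled)\n", (void *)before, (void *)after);
        return 0;
    }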