diff --git a/benchmarks/lockhammer/Makefile b/benchmarks/lockhammer/Makefile
index 7db286a..0dda8d0 100644
--- a/benchmarks/lockhammer/Makefile
+++ b/benchmarks/lockhammer/Makefile
@@ -1,4 +1,5 @@
-override CFLAGS += -g -O3 -I. -I./include -I../../ext/mysql/include -I../../ext/linux/include -I../../ext/tbb/include
+# The 'override' keyword defeats the LSE_ENABLE=y make command-line option, so it has been removed
+CFLAGS += -g -O3 -I. -I./include -I../../ext/mysql/include -I../../ext/linux/include -I../../ext/tbb/include -I../../ext/sms/base
 
 ifneq ($(DEBUG_LEVEL),)
 ifeq ($(shell test $(DEBUG_LEVEL) -gt 0; echo $$?),0)
@@ -32,7 +33,8 @@ TEST_TARGETS=lh_swap_mutex \
 	lh_empty \
 	lh_jvm_objectmonitor \
 	lh_tbb_spin_rw_mutex \
-	lh_osq_lock
+	lh_osq_lock \
+	lh_clh_spinlock
 
 ifeq ($(TARGET_ARCH),aarch64)
 TEST_TARGETS+=lh_hybrid_spinlock \
@@ -59,6 +61,9 @@ lh_hybrid_spinlock_fastdequeue: ../../ext/linux/hybrid_spinlock_fastdequeue.h in
 lh_osq_lock: ../../ext/linux/osq_lock.h ../../ext/linux/include/lk_atomics.h ../../ext/linux/include/lk_barrier.h ../../ext/linux/include/lk_cmpxchg.h include/atomics.h src/lockhammer.c
 	${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS}
 
+lh_clh_spinlock: ../../ext/sms/clh_spinlock.h ../../ext/sms/base/build_config.h ../../ext/sms/base/cpu.h ../../ext/sms/base/llsc.h src/lockhammer.c
+	${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS}
+
 lh_queued_spinlock: ../../ext/linux/queued_spinlock.h include/atomics.h ../../ext/linux/include/lk_atomics.h src/lockhammer.c
 	${CC} ${CFLAGS} -DATOMIC_TEST=\"$<\" src/lockhammer.c -o build/$@ ${LDFLAGS}
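For reference, a minimal sketch (not part of the patch) of how the ATOMIC_TEST define set in the rule above is presumably consumed by the harness; the include line is an assumption based on the recipe, since src/lockhammer.c itself is only touched briefly later in this patch:

    /* Sketch only: each lh_* target rebuilds the same harness source with
     * ATOMIC_TEST pointing at one lock header ($< is the first prerequisite),
     * so for lh_clh_spinlock the effective compile is
     *   cc ... -DATOMIC_TEST=\"../../ext/sms/clh_spinlock.h\" src/lockhammer.c
     * and the harness selects the lock implementation with a single include. */
    #include ATOMIC_TEST   /* brings in lock_acquire(), lock_release() and the hook macros */
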
diff --git a/benchmarks/lockhammer/include/atomics.h b/benchmarks/lockhammer/include/atomics.h
index 2520202..6dda573 100644
--- a/benchmarks/lockhammer/include/atomics.h
+++ b/benchmarks/lockhammer/include/atomics.h
@@ -34,12 +34,6 @@
 #ifndef __LH_ATOMICS_H_
 #define __LH_ATOMICS_H_
 
-#ifndef initialize_lock
-    #define initialize_lock(lock, thread)
-#endif
-#ifndef parse_test_args
-    #define parse_test_args(args, argc, argv)
-#endif
 
 static inline void spin_wait (unsigned long wait_iter) {
 #if defined(__aarch64__)
diff --git a/benchmarks/lockhammer/include/lockhammer.h b/benchmarks/lockhammer/include/lockhammer.h
index aa8cac5..427cbd2 100644
--- a/benchmarks/lockhammer/include/lockhammer.h
+++ b/benchmarks/lockhammer/include/lockhammer.h
@@ -32,6 +32,17 @@
 #ifndef __LOCKHAMMER_H__
 #define __LOCKHAMMER_H__
 
+
+#ifndef initialize_lock
+    #define initialize_lock(lock, thread)
+#endif
+#ifndef parse_test_args
+    #define parse_test_args(args, argc, argv)
+#endif
+#ifndef thread_local_init
+    #define thread_local_init(smtid)
+#endif
+
 enum units { NS, INSTS };
 typedef enum units Units;
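Moving the no-op defaults from atomics.h into lockhammer.h, together with the new thread_local_init hook, means a lock header only defines the hooks it needs and inherits empty expansions for the rest. A minimal sketch of the override pattern (the my_lock_thread_init name is illustrative, not from this patch); clh_spinlock.h later in this patch uses exactly this shape:

    /* Sketch only. */
    #ifdef thread_local_init
    #undef thread_local_init
    #endif
    #define thread_local_init(smtid) my_lock_thread_init(smtid)  /* per-thread setup before the hammer loop */

    /* initialize_lock and parse_test_args are left alone here, so the
     * empty defaults from lockhammer.h apply. */
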
diff --git a/benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml b/benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml
index defa3af..5bb0e84 100644
--- a/benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml
+++ b/benchmarks/lockhammer/scripts/lh_sweeptest_cfg.yaml
@@ -66,6 +66,7 @@ sweeptest:
     - lh_swap_mutex
     - lh_tbb_spin_rw_mutex
     - lh_ticket_spinlock
+    - lh_clh_spinlock
   cmd_aarch64: [lh_hybrid_spinlock, lh_hybrid_spinlock_fastdequeue]
   cmd_x86_64:
   repeat: 9
@@ -78,6 +79,10 @@ sweeptest:
       c: 0ns
      p: 0ns
       o: lstopo
+    - a: 5000
+      c: 200ns
+      p: 0ns
+      o: lstopo
     - a: 5000
       c: 1000ns
       p: 0ns
diff --git a/benchmarks/lockhammer/scripts/runall_obsolete.sh b/benchmarks/lockhammer/scripts/runall_obsolete.sh
deleted file mode 100755
index 0ec8e27..0000000
--- a/benchmarks/lockhammer/scripts/runall_obsolete.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) 2017, The Linux Foundation. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#   * Redistributions of source code must retain the above copyright
-#     notice, this list of conditions and the following disclaimer.
-#   * Redistributions in binary form must reproduce the above
-#     copyright notice, this list of conditions and the following
-#     disclaimer in the documentation and/or other materials provided
-#     with the distribution.
-#   * Neither the name of The Linux Foundation nor the names of its
-#     contributors may be used to endorse or promote products derived
-#     from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
-# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
-# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-./sweep.sh incdec_refcount 0 0 > incdec_refcount_0_0_$HOSTNAME.csv
-./sweep.sh cas_lockref 0 0 > cas_lockref_0_0_$HOSTNAME.csv
-./sweep.sh cas_lockref 2000 1000 > cas_lockref_2000_1000_$HOSTNAME.csv
-./sweep.sh ticket_spinlock 0 0 > ticket_spinlock_0_0_$HOSTNAME.csv
-./sweep.sh ticket_spinlock 1000 5000 > ticket_spinlock_1000_5000_$HOSTNAME.csv
-./sweep.sh queued_spinlock 0 0 > queued_spinlock_0_0_$HOSTNAME.csv
-./sweep.sh queued_spinlock 1000 5000 > queued_spinlock_1000_5000_$HOSTNAME.csv
-./sweep.sh event_mutex 0 0 > event_mutex_0_0_$HOSTNAME.csv
-./sweep.sh event_mutex 1000 5000 > event_mutex_1000_5000_$HOSTNAME.csv
-./sweep.sh cas_event_mutex 0 0 > cas_event_mutex_0_0_$HOSTNAME.csv
-./sweep.sh cas_event_mutex 1000 5000 > cas_event_mutex_1000_5000_$HOSTNAME.csv
-./sweep.sh cas_rw_lock 0 0 > cas_rw_lock_0_0_$HOSTNAME.csv
-./sweep.sh cas_rw_lock 2000 1000 > cas_rw_lock_2000_1000_$HOSTNAME.csv
-./sweep.sh hybrid_spinlock 0 0 > hybrid_spinlock_0_0_$HOSTNAME.csv
-./sweep.sh hybrid_spinlock 1000 5000 > hybrid_spinlock_1000_5000_$HOSTNAME.csv
-./sweep.sh hybrid_spinlock_fastdequeue 0 0 > hybrid_spinlock_fastdequeue_0_0_$HOSTNAME.csv
-./sweep.sh hybrid_spinlock_fastdequeue 1000 5000 > hybrid_spinlock_fastdequeue_1000_5000_$HOSTNAME.csv
-./sweep.sh empty 0 0 > empty_0_0_$HOSTNAME.csv
-./sweep.sh jvm_objectmonitor 0 0 > jvm_objectmonitor_0_0_$HOSTNAME.csv
-./sweep.sh jvm_objectmonitor 1000 5000 > jvm_objectmonitor_1000_5000_$HOSTNAME.csv
-./sweep.sh swap_mutex 0 0 > swap_mutex_0_0_$HOSTNAME.csv
-./sweep.sh swap_mutex 1000 5000 > swap_mutex_1000_5000_$HOSTNAME.csv
-./sweep.sh spin_rw_mutex 0 0 > spin_rw_mutex_0_0_$HOSTNAME.csv
-./sweep.sh spin_rw_mutex 1000 5000 > spin_rw_mutex_1000_5000_$HOSTNAME.csv
diff --git a/benchmarks/lockhammer/src/lockhammer.c b/benchmarks/lockhammer/src/lockhammer.c
index 98448f5..60d9362 100644
--- a/benchmarks/lockhammer/src/lockhammer.c
+++ b/benchmarks/lockhammer/src/lockhammer.c
@@ -448,6 +448,8 @@ void* hmr(void *ptr)
         synchronize_threads(&calibrate_lock, nthrds);
     }
 
+    thread_local_init(mycore);
+
 #ifdef DDEBUG
     printf("%ld %ld\n", hold_count, post_count);
 #endif
diff --git a/ext/sms/base/build_config.h b/ext/sms/base/build_config.h
new file mode 100644
index 0000000..c97e028
--- /dev/null
+++ b/ext/sms/base/build_config.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2017 ARM Limited. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+// Architecture detection is inferred from the toolchain. This relies on
+// the C compiler's system-specific macros.
+#if defined(__aarch64__)
+#define CONFIG_ARCH_ARM_V8
+#define CONFIG_ARCH_64BIT
+#elif defined(__arm__)
+#define CONFIG_ARCH_ARM_V7
+#define CONFIG_ARCH_32BIT
+#elif defined(__x86_64__)
+#define CONFIG_ARCH_X86_64
+#define CONFIG_ARCH_64BIT
+#elif defined(__i386__)
+#define CONFIG_ARCH_X86
+#define CONFIG_ARCH_32BIT
+#endif
+
+#if !defined(CONFIG_ARCH_64BIT) && !defined(CONFIG_ARCH_32BIT)
+#error Please add support for N-bit computing to build_config.h
+// If you hit this C pre-processor error, take a look at the place in this
+// file where CONFIG_ARCH_64/32BIT are defined. If there are no issues there
+// and you need to add support for a new N-bit processor, please search the
+// source code for all occurrences of CONFIG_ARCH_64BIT and CONFIG_ARCH_32BIT
+// to check whether further modification is necessary. These places will not
+// necessarily #error for unsupported N-bit computing.
+#endif
+
+// OS detection is also inferred from the toolchain.
+#if defined(__APPLE__)
+#define OS_MACOSX 1
+#elif defined(__linux__)
+#define OS_LINUX 1
+#elif defined(__FreeBSD__)
+#define OS_FREEBSD 1
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_LINUX) || defined(OS_FREEBSD)
+#define OS_POSIX 1
+#endif
+
+#define MAX_THREADS 32
+
+// Use LL/SC atomic primitives instead of the __atomic_compare_exchange built-ins.
+// This seems to be the most performant option on ARM but may violate
+// recommendations of the ARM architecture (e.g. no memory accesses between
+// LL and SC).
+// USE_LLSC overrides the use of __atomic_compare_exchange.
+#ifdef __ARM_ARCH
+#define USE_LLSC
+#endif
+
+// Use barrier + relaxed store (DMB;STR) instead of store-release (STLR).
+// This is more performant on Cortex-A57 and possibly also on Cortex-A53.
+#if defined(__aarch64__)
+#define USE_DMB
+#endif
+
+#if defined(USE_DMB) && defined(__arm__)
+#error USE_DMB optimization only applies to select ARMv8 processors
+#endif
+
+// Use the ARM wait-for-event mechanism when busy polling.
+// This minimises interconnect transactions and often increases system-wide
+// performance.
+#if defined __ARM_ARCH
+#define USE_WFE
+#if defined(__arm__)
+// TODO: WFE on ARMv7
+#undef USE_WFE
+#endif
+#endif
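These switches are consumed at compile time by cpu.h, llsc.h and the lock headers. A small sketch (not part of the patch) of typical consumption under the defaults above; the struct name is illustrative:

    /* Sketch only. */
    #include "build_config.h"
    #include "cpu.h"   /* CACHE_LINE may be pre-set with -DCACHE_LINE=64, since cpu.h only supplies a default */

    /* Align shared state to a cache line so unrelated writers do not false-share. */
    struct aligned_flag {
        unsigned long value;
    } __attribute__((aligned(CACHE_LINE)));

    #if defined(USE_WFE)
    /* waiters park with the SEVL()/WFE()/LDXR() macros from llsc.h */
    #else
    /* waiters poll with __atomic_load_n() and back off via DOZE() -> doze() */
    #endif
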
diff --git a/ext/sms/base/cpu.h b/ext/sms/base/cpu.h
new file mode 100644
index 0000000..6b93d98
--- /dev/null
+++ b/ext/sms/base/cpu.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2017 ARM Limited. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#ifndef CACHE_LINE
+// Default CPU cache line size
+#define CACHE_LINE 128
+#endif
+
+static inline void doze(void)
+{
+#if defined(__ARM_ARCH)
+    // YIELD hints the CPU to switch to another thread if available,
+    // but otherwise executes as a NOP.
+    // ISB flushes the pipeline, then restarts it. This is guaranteed to stall
+    // the CPU for a number of cycles.
+    __asm__ volatile("isb" : : : "memory");
+#elif defined(__x86_64__)
+    __asm__ volatile("pause" : : : "memory");
+#else
+#error Please add support for your CPU in cpu.h
+#endif
+}
+
+int num_cpus(void);
+
+unsigned long cpu_hz(void);
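doze() is the "be polite while spinning" primitive (ISB on Arm, PAUSE on x86-64) used when WFE is not in play. A minimal sketch (not part of the patch) of the kind of loop it is intended for, written against C11 atomics rather than anything else in this patch:

    /* Sketch only. */
    #include <stdatomic.h>
    #include "cpu.h"

    /* Spin until *flag reads zero, stalling the pipeline briefly between polls. */
    static inline void wait_for_clear(const _Atomic unsigned long *flag)
    {
        while (atomic_load_explicit(flag, memory_order_acquire) != 0)
            doze();
    }
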
diff --git a/ext/sms/base/llsc.h b/ext/sms/base/llsc.h
new file mode 100644
index 0000000..5ef4207
--- /dev/null
+++ b/ext/sms/base/llsc.h
@@ -0,0 +1,359 @@
+// Copyright (c) 2017 ARM Limited. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include "build_config.h"
+#include "cpu.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/******************************************************************************
+ * LL/SC primitives
+ *****************************************************************************/
+
+#if __ARM_ARCH == 7 || (__ARM_ARCH == 8 && __ARM_64BIT_STATE == 0)
+
+static inline void dmb()
+{
+    __asm volatile("dmb" : : : "memory");
+}
+
+static inline uint8_t ll8(uint8_t *var, int mm)
+{
+    uint8_t old;
+    __asm volatile("ldrexb %0, [%1]"
+                   : "=&r" (old)
+                   : "r" (var)
+                   : );
+    if (mm == __ATOMIC_ACQUIRE)
+        dmb();
+    return old;
+}
+
+static inline uint32_t ll(uint32_t *var, int mm)
+{
+    uint32_t old;
+    __asm volatile("ldrex %0, [%1]"
+                   : "=&r" (old)
+                   : "r" (var)
+                   : );
+    // Barrier after an acquiring load
+    if (mm == __ATOMIC_ACQUIRE)
+        dmb();
+    return old;
+}
+#define ll32(a, b) ll((a), (b))
+
+// Return 0 on success, 1 on failure
+static inline uint32_t sc(uint32_t *var, uint32_t neu, int mm)
+{
+    uint32_t ret;
+    // Barrier before a releasing store
+    if (mm == __ATOMIC_RELEASE)
+        dmb();
+    __asm volatile("strex %0, %1, [%2]"
+                   : "=&r" (ret)
+                   : "r" (neu), "r" (var)
+                   : );
+    return ret;
+}
+#define sc32(a, b, c) sc((a), (b), (c))
+
+static inline uint64_t lld(uint64_t *var, int mm)
+{
+    uint64_t old;
+    __asm volatile("ldrexd %0, %H0, [%1]"
+                   : "=&r" (old)
+                   : "r" (var)
+                   : );
+    // Barrier after an acquiring load
+    if (mm == __ATOMIC_ACQUIRE)
+        dmb();
+    return old;
+}
+#define ll64(a, b) lld((a), (b))
+
+// Return 0 on success, 1 on failure
+static inline uint32_t scd(uint64_t *var, uint64_t neu, int mm)
+{
+    uint32_t ret;
+    // Barrier before a releasing store
+    if (mm == __ATOMIC_RELEASE)
+        dmb();
+    __asm volatile("strexd %0, %1, %H1, [%2]"
+                   : "=&r" (ret)
+                   : "r" (neu), "r" (var)
+                   : );
+    return ret;
+}
+#define sc64(a, b, c) scd((a), (b), (c))
+
+#endif
+
+#if __ARM_ARCH == 8 && __ARM_64BIT_STATE == 1
+
+static inline uint8_t ll8(uint8_t *var, int mm)
+{
+    uint8_t old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxrb %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxrb %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+}
+
+static inline uint16_t ll16(uint16_t *var, int mm)
+{
+    uint16_t old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxrh %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxrh %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+}
+
+static inline uint32_t ll32(uint32_t *var, int mm)
+{
+    uint32_t old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxr %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxr %w0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+}
+
+// Return 0 on success, 1 on failure
+static inline uint8_t sc8(uint8_t *var, uint8_t neu, int mm)
+{
+    uint8_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxrb %w0, %w1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxrb %w0, %w1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+}
+
+// Return 0 on success, 1 on failure
+static inline uint32_t sc32(uint32_t *var, uint32_t neu, int mm)
+{
+    uint32_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxr %w0, %w1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxr %w0, %w1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+}
+
+static inline uint64_t ll(uint64_t *var, int mm)
+{
+    uint64_t old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxr %0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxr %0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+}
+#define ll64(a, b) ll((a), (b))
+
+// Return 0 on success, 1 on failure
+static inline uint32_t sc(uint64_t *var, uint64_t neu, int mm)
+{
+    uint32_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxr %w0, %1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxr %w0, %1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+}
+#define sc64(a, b, c) sc((a), (b), (c))
+
+#if defined(__clang__)
+union i128
+{
+    __int128 i128;
+    int64_t i64[2];
+};
+#endif
+
+static inline __int128 lld(__int128 *var, int mm)
+{
+#if defined(__clang__)
+    union i128 old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxp %0, %1, [%2]"
+                       : "=&r" (old.i64[0]), "=&r" (old.i64[1])
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxp %0, %1, [%2]"
+                       : "=&r" (old.i64[0]), "=&r" (old.i64[1])
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old.i128;
+#else
+    __int128 old;
+    if (mm == __ATOMIC_ACQUIRE)
+        __asm volatile("ldaxp %0, %H0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("ldxp %0, %H0, [%1]"
+                       : "=&r" (old)
+                       : "r" (var)
+                       : );
+    else
+        abort();
+    return old;
+#endif
+}
+
+// Return 0 on success, 1 on failure
+static inline uint32_t scd(__int128 *var, __int128 neu, int mm)
+{
+#if defined(__clang__)
+    uint32_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxp %w0, %1, %2, [%3]"
+                       : "=&r" (ret)
+                       : "r" (((union i128)neu).i64[0]),
+                         "r" (((union i128)neu).i64[1]),
+                         "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxp %w0, %1, %2, [%3]"
+                       : "=&r" (ret)
+                       : "r" (((union i128)neu).i64[0]),
+                         "r" (((union i128)neu).i64[1]),
+                         "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+#else
+    uint32_t ret;
+    if (mm == __ATOMIC_RELEASE)
+        __asm volatile("stlxp %w0, %1, %H1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : "memory");
+    else if (mm == __ATOMIC_RELAXED)
+        __asm volatile("stxp %w0, %1, %H1, [%2]"
+                       : "=&r" (ret)
+                       : "r" (neu), "r" (var)
+                       : );
+    else
+        abort();
+    return ret;
+#endif
+}
+#endif
+
+static inline void sevl(void)
+{
+#if defined __ARM_ARCH
+    __asm volatile("sevl" : : : );
+#endif
+}
+
+static inline void sev(void)
+{
+#if defined __ARM_ARCH
+    __asm volatile("sev" : : : "memory");
+#endif
+}
+
+static inline int wfe(void)
+{
+#if defined __ARM_ARCH
+    __asm volatile("wfe" : : : "memory");
+#endif
+    return 1;
+}
+
+#ifdef USE_WFE
+#define SEVL() sevl()
+#define WFE() wfe()
+#define SEV() do { __asm volatile ("dsb ish" ::: "memory"); sev(); } while(0)
+#if __ARM_ARCH == 8 && __ARM_64BIT_STATE == 1
+#define LDXR128(addr, mo) lld((addr), (mo))
+#endif
+#define LDXR64(addr, mo) ll64((addr), (mo))
+#define LDXR32(addr, mo) ll32((addr), (mo))
+#define LDXR16(addr, mo) ll16((addr), (mo))
+#define LDXR8(addr, mo) ll8((addr), (mo))
+#define LDXR(addr, mo) ll((addr), (mo))
+// When using WFE we should not stall the pipeline by other means
+#define DOZE() (void)0
+#else
+#define SEVL() (void)0
+#define WFE() 1
+#define SEV() (void)0
+#define LDXR128(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR64(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR32(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR16(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR8(addr, mo) __atomic_load_n((addr), (mo))
+#define LDXR(addr, mo) __atomic_load_n((addr), (mo))
+#define DOZE() doze()
+#endif
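The ll*/sc* pairs above implement load-exclusive/store-exclusive: the store returns 0 only if nothing intervened since the matching load. A minimal sketch (not part of the patch) of the usual retry loop built on them, a 64-bit compare-and-swap with a fallback for builds where USE_LLSC is not defined; the cas64 name is illustrative:

    /* Sketch only. */
    #include <stdbool.h>
    #include <stdint.h>
    #include "llsc.h"

    static inline bool cas64(uint64_t *loc, uint64_t expected, uint64_t desired)
    {
    #ifdef USE_LLSC
        uint64_t old;
        do {
            old = ll64(loc, __ATOMIC_ACQUIRE);           /* load-exclusive, acquire */
            if (old != expected)
                return false;                            /* value mismatch; a real implementation might CLREX here */
        } while (sc64(loc, desired, __ATOMIC_RELEASE));  /* store-exclusive: 0 = success, 1 = retry */
        return true;
    #else
        return __atomic_compare_exchange_n(loc, &expected, desired, false,
                                           __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
    #endif
    }
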
diff --git a/ext/sms/clh_spinlock.h b/ext/sms/clh_spinlock.h
new file mode 100644
index 0000000..8cf334b
--- /dev/null
+++ b/ext/sms/clh_spinlock.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2017 ARM Limited. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice, this
+ * list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * Neither the name of ARM Limited nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Arm Shared Memory Synchronization Benchmark (SMS)
+ * commit: 85a4b2456f1c84e2235a527d8b2b69be99621e94
+ * August 6 2018
+ *
+ * Description:
+ * The CLH (Craig, Landin and Hagersten) spinlock is a queue-based spinlock in
+ * which each node spins on the previous node's wait status. The CLH spinlock
+ * is starvation-free and grants the lock in FCFS (first come, first served)
+ * order. Because each thread spins on a node created by another thread, CLH
+ * performance may be worse than that of the MCS spinlock, which spins only on
+ * local memory. In practice this should not be a problem, because modern
+ * server architectures implement ccNUMA (cache coherent non-uniform memory
+ * architecture), which coherently caches remote memory in a local cache line.
+ * The remote memory may not be updated at all, and the changed status is
+ * transferred implicitly to the spinning core by the interconnect cache
+ * coherence protocol. The CLH data structure is an implicit linked list; the
+ * global lock (global_clh_lock) only contains a cache-line-aligned tail
+ * pointer and an initial dummy clh_node. The main disadvantages of the CLH
+ * spinlock compared to the MCS spinlock are: 1) it is slower than MCS on
+ * cacheless NUMA, and 2) it is hard to extend to wait-free back-off /
+ * time-out / abortable / hierarchical spinlocks.
+ *
+ * Changes compared to the official CLH spinlock:
+ * The official CLH spinlock reuses the previously released queue node. Here,
+ * per-thread pointers track each thread's current node, which is itself a
+ * per-thread structure. As a result a thread may spin on another thread's
+ * queue node, and the ccNUMA coherence protocol caches the remote memory in
+ * the local cache. Overall performance should therefore be similar to the MCS
+ * spinlock.
+ *
+ * Internals:
+ * The only LSE instruction used is SWPAL, which exchanges the current node
+ * with the lock tail. A tunable parameter -w can be used to disable WFE. All
+ * variables are cache-line aligned. Each thread has its own queue node,
+ * allocated from a shared node pool (see the notes at clh_nodepool below).
+ * The new clh_thread_local_init() function initializes each thread's queue
+ * node. clh_lock() and clh_unlock() strictly follow the original CLH
+ * algorithm. The global uint64_t lock pointer passed in by the harness is not
+ * used.
+ *
+ * Workings:
+ * clh_spinlock works similarly to osq_lock and queued_spinlock.
+ *
+ * Tuning Parameters:
+ *
+ * An optional without_wfe flag disables the WFE instruction and uses empty
+ * spin loops instead.
+ *
+ * [-- [-w]]: disable sevl and wfe
+ *
+ */
+
+#pragma once
+
+#include "llsc.h"
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifdef initialize_lock
+#undef initialize_lock
+#endif
+
+#ifdef parse_test_args
+#undef parse_test_args
+#endif
+
+#ifdef thread_local_init
+#undef thread_local_init
+#endif
+
+#define initialize_lock(lock, threads) clh_lock_init(lock, threads)
+#define parse_test_args(args, argc, argv) clh_parse_args(args, argc, argv)
+#define thread_local_init(smtid) clh_thread_local_init(smtid)
+
+
+struct clh_node
+{
+    struct clh_node *prev;
+    unsigned long wait;
+} __attribute__ ((aligned (CACHE_LINE)));
+
+struct clh_node_pointer
+{
+    struct clh_node *ptr;
+} __attribute__ ((aligned (CACHE_LINE)));
+
+struct clh_lock
+{
+    struct clh_node node;
+    unsigned long num_cores;
+    struct clh_node *tail __attribute__ ((aligned(CACHE_LINE)));
+};
+
+static bool without_wfe;
+static struct clh_lock global_clh_lock; // clh lock queue
+
+/*
+ * We cannot use __thread thread-local storage because some threads may be
+ * joined early while their node is still referenced by other threads, which
+ * would cause a memory access violation. Instead we allocate from the main
+ * thread's heap and share plain C arrays. Two arrays are used: one is a
+ * pointer array with a fixed slot per thread; the other is a node pool whose
+ * nodes are initially assigned to threads by thread id. Following the CLH
+ * algorithm, the current node then reuses its predecessor as the next
+ * available node, and the fixed pointer array is updated to reflect this.
+ * That is, each thread retrieves its next available node from the pointer
+ * array at its thread-id offset, but that pointer may refer to any node in
+ * the CLH node pool.
+ */
+static struct clh_node_pointer *clh_nodeptr; // clh node pointer array
+static struct clh_node *clh_nodepool; // clh node struct array
+
+/* additional parameter to enable WFE (default) or disable WFE */
+static void clh_parse_args(test_args unused, int argc, char** argv) {
+    int i = 0;
+#if defined(__aarch64__)
+    without_wfe = false;
+#else
+    /* only aarch64 supports WFE */
+    without_wfe = true;
+#endif
+
+    /* extended options retrieved after the '--' operator */
+    while ((i = getopt(argc, argv, "w")) != -1)
+    {
+        switch (i) {
+          case 'w':
+            without_wfe = true;
+            break;
+
+          default:
+            fprintf(stderr,
+                    "clh_spinlock additional options after --:\n"
+                    "\t[-h print this msg]\n"
+                    "\t[-w without_wfe, aarch64 default is false, non-aarch64 default is true]\n");
+            exit(2);
+        }
+    }
+}
+
+static inline void clh_lock_init(uint64_t *u64_lock, unsigned long num_cores)
+{
+    /* the default (dummy) tail node's wait status must be 0 */
+    global_clh_lock.node.prev = NULL;
+    global_clh_lock.node.wait = 0;
+    global_clh_lock.num_cores = num_cores;
+    global_clh_lock.tail = &global_clh_lock.node;
+
+    /* save the clh_lock pointer to the harness's global uint64_t */
+    *u64_lock = (uint64_t)&global_clh_lock;
+
+    /* calloc initializes all memory to zero automatically */
+    clh_nodeptr = calloc(num_cores, sizeof(struct clh_node_pointer));
+    if (clh_nodeptr == NULL) exit(errno);
+    clh_nodepool = calloc(num_cores, sizeof(struct clh_node));
+    if (clh_nodepool == NULL) exit(errno);
+
+#ifdef DDEBUG
+    printf("CLH: global_clh_lock=%llx\n", (long long unsigned int) &global_clh_lock);
+#endif
+}
+
+static inline void clh_thread_local_init(unsigned long smtid)
+{
+    /* initialize this thread's slot in the clh node pointer array */
+    clh_nodepool[smtid].wait = 1;
+    clh_nodeptr[smtid].ptr = &clh_nodepool[smtid];
+}
+
+static inline void clh_lock(struct clh_lock *lock, struct clh_node *node, bool use_wfe, unsigned long tid)
+{
+    /* wait must be set to 1 first, otherwise the node queued after the new tail will not spin */
+    node->wait = 1;
+    struct clh_node *prev = node->prev = __atomic_exchange_n(&lock->tail, node, __ATOMIC_ACQ_REL);
+#ifdef DDEBUG
+    printf("T%lu LOCK: prev<-node: %llx<-%llx\n", tid, (long long unsigned int)prev, (long long unsigned int)node);
+#endif
+
+    /* CLH spinlock: spin on the previous node's wait status */
+    if (use_wfe)
+    {
+        if (__atomic_load_n(&prev->wait, __ATOMIC_ACQUIRE))
+        {
+            SEVL();
+            while (WFE() && LDXR(&prev->wait, __ATOMIC_ACQUIRE))
+            {
+                DOZE();
+            }
+        }
+    }
+    else
+    {
+        while (__atomic_load_n(&prev->wait, __ATOMIC_ACQUIRE))
+        {
+            ;
+        }
+    }
+}
+
+/* after unlock, the caller takes over the previous node for its next clh_lock() */
+static inline void clh_unlock(struct clh_node *node, unsigned long tid)
+{
+#ifdef DDEBUG
+    printf("T%lu UNLOCK: node: %llx\n", tid, (long long unsigned int)node);
+#endif
+    /* CLH spinlock: release the lock by resetting the current node's wait status */
+#ifdef USE_DMB
+    __atomic_thread_fence(__ATOMIC_RELEASE);
+    __atomic_store_n(&node->wait, 0, __ATOMIC_RELAXED);
+#else
+    __atomic_store_n(&node->wait, 0, __ATOMIC_RELEASE);
+#endif
+}
+
+/* standard lockhammer lock_acquire and lock_release interfaces */
+static unsigned long __attribute__((noinline))
+lock_acquire (uint64_t *lock, unsigned long threadnum)
+{
+    clh_lock(&global_clh_lock, clh_nodeptr[threadnum].ptr, !without_wfe, threadnum);
+    return 1;
+}
+
+static inline void lock_release (uint64_t *lock, unsigned long threadnum)
+{
+    /*
+     * prev has to be saved first: once clh_unlock() has been called,
+     * node->prev might be overwritten by another thread, causing two threads
+     * to use the same nodepool clh_node and producing a circular linked list
+     * after another round of lock acquisition.
+     */
+    struct clh_node* prev = clh_nodeptr[threadnum].ptr->prev;
+    clh_unlock(clh_nodeptr[threadnum].ptr, threadnum);
+    clh_nodeptr[threadnum].ptr = prev;
+}
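To make the node-recycling scheme concrete, here is a single-threaded sketch (not part of the patch) that drives the entry points above and shows a thread's clh_nodeptr slot ending up on the node inherited from its predecessor (initially the dummy node inside global_clh_lock). It assumes the harness header that defines test_args (lockhammer.h) is available on the include path, as it is when lockhammer.c includes this file:

    /* Sketch only. */
    #include <stdio.h>
    #include <stdint.h>
    #include "lockhammer.h"      /* assumed to provide test_args, as in the harness build */
    #include "clh_spinlock.h"

    int main(void)
    {
        uint64_t lock_word;
        initialize_lock(&lock_word, 1);   /* clh_lock_init(): dummy tail node + pools for one thread */
        thread_local_init(0);             /* thread 0 claims clh_nodepool[0] via clh_nodeptr[0] */

        struct clh_node *before = clh_nodeptr[0].ptr;
        lock_acquire(&lock_word, 0);      /* swap our node into the tail, spin on the dummy's wait flag (already 0) */
        lock_release(&lock_word, 0);      /* clear our wait flag and adopt the predecessor (the dummy) for reuse */
        struct clh_node *after = clh_nodeptr[0].ptr;

        printf("node before=%p after=%p (predecessor recycled)\n", (void *)before, (void *)after);
        return 0;
    }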