Evaluate yet another memcpy #21520

Merged
merged 26 commits on Mar 14, 2021

26 commits
50c62c3
Add memcpy implementation from @jart
alexey-milovidov Mar 2, 2021
d1b3258
Try without AVX
alexey-milovidov Mar 2, 2021
aecdadd
Add missing files
alexey-milovidov Mar 2, 2021
680b0d1
Merge branch 'master' into jart-memcpy
alexey-milovidov Mar 3, 2021
cc28374
Remove prefetch
alexey-milovidov Mar 8, 2021
2c86bc4
Addition to prev. revision
alexey-milovidov Mar 8, 2021
940ce58
Add memcpy-bench tool
alexey-milovidov Mar 8, 2021
da32a61
Merge branch 'jart-memcpy' into replace-memcpy
alexey-milovidov Mar 8, 2021
f0342ed
Add experimental memcpy implementation
alexey-milovidov Mar 8, 2021
3fb5b24
Evaluate another memcpy
alexey-milovidov Mar 8, 2021
e02de23
Remove useless files
alexey-milovidov Mar 8, 2021
7664a3d
Fix MSan
alexey-milovidov Mar 8, 2021
5f65d46
Fix Arcadia
alexey-milovidov Mar 8, 2021
39dbc11
Fix style check
alexey-milovidov Mar 9, 2021
af1a08e
Merge branch 'master' into replace-memcpy
alexey-milovidov Mar 10, 2021
8619665
Remove AVX
alexey-milovidov Mar 10, 2021
e8919c4
Merge branch 'master' into replace-memcpy
alexey-milovidov Mar 13, 2021
6d91881
Remove currently unused code
alexey-milovidov Mar 13, 2021
a446612
Remove currently unused code
alexey-milovidov Mar 13, 2021
37cc2fe
Use custom memcpy only for AArch64
alexey-milovidov Mar 13, 2021
36a4122
Add comments and documentation
alexey-milovidov Mar 13, 2021
b4cae2c
Add TODO
alexey-milovidov Mar 13, 2021
32e66ba
Add comments and documentation
alexey-milovidov Mar 13, 2021
a1ebd52
Add comments and documentation
alexey-milovidov Mar 13, 2021
bab9246
Add comments and documentation
alexey-milovidov Mar 13, 2021
1606c7e
Fix typo
alexey-milovidov Mar 14, 2021
3 changes: 1 addition & 2 deletions CMakeLists.txt
@@ -155,7 +155,6 @@ option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests"

if (OS_LINUX AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND CMAKE_VERSION VERSION_GREATER "3.9.0")
# Only for Linux, x86_64.
# Implies ${ENABLE_FASTMEMCPY}
option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON)
elseif(GLIBC_COMPATIBILITY)
message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration")
@@ -536,7 +535,7 @@ macro (add_executable target)
# explicitly acquire and interpose malloc symbols by clickhouse_malloc
# if GLIBC_COMPATIBILITY is ON and ENABLE_THINLTO is on, then provide the memcpy symbol explicitly to neutralize thinlto's libcall generation.
if (GLIBC_COMPATIBILITY AND ENABLE_THINLTO)
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:clickhouse_memcpy>)
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:memcpy>)
else ()
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc>)
endif ()
1 change: 0 additions & 1 deletion base/common/CMakeLists.txt
@@ -74,7 +74,6 @@ target_link_libraries (common
${CITYHASH_LIBRARIES}
boost::headers_only
boost::system
FastMemcpy
Poco::Net
Poco::Net::SSL
Poco::Util
2 changes: 1 addition & 1 deletion base/common/tests/CMakeLists.txt
@@ -11,7 +11,7 @@ set(PLATFORM_LIBS ${CMAKE_DL_LIBS})
target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (local_date_time_comparison PRIVATE common)
target_link_libraries (local_date_time_comparison PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (realloc-perf PRIVATE common)
add_check(local_date_time_comparison)

17 changes: 7 additions & 10 deletions base/glibc-compatibility/CMakeLists.txt
@@ -1,5 +1,8 @@
if (GLIBC_COMPATIBILITY)
set (ENABLE_FASTMEMCPY ON)
add_subdirectory(memcpy)
if(TARGET memcpy)
set(MEMCPY_LIBRARY memcpy)
endif()

enable_language(ASM)
include(CheckIncludeFile)
@@ -27,13 +30,6 @@ if (GLIBC_COMPATIBILITY)
list(APPEND glibc_compatibility_sources musl/getentropy.c)
endif()

if (NOT ARCH_ARM)
# clickhouse_memcpy don't support ARCH_ARM, see https://github.com/ClickHouse/ClickHouse/issues/18951
add_library (clickhouse_memcpy OBJECT
${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.c
)
endif()

# Need to omit frame pointers to match the performance of glibc
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer")

@@ -51,15 +47,16 @@ if (GLIBC_COMPATIBILITY)
target_compile_options(glibc-compatibility PRIVATE -fPIC)
endif ()

target_link_libraries(global-libs INTERFACE glibc-compatibility)
target_link_libraries(global-libs INTERFACE glibc-compatibility ${MEMCPY_LIBRARY})

install(
TARGETS glibc-compatibility
TARGETS glibc-compatibility ${MEMCPY_LIBRARY}
EXPORT global
ARCHIVE DESTINATION lib
)

message (STATUS "Some symbols from glibc will be replaced for compatibility")

elseif (YANDEX_OFFICIAL_BUILD)
message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.")
endif ()
8 changes: 8 additions & 0 deletions base/glibc-compatibility/memcpy/CMakeLists.txt
@@ -0,0 +1,8 @@
if (ARCH_AMD64)
add_library(memcpy STATIC memcpy.cpp)

# We allow including memcpy.h from user code for better inlining.
target_include_directories(memcpy PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)

target_compile_options(memcpy PRIVATE -fno-builtin-memcpy)
endif ()
6 changes: 6 additions & 0 deletions base/glibc-compatibility/memcpy/memcpy.cpp
@@ -0,0 +1,6 @@
#include "memcpy.h"

extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size)
{
return inline_memcpy(dst, src, size);
}
217 changes: 217 additions & 0 deletions base/glibc-compatibility/memcpy/memcpy.h
@@ -0,0 +1,217 @@
#include <cstddef>

#include <emmintrin.h>


/** Custom memcpy implementation for ClickHouse.
* It has the following benefits over using glibc's implementation:
* 1. Avoiding a dependency on a specific version of a glibc symbol, like memcpy@@GLIBC_2.14, for portability.
* 2. Avoiding an indirect call via the PLT due to shared linking, which can be less efficient.
* 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis.
* 4. Better results on our performance tests on current CPUs: up to 25% on some queries and 0.7%..1% on average across all queries.
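*
* For example, benefit 3 can look like this (an illustrative sketch; copy_row is a hypothetical caller):
*
*     #include "memcpy.h"
*
*     void copy_row(char * dst, const char * src, size_t n)
*     {
*         inline_memcpy(dst, src, n); /// can be fully inlined; no call through the PLT
*     }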
*
* Writing our own memcpy is extremely difficult for the following reasons:
* 1. The optimal variant depends on the specific CPU model.
* 2. The optimal variant depends on the distribution of size arguments.
* 3. It depends on the number of threads copying data concurrently.
* 4. It also depends on how the calling code uses the copied data and how the different memcpy calls relate to each other.
* The vast range of scenarios makes proper testing especially difficult.
* When writing our own memcpy there is a risk of overoptimizing it
* for non-representative microbenchmarks while actually making real-world use cases worse.
*
* Most of the benchmarks for memcpy on the internet are wrong.
*
* Let's look at the details:
*
* For small sizes, the order of branches in the code is important.
* There are variants with a specific order of branches (like here or in glibc),
* with a jump table (in asm code; see the example from Cosmopolitan libc:
* https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61),
* or with Duff's device in C (see https://github.com/skywind3000/FastMemcpy/).
*
* It's also important how uneven sizes are copied.
* Almost every implementation, including this one, uses two overlapping movs.
*
* It is important to disable -ftree-loop-distribute-patterns when compiling a memcpy implementation;
* otherwise the compiler can replace internal loops with a call to memcpy, which will lead to infinite recursion.
*
* For larger sizes it's important to choose the instructions used:
* - SSE, AVX or AVX-512;
* - rep movsb.
* Performance will depend on the size threshold, on the CPU model, and on the "erms" flag
* ("Enhanced Rep MovS" - it indicates that the performance of "rep movsb" is decent for large sizes)
* https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
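*
* As an illustrative sketch (not part of this implementation), the "erms" flag can be
* queried with CPUID: leaf 7, subleaf 0, bit 9 of EBX:
*
*     #include <cpuid.h>
*
*     static bool have_erms()
*     {
*         unsigned int eax, ebx, ecx, edx;
*         return __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1 << 9));
*     }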
*
* Using AVX-512 can be bad due to throttling.
* Using AVX can be bad if most code is using SSE, due to the switching penalty
* (it also depends on the usage of the "vzeroupper" instruction).
* But in some cases AVX gives a win.
*
* It also depends on how many times the loop is unrolled.
* We are unrolling the loop 8 times (by the number of available registers), but it is not always the best choice.
*
* It also depends on the usage of aligned or unaligned loads/stores.
* We are using unaligned loads and aligned stores.
*
* It also depends on the usage of prefetch instructions. They make sense on some Intel CPUs but can slow down performance on AMD.
* Setting up the correct offset for prefetching is non-obvious.
*
* Non-temporal (cache-bypassing) stores can be used for very large sizes (more than half of the L3 cache).
* But the exact threshold is unclear - when doing memcpy from multiple threads, the optimal threshold can be lower,
* because the L3 cache is shared (and the L2 cache is partially shared).
*
* A very large memcpy size typically indicates suboptimal (not cache-friendly) algorithms in the code or unrealistic scenarios,
* so we don't pay attention to using non-temporal stores.
*
* On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial option,
* even compared to non-temporal aligned unrolled stores with the widest registers.
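*
* For illustration, "rep movsb" itself can be expressed with inline asm
* (a sketch, not what this implementation uses):
*
*     static inline void rep_movsb(void * dst, const void * src, size_t size)
*     {
*         asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
*     }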
*
* memcpy can be written in asm, C or C++. The latter can also use inline asm.
* An asm implementation can be better for making sure that the compiler won't make the code worse,
* and for ensuring the order of branches, the code layout, and the usage of all required registers.
* But if it is located in a separate translation unit, inlining will not be possible
* (inline asm can be used to overcome this limitation).
* Sometimes C or C++ code can be further optimized by the compiler.
* For example, clang is capable of replacing SSE intrinsics with AVX code if -mavx is used.
*
* Please note that the compiler can replace plain code with a call to memcpy, and vice versa:
* - a memcpy with a compile-time known small size is replaced with simple instructions without a call to memcpy;
* this is controlled by -fbuiltin-memcpy and can be ensured manually by calling __builtin_memcpy.
* It is often used to implement unaligned load/store without undefined behaviour in C++ (see the sketch after this list);
* - a byte-copying loop can be recognized and replaced by a call to memcpy;
* this is controlled by -ftree-loop-distribute-patterns;
* - also note that a byte-copying loop can be unrolled, peeled and vectorized, which will give you
* inline code somewhat similar to a decent implementation of memcpy.
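*
* For example, an unaligned 64-bit load can be written like this (an illustrative sketch):
*
*     #include <cstdint>
*
*     inline uint64_t unaligned_load_u64(const void * address)
*     {
*         uint64_t res;
*         __builtin_memcpy(&res, address, sizeof(res)); /// compiles to a single mov, not a call
*         return res;
*     }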
*
* This description is up to date as of Mar 2021.
*
* How to test the memcpy implementation for performance:
* 1. Test on a real production workload.
* 2. For synthetic tests, see utils/memcpy-bench, but make sure you do your best to cover the wide range of scenarios.
*
* TODO: Add a self-tuning memcpy with a Bayesian bandits algorithm for large sizes.
* See https://habr.com/en/company/yandex/blog/457612/
*/


static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
{
/// We will use pointer arithmetic, so char pointers will be used.
/// Note that __restrict makes sense (otherwise the compiler will reload data from memory
/// instead of using register values, due to possible aliasing).
char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);

/// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
/// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
/// for inlining and removing this single instruction.
void * ret = dst;

tail:
/// Small sizes and tails after the loop for large sizes.
/// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
/// This order of branches is from the disassembly of glibc's code.
/// We copy chunks of possibly uneven size with two overlapping movs.
/// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
if (size <= 16)
{
if (size >= 8)
{
/// Chunks of 8..16 bytes.
__builtin_memcpy(dst + size - 8, src + size - 8, 8);
__builtin_memcpy(dst, src, 8);
}
else if (size >= 4)
{
/// Chunks of 4..7 bytes.
__builtin_memcpy(dst + size - 4, src + size - 4, 4);
__builtin_memcpy(dst, src, 4);
}
else if (size >= 2)
{
/// Chunks of 2..3 bytes.
__builtin_memcpy(dst + size - 2, src + size - 2, 2);
__builtin_memcpy(dst, src, 2);
}
else if (size >= 1)
{
/// A single byte.
*dst = *src;
}
/// No bytes remaining.
}
else
{
/// Medium and large sizes.
if (size <= 128)
{
/// Medium size, not enough for full loop unrolling.

/// We will copy the last 16 bytes.
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));

/// Then we will copy every 16 bytes from the beginning in a loop.
/// The last loop iteration will possibly overwrite some part of the already copied last 16 bytes.
/// This is Ok, similar to the code for small sizes above.
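/// Example: for size = 20, the first store copies bytes [4..19]; the loop then copies [0..15], rewriting [4..15] with the same values.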
while (size > 16)
{
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
dst += 16;
src += 16;
size -= 16;
}
}
else
{
/// Large size with fully unrolled loop.

/// Align destination to 16 bytes boundary.
size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
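/// Example: if dst & 15 == 7, padding = (16 - 7) & 15 = 9, and dst + 9 is 16-byte aligned;
/// if dst is already aligned, padding = (16 - 0) & 15 = 0 and the block below is skipped.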

/// If not aligned - we will copy the first 16 bytes with an unaligned load and store.
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}

/// Aligned unrolled copy. We will use all available SSE registers.
/// In general, it's not possible to have both src and dst aligned (their misalignments may differ).
/// So, we will use aligned stores and unaligned loads.
__m128i c0, c1, c2, c3, c4, c5, c6, c7;

while (size >= 128)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
src += 128;
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
dst += 128;

size -= 128;
}

/// The remaining 0..127 bytes will be processed as usual (by jumping back to the tail handling).
goto tail;
}
}

return ret;
}
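
A minimal correctness check for inline_memcpy (an illustrative sketch, not part of this PR) can compare it against the standard memcpy across all size classes, including uneven tails:

#include "memcpy.h" /// the header added by this PR
#include <cstring>
#include <cassert>
#include <vector>

int main()
{
    std::vector<char> src(4096), dst(4096), ref(4096);
    for (size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<char>(i);

    /// Sizes covering the small, medium and unrolled-loop branches, with uneven tails.
    for (size_t size : {0, 1, 3, 7, 8, 15, 16, 17, 100, 128, 129, 1000, 4096})
    {
        inline_memcpy(dst.data(), src.data(), size);
        std::memcpy(ref.data(), src.data(), size);
        assert(std::memcmp(dst.data(), ref.data(), size) == 0);
    }
}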

1 change: 0 additions & 1 deletion contrib/CMakeLists.txt
@@ -38,7 +38,6 @@ add_subdirectory (boost-cmake)
add_subdirectory (cctz-cmake)
add_subdirectory (consistent-hashing)
add_subdirectory (dragonbox-cmake)
add_subdirectory (FastMemcpy)
add_subdirectory (hyperscan-cmake)
add_subdirectory (jemalloc-cmake)
add_subdirectory (libcpuid-cmake)
28 changes: 0 additions & 28 deletions contrib/FastMemcpy/CMakeLists.txt

This file was deleted.