Evaluate yet another memcpy #21520

Merged
merged 26 commits on Mar 14, 2021

26 commits
50c62c3
Add memcpy implementation from @jart
alexey-milovidov Mar 2, 2021
d1b3258
Try without AVX
alexey-milovidov Mar 2, 2021
aecdadd
Add missing files
alexey-milovidov Mar 2, 2021
680b0d1
Merge branch 'master' into jart-memcpy
alexey-milovidov Mar 3, 2021
cc28374
Remove prefetch
alexey-milovidov Mar 8, 2021
2c86bc4
Addition to prev. revision
alexey-milovidov Mar 8, 2021
940ce58
Add memcpy-bench tool
alexey-milovidov Mar 8, 2021
da32a61
Merge branch 'jart-memcpy' into replace-memcpy
alexey-milovidov Mar 8, 2021
f0342ed
Add experimental memcpy implementation
alexey-milovidov Mar 8, 2021
3fb5b24
Evaluate another memcpy
alexey-milovidov Mar 8, 2021
e02de23
Remove useless files
alexey-milovidov Mar 8, 2021
7664a3d
Fix MSan
alexey-milovidov Mar 8, 2021
5f65d46
Fix Arcadia
alexey-milovidov Mar 8, 2021
39dbc11
Fix style check
alexey-milovidov Mar 9, 2021
af1a08e
Merge branch 'master' into replace-memcpy
alexey-milovidov Mar 10, 2021
8619665
Remove AVX
alexey-milovidov Mar 10, 2021
e8919c4
Merge branch 'master' into replace-memcpy
alexey-milovidov Mar 13, 2021
6d91881
Remove currently unused code
alexey-milovidov Mar 13, 2021
a446612
Remove currently unused code
alexey-milovidov Mar 13, 2021
37cc2fe
Use custom memcpy only for AArch64
alexey-milovidov Mar 13, 2021
36a4122
Add comments and documentation
alexey-milovidov Mar 13, 2021
b4cae2c
Add TODO
alexey-milovidov Mar 13, 2021
32e66ba
Add comments and documentation
alexey-milovidov Mar 13, 2021
a1ebd52
Add comments and documentation
alexey-milovidov Mar 13, 2021
bab9246
Add comments and documentation
alexey-milovidov Mar 13, 2021
1606c7e
Fix typo
alexey-milovidov Mar 14, 2021
3 changes: 1 addition & 2 deletions CMakeLists.txt
@@ -155,7 +155,6 @@ option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests"

if (OS_LINUX AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND CMAKE_VERSION VERSION_GREATER "3.9.0")
# Only for Linux, x86_64.
# Implies ${ENABLE_FASTMEMCPY}
option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON)
elseif(GLIBC_COMPATIBILITY)
message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration")
@@ -536,7 +535,7 @@ macro (add_executable target)
# explicitly acquire and interpose malloc symbols by clickhouse_malloc
# if GLIBC_COMPATIBILITY is ON and ENABLE_THINLTO is on, then provide the memcpy symbol explicitly to neutralize thinlto's libcall generation.
if (GLIBC_COMPATIBILITY AND ENABLE_THINLTO)
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:clickhouse_memcpy>)
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc> $<TARGET_OBJECTS:memcpy>)
else ()
_add_executable (${ARGV} $<TARGET_OBJECTS:clickhouse_malloc>)
endif ()
1 change: 0 additions & 1 deletion base/common/CMakeLists.txt
@@ -74,7 +74,6 @@ target_link_libraries (common
${CITYHASH_LIBRARIES}
boost::headers_only
boost::system
FastMemcpy
Poco::Net
Poco::Net::SSL
Poco::Util
2 changes: 1 addition & 1 deletion base/common/tests/CMakeLists.txt
@@ -11,7 +11,7 @@ set(PLATFORM_LIBS ${CMAKE_DL_LIBS})
target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (local_date_time_comparison PRIVATE common)
target_link_libraries (local_date_time_comparison PRIVATE common ${PLATFORM_LIBS})
target_link_libraries (realloc-perf PRIVATE common)
add_check(local_date_time_comparison)

17 changes: 7 additions & 10 deletions base/glibc-compatibility/CMakeLists.txt
@@ -1,5 +1,8 @@
if (GLIBC_COMPATIBILITY)
set (ENABLE_FASTMEMCPY ON)
add_subdirectory(memcpy)
if(TARGET memcpy)
set(MEMCPY_LIBRARY memcpy)
endif()

enable_language(ASM)
include(CheckIncludeFile)
@@ -27,13 +30,6 @@ if (GLIBC_COMPATIBILITY)
list(APPEND glibc_compatibility_sources musl/getentropy.c)
endif()

if (NOT ARCH_ARM)
# clickhouse_memcpy don't support ARCH_ARM, see https://github.com/ClickHouse/ClickHouse/issues/18951
add_library (clickhouse_memcpy OBJECT
${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.c
)
endif()

# Need to omit frame pointers to match the performance of glibc
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer")

@@ -51,15 +47,16 @@ if (GLIBC_COMPATIBILITY)
target_compile_options(glibc-compatibility PRIVATE -fPIC)
endif ()

target_link_libraries(global-libs INTERFACE glibc-compatibility)
target_link_libraries(global-libs INTERFACE glibc-compatibility ${MEMCPY_LIBRARY})

install(
TARGETS glibc-compatibility
TARGETS glibc-compatibility ${MEMCPY_LIBRARY}
EXPORT global
ARCHIVE DESTINATION lib
)

message (STATUS "Some symbols from glibc will be replaced for compatibility")

elseif (YANDEX_OFFICIAL_BUILD)
message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.")
endif ()
8 changes: 8 additions & 0 deletions base/glibc-compatibility/memcpy/CMakeLists.txt
@@ -0,0 +1,8 @@
if (ARCH_AMD64)
add_library(memcpy STATIC memcpy.cpp)

# We allow including memcpy.h from user code for better inlining.
target_include_directories(memcpy PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)

target_compile_options(memcpy PRIVATE -fno-builtin-memcpy)
endif ()
6 changes: 6 additions & 0 deletions base/glibc-compatibility/memcpy/memcpy.cpp
@@ -0,0 +1,6 @@
#include "memcpy.h"

extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size)
{
return inline_memcpy(dst, src, size);
}
217 changes: 217 additions & 0 deletions base/glibc-compatibility/memcpy/memcpy.h
@@ -0,0 +1,217 @@
#include <cstddef>

#include <emmintrin.h>


/** Custom memcpy implementation for ClickHouse.
* It has the following benefits over using glibc's implementation:
* 1. Avoiding a dependency on a specific version of a glibc symbol, like memcpy@@GLIBC_2.14, for portability.
* 2. Avoiding an indirect call via the PLT due to shared linking, which can be less efficient.
* 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis.
* 4. Better results on our performance tests on current CPUs: up to 25% on some queries and 0.7%..1% on average across all queries.
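*
* For example, benefit 3 can look like this (an illustrative sketch; copy_row is a hypothetical caller):
*
*     #include "memcpy.h"
*
*     void copy_row(char * dst, const char * src, size_t n)
*     {
*         inline_memcpy(dst, src, n); /// can be fully inlined; no call through the PLT
*     }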
*
* Writing our own memcpy is extremely difficult for the following reasons:
* 1. The optimal variant depends on the specific CPU model.
* 2. The optimal variant depends on the distribution of size arguments.
* 3. It depends on the number of threads copying data concurrently.
* 4. It also depends on how the calling code uses the copied data and how the different memcpy calls relate to each other.
* The vast range of scenarios makes proper testing especially difficult.
* When writing our own memcpy there is a risk of overoptimizing it
* for non-representative microbenchmarks while actually making real-world use cases worse.
*
* Most of the benchmarks for memcpy on the internet are wrong.
*
* Let's look at the details:
*
* For small sizes, the order of branches in the code is important.
* There are variants with a specific order of branches (like here or in glibc),
* with a jump table (in asm code; see the example from Cosmopolitan libc:
* https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61),
* or with Duff's device in C (see https://github.com/skywind3000/FastMemcpy/).
*
* It's also important how uneven sizes are copied.
* Almost every implementation, including this one, uses two overlapping movs.
*
* It is important to disable -ftree-loop-distribute-patterns when compiling a memcpy implementation;
* otherwise the compiler can replace internal loops with a call to memcpy, which will lead to infinite recursion.
*
* For larger sizes it's important to choose the instructions used:
* - SSE, AVX or AVX-512;
* - rep movsb.
* Performance will depend on the size threshold, on the CPU model, and on the "erms" flag
* ("Enhanced Rep MovS" - it indicates that the performance of "rep movsb" is decent for large sizes)
* https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
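*
* As an illustrative sketch (not part of this implementation), the "erms" flag can be
* queried with CPUID: leaf 7, subleaf 0, bit 9 of EBX:
*
*     #include <cpuid.h>
*
*     static bool have_erms()
*     {
*         unsigned int eax, ebx, ecx, edx;
*         return __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1 << 9));
*     }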
*
* Using AVX-512 can be bad due to throttling.
* Using AVX can be bad if most code is using SSE, due to the switching penalty
* (it also depends on the usage of the "vzeroupper" instruction).
* But in some cases AVX gives a win.
*
* It also depends on how many times the loop is unrolled.
* We are unrolling the loop 8 times (by the number of available registers), but it is not always the best choice.
*
* It also depends on the usage of aligned or unaligned loads/stores.
* We are using unaligned loads and aligned stores.
*
* It also depends on the usage of prefetch instructions. They make sense on some Intel CPUs but can slow down performance on AMD.
* Setting up the correct offset for prefetching is non-obvious.
*
* Non-temporal (cache-bypassing) stores can be used for very large sizes (more than half of the L3 cache).
* But the exact threshold is unclear - when doing memcpy from multiple threads, the optimal threshold can be lower,
* because the L3 cache is shared (and the L2 cache is partially shared).
*
* A very large memcpy size typically indicates suboptimal (not cache-friendly) algorithms in the code or unrealistic scenarios,
* so we don't pay attention to using non-temporal stores.
*
* On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial option,
* even compared to non-temporal aligned unrolled stores with the widest registers.
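*
* For illustration, "rep movsb" itself can be expressed with inline asm
* (a sketch, not what this implementation uses):
*
*     static inline void rep_movsb(void * dst, const void * src, size_t size)
*     {
*         asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(size) : : "memory");
*     }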
*
* memcpy can be written in asm, C or C++. The latter can also use inline asm.
* An asm implementation can be better for making sure that the compiler won't make the code worse,
* and for ensuring the order of branches, the code layout, and the usage of all required registers.
* But if it is located in a separate translation unit, inlining will not be possible
* (inline asm can be used to overcome this limitation).
* Sometimes C or C++ code can be further optimized by the compiler.
* For example, clang is capable of replacing SSE intrinsics with AVX code if -mavx is used.
*
* Please note that the compiler can replace plain code with a call to memcpy, and vice versa:
* - a memcpy with a compile-time known small size is replaced with simple instructions without a call to memcpy;
* this is controlled by -fbuiltin-memcpy and can be ensured manually by calling __builtin_memcpy.
* It is often used to implement unaligned load/store without undefined behaviour in C++ (see the sketch after this list);
* - a byte-copying loop can be recognized and replaced by a call to memcpy;
* this is controlled by -ftree-loop-distribute-patterns;
* - also note that a byte-copying loop can be unrolled, peeled and vectorized, which will give you
* inline code somewhat similar to a decent implementation of memcpy.
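*
* For example, an unaligned 64-bit load can be written like this (an illustrative sketch):
*
*     #include <cstdint>
*
*     inline uint64_t unaligned_load_u64(const void * address)
*     {
*         uint64_t res;
*         __builtin_memcpy(&res, address, sizeof(res)); /// compiles to a single mov, not a call
*         return res;
*     }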
*
* This description is up to date as of Mar 2021.
*
* How to test the memcpy implementation for performance:
* 1. Test on a real production workload.
* 2. For synthetic tests, see utils/memcpy-bench, but make sure you do your best to cover the wide range of scenarios.
*
* TODO: Add a self-tuning memcpy with a Bayesian bandits algorithm for large sizes.
* See https://habr.com/en/company/yandex/blog/457612/
*/


static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
{
/// We will use pointer arithmetic, so char pointers will be used.
/// Note that __restrict makes sense (otherwise the compiler will reload data from memory
/// instead of using register values, due to possible aliasing).
char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);

/// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
/// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
/// for inlining and removing this single instruction.
void * ret = dst;

tail:
/// Small sizes and tails after the loop for large sizes.
/// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
/// This order of branches is from the disassembly of glibc's code.
/// We copy chunks of possibly uneven size with two overlapping movs.
/// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
if (size <= 16)
{
if (size >= 8)
{
/// Chunks of 8..16 bytes.
__builtin_memcpy(dst + size - 8, src + size - 8, 8);
__builtin_memcpy(dst, src, 8);
}
else if (size >= 4)
{
/// Chunks of 4..7 bytes.
__builtin_memcpy(dst + size - 4, src + size - 4, 4);
__builtin_memcpy(dst, src, 4);
}
else if (size >= 2)
{
/// Chunks of 2..3 bytes.
__builtin_memcpy(dst + size - 2, src + size - 2, 2);
__builtin_memcpy(dst, src, 2);
}
else if (size >= 1)
{
/// A single byte.
*dst = *src;
}
/// No bytes remaining.
}
else
{
/// Medium and large sizes.
if (size <= 128)
{
/// Medium size, not enough for full loop unrolling.

/// We will copy the last 16 bytes.
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));

/// Then we will copy every 16 bytes from the beginning in a loop.
/// The last loop iteration will possibly overwrite some part of the already copied last 16 bytes.
/// This is Ok, similar to the code for small sizes above.
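/// Example: for size = 20, the first store copies bytes [4..19]; the loop then copies [0..15], rewriting [4..15] with the same values.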
while (size > 16)
{
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
dst += 16;
src += 16;
size -= 16;
}
}
else
{
/// Large size with fully unrolled loop.

/// Align destination to 16 bytes boundary.
size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;
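/// Example: if dst & 15 == 7, padding = (16 - 7) & 15 = 9, and dst + 9 is 16-byte aligned;
/// if dst is already aligned, padding = (16 - 0) & 15 = 0 and the block below is skipped.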

/// If not aligned - we will copy the first 16 bytes with an unaligned load and store.
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}

/// Aligned unrolled copy. We will use all available SSE registers.
/// In general, it's not possible to have both src and dst aligned (their misalignments may differ).
/// So, we will use aligned stores and unaligned loads.
__m128i c0, c1, c2, c3, c4, c5, c6, c7;

while (size >= 128)
{
c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
src += 128;
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
_mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
dst += 128;

size -= 128;
}

/// The remaining 0..127 bytes will be processed as usual (by jumping back to the tail handling).
goto tail;
}
}

return ret;
}
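
A minimal correctness check for inline_memcpy (an illustrative sketch, not part of this PR) can compare it against the standard memcpy across all size classes, including uneven tails:

#include "memcpy.h" /// the header added by this PR
#include <cstring>
#include <cassert>
#include <vector>

int main()
{
    std::vector<char> src(4096), dst(4096), ref(4096);
    for (size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<char>(i);

    /// Sizes covering the small, medium and unrolled-loop branches, with uneven tails.
    for (size_t size : {0, 1, 3, 7, 8, 15, 16, 17, 100, 128, 129, 1000, 4096})
    {
        inline_memcpy(dst.data(), src.data(), size);
        std::memcpy(ref.data(), src.data(), size);
        assert(std::memcmp(dst.data(), ref.data(), size) == 0);
    }
}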

1 change: 0 additions & 1 deletion contrib/CMakeLists.txt
@@ -38,7 +38,6 @@ add_subdirectory (boost-cmake)
add_subdirectory (cctz-cmake)
add_subdirectory (consistent-hashing)
add_subdirectory (dragonbox-cmake)
add_subdirectory (FastMemcpy)
add_subdirectory (hyperscan-cmake)
add_subdirectory (jemalloc-cmake)
add_subdirectory (libcpuid-cmake)
28 changes: 0 additions & 28 deletions contrib/FastMemcpy/CMakeLists.txt

This file was deleted.