From 50c62c3b4cafdfbbbd965f1455f09d196cbcbb74 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 2 Mar 2021 22:09:37 +0300 Subject: [PATCH 01/22] Add memcpy implementation from @jart --- CMakeLists.txt | 3 +- base/common/CMakeLists.txt | 1 - base/common/tests/CMakeLists.txt | 2 +- base/glibc-compatibility/CMakeLists.txt | 17 +- .../glibc-compatibility/memcpy/CMakeLists.txt | 5 + base/glibc-compatibility/memcpy/memcpy.S | 151 ++++ base/glibc-compatibility/memcpy/memcpy.h | 28 + contrib/CMakeLists.txt | 1 - contrib/FastMemcpy/CMakeLists.txt | 28 - contrib/FastMemcpy/FastMemcpy.c | 220 ------ contrib/FastMemcpy/FastMemcpy.h | 694 ------------------ contrib/FastMemcpy/FastMemcpy_Avx.c | 171 ----- contrib/FastMemcpy/FastMemcpy_Avx.h | 492 ------------- contrib/FastMemcpy/LICENSE | 22 - contrib/FastMemcpy/README.md | 20 - contrib/FastMemcpy/memcpy_wrapper.c | 6 - 16 files changed, 193 insertions(+), 1668 deletions(-) create mode 100644 base/glibc-compatibility/memcpy/CMakeLists.txt create mode 100644 base/glibc-compatibility/memcpy/memcpy.S create mode 100644 base/glibc-compatibility/memcpy/memcpy.h delete mode 100644 contrib/FastMemcpy/CMakeLists.txt delete mode 100644 contrib/FastMemcpy/FastMemcpy.c delete mode 100644 contrib/FastMemcpy/FastMemcpy.h delete mode 100644 contrib/FastMemcpy/FastMemcpy_Avx.c delete mode 100644 contrib/FastMemcpy/FastMemcpy_Avx.h delete mode 100644 contrib/FastMemcpy/LICENSE delete mode 100644 contrib/FastMemcpy/README.md delete mode 100644 contrib/FastMemcpy/memcpy_wrapper.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 9002f1df140c..e622cbdebe6d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,7 +155,6 @@ option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests" if (OS_LINUX AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND CMAKE_VERSION VERSION_GREATER "3.9.0") # Only for Linux, x86_64. - # Implies ${ENABLE_FASTMEMCPY} option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON) elseif(GLIBC_COMPATIBILITY) message (${RECONFIGURE_MESSAGE_LEVEL} "Glibc compatibility cannot be enabled in current configuration") @@ -536,7 +535,7 @@ macro (add_executable target) # explicitly acquire and interpose malloc symbols by clickhouse_malloc # if GLIBC_COMPATIBILITY is ON and ENABLE_THINLTO is on than provide memcpy symbol explicitly to neutrialize thinlto's libcall generation. if (GLIBC_COMPATIBILITY AND ENABLE_THINLTO) - _add_executable (${ARGV} $ $) + _add_executable (${ARGV} $ $) else () _add_executable (${ARGV} $) endif () diff --git a/base/common/CMakeLists.txt b/base/common/CMakeLists.txt index cea52b443ddc..b4bf4f55466a 100644 --- a/base/common/CMakeLists.txt +++ b/base/common/CMakeLists.txt @@ -74,7 +74,6 @@ target_link_libraries (common ${CITYHASH_LIBRARIES} boost::headers_only boost::system - FastMemcpy Poco::Net Poco::Net::SSL Poco::Util diff --git a/base/common/tests/CMakeLists.txt b/base/common/tests/CMakeLists.txt index b7082ee9900c..6775d443fb6a 100644 --- a/base/common/tests/CMakeLists.txt +++ b/base/common/tests/CMakeLists.txt @@ -11,7 +11,7 @@ set(PLATFORM_LIBS ${CMAKE_DL_LIBS}) target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS}) target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS}) target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (local_date_time_comparison PRIVATE common) +target_link_libraries (local_date_time_comparison PRIVATE common ${PLATFORM_LIBS}) target_link_libraries (realloc-perf PRIVATE common) add_check(local_date_time_comparison) diff --git a/base/glibc-compatibility/CMakeLists.txt b/base/glibc-compatibility/CMakeLists.txt index 684c6162941d..1fc537ded244 100644 --- a/base/glibc-compatibility/CMakeLists.txt +++ b/base/glibc-compatibility/CMakeLists.txt @@ -1,5 +1,5 @@ if (GLIBC_COMPATIBILITY) - set (ENABLE_FASTMEMCPY ON) + add_subdirectory(memcpy) enable_language(ASM) include(CheckIncludeFile) @@ -27,13 +27,6 @@ if (GLIBC_COMPATIBILITY) list(APPEND glibc_compatibility_sources musl/getentropy.c) endif() - if (NOT ARCH_ARM) - # clickhouse_memcpy don't support ARCH_ARM, see https://github.com/ClickHouse/ClickHouse/issues/18951 - add_library (clickhouse_memcpy OBJECT - ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.c - ) - endif() - # Need to omit frame pointers to match the performance of glibc set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer") @@ -51,15 +44,19 @@ if (GLIBC_COMPATIBILITY) target_compile_options(glibc-compatibility PRIVATE -fPIC) endif () - target_link_libraries(global-libs INTERFACE glibc-compatibility) + target_link_libraries(global-libs INTERFACE glibc-compatibility memcpy) install( - TARGETS glibc-compatibility + TARGETS glibc-compatibility memcpy EXPORT global ARCHIVE DESTINATION lib ) message (STATUS "Some symbols from glibc will be replaced for compatibility") + + if (ENABLE_TESTS) + add_subdirectory (tests) + endif () elseif (YANDEX_OFFICIAL_BUILD) message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.") endif () diff --git a/base/glibc-compatibility/memcpy/CMakeLists.txt b/base/glibc-compatibility/memcpy/CMakeLists.txt new file mode 100644 index 000000000000..b51ebab1d50d --- /dev/null +++ b/base/glibc-compatibility/memcpy/CMakeLists.txt @@ -0,0 +1,5 @@ +enable_language(ASM) +add_library(memcpy STATIC memcpy.S) + +# We allow to include memcpy.h from user code for better inlining (less number of registers are clobbered?). +target_include_directories(memcpy PUBLIC $) diff --git a/base/glibc-compatibility/memcpy/memcpy.S b/base/glibc-compatibility/memcpy/memcpy.S new file mode 100644 index 000000000000..c1439c3c8594 --- /dev/null +++ b/base/glibc-compatibility/memcpy/memcpy.S @@ -0,0 +1,151 @@ +/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ +│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2020 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ + +// Copies memory. +// +// DEST and SRC must not overlap, unless DEST≤SRC. +// +// @param rdi is dest +// @param rsi is src +// @param rdx is number of bytes +// @return original rdi copied to rax +// @mode long +// @asyncsignalsafe +memcpy: mov %rdi,%rax +// 𝑠𝑙𝑖𝑑𝑒 + .align 16 + .type memcpy,@function + .size memcpy,.-memcpy + .globl memcpy + +// Copies memory w/ minimal impact ABI. +// +// @param rdi is dest +// @param rsi is src +// @param rdx is number of bytes +// @clob flags,rcx,xmm3,xmm4 +// @mode long +MemCpy: mov $.Lmemcpytab.size,%ecx + cmp %rcx,%rdx + cmovb %rdx,%rcx + jmp *memcpytab(,%rcx,8) +.Lanchorpoint: +.L32r: cmp $1024,%rdx + jae .Lerms +.L32: vmovdqu -32(%rsi,%rdx),%ymm4 + mov $32,%rcx +0: add $32,%rcx + vmovdqu -64(%rsi,%rcx),%ymm3 + vmovdqu %ymm3,-64(%rdi,%rcx) + cmp %rcx,%rdx + ja 0b + vmovdqu %ymm4,-32(%rdi,%rdx) + vxorps %ymm4,%ymm4,%ymm4 + vxorps %ymm3,%ymm3,%ymm3 + jmp .L0 +.L16r: cmp $1024,%rdx + jae .Lerms +.L16: movdqu -16(%rsi,%rdx),%xmm4 + mov $16,%rcx +0: add $16,%rcx + movdqu -32(%rsi,%rcx),%xmm3 + movdqu %xmm3,-32(%rdi,%rcx) + cmp %rcx,%rdx + ja 0b + movdqu %xmm4,-16(%rdi,%rdx) + pxor %xmm4,%xmm4 + pxor %xmm3,%xmm3 + jmp .L0 +.L8: push %rbx + mov (%rsi),%rcx + mov -8(%rsi,%rdx),%rbx + mov %rcx,(%rdi) + mov %rbx,-8(%rdi,%rdx) +1: pop %rbx +.L0: ret +.L4: push %rbx + mov (%rsi),%ecx + mov -4(%rsi,%rdx),%ebx + mov %ecx,(%rdi) + mov %ebx,-4(%rdi,%rdx) + jmp 1b +.L3: push %rbx + mov (%rsi),%cx + mov -2(%rsi,%rdx),%bx + mov %cx,(%rdi) + mov %bx,-2(%rdi,%rdx) + jmp 1b +.L2: mov (%rsi),%cx + mov %cx,(%rdi) + jmp .L0 +.L1: mov (%rsi),%cl + mov %cl,(%rdi) + jmp .L0 +.Lerms: cmp $1024*1024,%rdx + ja .Lnts + push %rdi + push %rsi + mov %rdx,%rcx + rep movsb + pop %rsi + pop %rdi + jmp .L0 +.Lnts: movdqu (%rsi),%xmm3 + movdqu %xmm3,(%rdi) + lea 16(%rdi),%rcx + and $-16,%rcx + sub %rdi,%rcx + add %rcx,%rdi + add %rcx,%rsi + sub %rcx,%rdx + mov $16,%rcx +0: add $16,%rcx + movdqu -32(%rsi,%rcx),%xmm3 + movntdq %xmm3,-32(%rdi,%rcx) + cmp %rcx,%rdx + ja 0b + sfence + movdqu -16(%rsi,%rdx),%xmm3 + movdqu %xmm3,-16(%rdi,%rdx) + pxor %xmm3,%xmm3 + jmp .L0 + .type MemCpy,@function + .size MemCpy,.-MemCpy + .globl MemCpy + + .section .rodata + .align 8 +memcpytab: + .quad .L0 + .quad .L1 + .quad .L2 + .quad .L3 + .rept 4 + .quad .L4 + .endr + .rept 8 + .quad .L8 + .endr + .rept 16 + .quad .L16 + .endr + .equ .Lmemcpytab.size,(.-memcpytab)/8 + .quad .L32r # AVX + ERMS + .type memcpytab,@object + .previous diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h new file mode 100644 index 000000000000..7e3069d6b797 --- /dev/null +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -0,0 +1,28 @@ +#ifndef MEMCPY_H_ +#define MEMCPY_H_ +#if !(__ASSEMBLER__ + __LINKER__ + 0) + +/// Note: this header can be included only after libc++ headers. +/// Note: you cannot write std::memcpy if you include this header. + +void *memcpy(void *, const void *, size_t); + +#ifdef __GNUC__ +#define __memcpy_isgoodsize(SIZE) \ + (__builtin_constant_p(SIZE) && ((SIZE) <= __BIGGEST_ALIGNMENT__ && \ + __builtin_popcountl((unsigned)(SIZE)) == 1)) +#define memcpy(DEST, SRC, SIZE) \ + (__memcpy_isgoodsize(SIZE) ? __builtin_memcpy(DEST, SRC, SIZE) : ({ \ + void *DeSt = (DEST); \ + const void *SrC = (SRC); \ + size_t SiZe = (SIZE); \ + asm("call\tMemCpy" \ + : "=m"(*(char(*)[SiZe])(DeSt)) \ + : "D"(DeSt), "S"(SrC), "d"(SiZe), "m"(*(const char(*)[SiZe])(SrC)) \ + : "xmm3", "xmm4", "rcx", "cc"); \ + DeSt; \ + })) +#endif + +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* MEMCPY_H_ */ diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 20b4fad0437a..3a635bda83ad 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -37,7 +37,6 @@ add_subdirectory (boost-cmake) add_subdirectory (cctz-cmake) add_subdirectory (consistent-hashing) add_subdirectory (dragonbox-cmake) -add_subdirectory (FastMemcpy) add_subdirectory (hyperscan-cmake) add_subdirectory (jemalloc-cmake) add_subdirectory (libcpuid-cmake) diff --git a/contrib/FastMemcpy/CMakeLists.txt b/contrib/FastMemcpy/CMakeLists.txt deleted file mode 100644 index 8efe6d45dff9..000000000000 --- a/contrib/FastMemcpy/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -option (ENABLE_FASTMEMCPY "Enable FastMemcpy library (only internal)" ${ENABLE_LIBRARIES}) - -if (NOT OS_LINUX OR ARCH_AARCH64) - set (ENABLE_FASTMEMCPY OFF) -endif () - -if (ENABLE_FASTMEMCPY) - set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy) - - set (SRCS - ${LIBRARY_DIR}/FastMemcpy.c - - memcpy_wrapper.c - ) - - add_library (FastMemcpy ${SRCS}) - target_include_directories (FastMemcpy PUBLIC ${LIBRARY_DIR}) - - target_compile_definitions(FastMemcpy PUBLIC USE_FASTMEMCPY=1) - - message (STATUS "Using FastMemcpy") -else () - add_library (FastMemcpy INTERFACE) - - target_compile_definitions(FastMemcpy INTERFACE USE_FASTMEMCPY=0) - - message (STATUS "Not using FastMemcpy") -endif () diff --git a/contrib/FastMemcpy/FastMemcpy.c b/contrib/FastMemcpy/FastMemcpy.c deleted file mode 100644 index 5021bcc7d165..000000000000 --- a/contrib/FastMemcpy/FastMemcpy.c +++ /dev/null @@ -1,220 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9) -// -//===================================================================== -#include -#include -#include -#include - -#if (defined(_WIN32) || defined(WIN32)) -#include -#include -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif -#elif defined(__unix) -#include -#include -#else -#error it can only be compiled under windows or unix -#endif - -#include "FastMemcpy.h" - -unsigned int gettime() -{ - #if (defined(_WIN32) || defined(WIN32)) - return timeGetTime(); - #else - static struct timezone tz={ 0,0 }; - struct timeval time; - gettimeofday(&time,&tz); - return (time.tv_sec * 1000 + time.tv_usec / 1000); - #endif -} - -void sleepms(unsigned int millisec) -{ -#if defined(_WIN32) || defined(WIN32) - Sleep(millisec); -#else - usleep(millisec * 1000); -#endif -} - - -void benchmark(int dstalign, int srcalign, size_t size, int times) -{ - char *DATA1 = (char*)malloc(size + 64); - char *DATA2 = (char*)malloc(size + 64); - size_t LINEAR1 = ((size_t)DATA1); - size_t LINEAR2 = ((size_t)DATA2); - char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); - char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); - char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); - char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); - unsigned int t1, t2; - int k; - - sleepms(100); - t1 = gettime(); - for (k = times; k > 0; k--) { - memcpy(dst, src, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (k = times; k > 0; k--) { - memcpy_fast(dst, src, size); - } - t2 = gettime() - t2; - - free(DATA1); - free(DATA2); - - printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", - dstalign? "aligned" : "unalign", - srcalign? "aligned" : "unalign", (int)t2, (int)t1); -} - - -void bench(int copysize, int times) -{ - printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); - benchmark(1, 1, copysize, times); - benchmark(1, 0, copysize, times); - benchmark(0, 1, copysize, times); - benchmark(0, 0, copysize, times); - printf("\n"); -} - - -void random_bench(int maxsize, int times) -{ - static char A[11 * 1024 * 1024 + 2]; - static char B[11 * 1024 * 1024 + 2]; - static int random_offsets[0x10000]; - static int random_sizes[0x8000]; - unsigned int i, p1, p2; - unsigned int t1, t2; - for (i = 0; i < 0x10000; i++) { // generate random offsets - random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); - } - for (i = 0; i < 0x8000; i++) { // generate random sizes - random_sizes[i] = 1 + rand() % maxsize; - } - sleepms(100); - t1 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy(A + offset1, B + offset2, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy_fast(A + offset1, B + offset2, size); - } - t2 = gettime() - t2; - printf("benchmark random access:\n"); - printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); -} - - -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif - -int main(void) -{ - bench(32, 0x1000000); - bench(64, 0x1000000); - bench(512, 0x800000); - bench(1024, 0x400000); - bench(4096, 0x80000); - bench(8192, 0x40000); - bench(1024 * 1024 * 1, 0x800); - bench(1024 * 1024 * 4, 0x200); - bench(1024 * 1024 * 8, 0x100); - - random_bench(2048, 8000000); - - return 0; -} - - - - -/* -benchmark(size=32 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms -result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms - -benchmark(size=64 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms -result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms -result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms - -benchmark(size=512 bytes, times=8388608): -result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms -result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms -result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms - -benchmark(size=1024 bytes, times=4194304): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms -result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms -result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms - -benchmark(size=4096 bytes, times=524288): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms - -benchmark(size=8192 bytes, times=262144): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms - -benchmark(size=1048576 bytes, times=2048): -result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms -result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms -result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms -result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms - -benchmark(size=4194304 bytes, times=512): -result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms -result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms -result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms -result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms - -benchmark(size=8388608 bytes, times=256): -result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms -result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms -result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms -result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms - -benchmark random access: -memcpy_fast=515ms memcpy=1014ms - -*/ - - - - diff --git a/contrib/FastMemcpy/FastMemcpy.h b/contrib/FastMemcpy/FastMemcpy.h deleted file mode 100644 index 5dcbfcf16566..000000000000 --- a/contrib/FastMemcpy/FastMemcpy.h +++ /dev/null @@ -1,694 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - -typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t; -typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t; -typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -} - -static INLINE void memcpy_sse2_32(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); -} - -static INLINE void memcpy_sse2_64(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); -} - -static INLINE void memcpy_sse2_128(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); - __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); - __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); - __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); - _mm_storeu_si128(((__m128i*)dst) + 4, m4); - _mm_storeu_si128(((__m128i*)dst) + 5, m5); - _mm_storeu_si128(((__m128i*)dst) + 6, m6); - _mm_storeu_si128(((__m128i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -/// Attribute is used to avoid an error with undefined behaviour sanitizer -/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer -/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. -__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 64: - memcpy_sse2_64(dd - 64, ss - 64); - case 0: - break; - - case 65: - memcpy_sse2_64(dd - 65, ss - 65); - case 1: - dd[-1] = ss[-1]; - break; - - case 66: - memcpy_sse2_64(dd - 66, ss - 66); - case 2: - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 67: - memcpy_sse2_64(dd - 67, ss - 67); - case 3: - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 68: - memcpy_sse2_64(dd - 68, ss - 68); - case 4: - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 69: - memcpy_sse2_64(dd - 69, ss - 69); - case 5: - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 70: - memcpy_sse2_64(dd - 70, ss - 70); - case 6: - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 71: - memcpy_sse2_64(dd - 71, ss - 71); - case 7: - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 72: - memcpy_sse2_64(dd - 72, ss - 72); - case 8: - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 73: - memcpy_sse2_64(dd - 73, ss - 73); - case 9: - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 74: - memcpy_sse2_64(dd - 74, ss - 74); - case 10: - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 75: - memcpy_sse2_64(dd - 75, ss - 75); - case 11: - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 76: - memcpy_sse2_64(dd - 76, ss - 76); - case 12: - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 77: - memcpy_sse2_64(dd - 77, ss - 77); - case 13: - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 78: - memcpy_sse2_64(dd - 78, ss - 78); - case 14: - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 79: - memcpy_sse2_64(dd - 79, ss - 79); - case 15: - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 80: - memcpy_sse2_64(dd - 80, ss - 80); - case 16: - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 81: - memcpy_sse2_64(dd - 81, ss - 81); - case 17: - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 82: - memcpy_sse2_64(dd - 82, ss - 82); - case 18: - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 83: - memcpy_sse2_64(dd - 83, ss - 83); - case 19: - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 84: - memcpy_sse2_64(dd - 84, ss - 84); - case 20: - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 85: - memcpy_sse2_64(dd - 85, ss - 85); - case 21: - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 86: - memcpy_sse2_64(dd - 86, ss - 86); - case 22: - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 87: - memcpy_sse2_64(dd - 87, ss - 87); - case 23: - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 88: - memcpy_sse2_64(dd - 88, ss - 88); - case 24: - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 89: - memcpy_sse2_64(dd - 89, ss - 89); - case 25: - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 90: - memcpy_sse2_64(dd - 90, ss - 90); - case 26: - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 91: - memcpy_sse2_64(dd - 91, ss - 91); - case 27: - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 92: - memcpy_sse2_64(dd - 92, ss - 92); - case 28: - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 93: - memcpy_sse2_64(dd - 93, ss - 93); - case 29: - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 94: - memcpy_sse2_64(dd - 94, ss - 94); - case 30: - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 95: - memcpy_sse2_64(dd - 95, ss - 95); - case 31: - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 96: - memcpy_sse2_64(dd - 96, ss - 96); - case 32: - memcpy_sse2_32(dd - 32, ss - 32); - break; - - case 97: - memcpy_sse2_64(dd - 97, ss - 97); - case 33: - memcpy_sse2_32(dd - 33, ss - 33); - dd[-1] = ss[-1]; - break; - - case 98: - memcpy_sse2_64(dd - 98, ss - 98); - case 34: - memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 99: - memcpy_sse2_64(dd - 99, ss - 99); - case 35: - memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 100: - memcpy_sse2_64(dd - 100, ss - 100); - case 36: - memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 101: - memcpy_sse2_64(dd - 101, ss - 101); - case 37: - memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 102: - memcpy_sse2_64(dd - 102, ss - 102); - case 38: - memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 103: - memcpy_sse2_64(dd - 103, ss - 103); - case 39: - memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 104: - memcpy_sse2_64(dd - 104, ss - 104); - case 40: - memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 105: - memcpy_sse2_64(dd - 105, ss - 105); - case 41: - memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 106: - memcpy_sse2_64(dd - 106, ss - 106); - case 42: - memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 107: - memcpy_sse2_64(dd - 107, ss - 107); - case 43: - memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 108: - memcpy_sse2_64(dd - 108, ss - 108); - case 44: - memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 109: - memcpy_sse2_64(dd - 109, ss - 109); - case 45: - memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 110: - memcpy_sse2_64(dd - 110, ss - 110); - case 46: - memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 111: - memcpy_sse2_64(dd - 111, ss - 111); - case 47: - memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 112: - memcpy_sse2_64(dd - 112, ss - 112); - case 48: - memcpy_sse2_32(dd - 48, ss - 48); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 113: - memcpy_sse2_64(dd - 113, ss - 113); - case 49: - memcpy_sse2_32(dd - 49, ss - 49); - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 114: - memcpy_sse2_64(dd - 114, ss - 114); - case 50: - memcpy_sse2_32(dd - 50, ss - 50); - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 115: - memcpy_sse2_64(dd - 115, ss - 115); - case 51: - memcpy_sse2_32(dd - 51, ss - 51); - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 116: - memcpy_sse2_64(dd - 116, ss - 116); - case 52: - memcpy_sse2_32(dd - 52, ss - 52); - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 117: - memcpy_sse2_64(dd - 117, ss - 117); - case 53: - memcpy_sse2_32(dd - 53, ss - 53); - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 118: - memcpy_sse2_64(dd - 118, ss - 118); - case 54: - memcpy_sse2_32(dd - 54, ss - 54); - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 119: - memcpy_sse2_64(dd - 119, ss - 119); - case 55: - memcpy_sse2_32(dd - 55, ss - 55); - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 120: - memcpy_sse2_64(dd - 120, ss - 120); - case 56: - memcpy_sse2_32(dd - 56, ss - 56); - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 121: - memcpy_sse2_64(dd - 121, ss - 121); - case 57: - memcpy_sse2_32(dd - 57, ss - 57); - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 122: - memcpy_sse2_64(dd - 122, ss - 122); - case 58: - memcpy_sse2_32(dd - 58, ss - 58); - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 123: - memcpy_sse2_64(dd - 123, ss - 123); - case 59: - memcpy_sse2_32(dd - 59, ss - 59); - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 124: - memcpy_sse2_64(dd - 124, ss - 124); - case 60: - memcpy_sse2_32(dd - 60, ss - 60); - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 125: - memcpy_sse2_64(dd - 125, ss - 125); - case 61: - memcpy_sse2_32(dd - 61, ss - 61); - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 126: - memcpy_sse2_64(dd - 126, ss - 126); - case 62: - memcpy_sse2_32(dd - 62, ss - 62); - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 127: - memcpy_sse2_64(dd - 127, ss - 127); - case 63: - memcpy_sse2_32(dd - 63, ss - 63); - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 128: - memcpy_sse2_128(dd - 128, ss - 128); - break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size - size_t padding; - - // small memory copy - if (size <= 128) { - return memcpy_tiny(dst, src, size); - } - - // align destination to 16 bytes boundary - padding = (16 - (((size_t)dst) & 15)) & 15; - - if (padding > 0) { - __m128i head = _mm_loadu_si128((const __m128i*)src); - _mm_storeu_si128((__m128i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } - - // medium size copy - if (size <= cachesize) { - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_store_si128((((__m128i*)dst) + 0), c0); - _mm_store_si128((((__m128i*)dst) + 1), c1); - _mm_store_si128((((__m128i*)dst) + 2), c2); - _mm_store_si128((((__m128i*)dst) + 3), c3); - _mm_store_si128((((__m128i*)dst) + 4), c4); - _mm_store_si128((((__m128i*)dst) + 5), c5); - _mm_store_si128((((__m128i*)dst) + 6), c6); - _mm_store_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // big memory copy - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { - c0 = _mm_load_si128(((const __m128i*)src) + 0); - c1 = _mm_load_si128(((const __m128i*)src) + 1); - c2 = _mm_load_si128(((const __m128i*)src) + 2); - c3 = _mm_load_si128(((const __m128i*)src) + 3); - c4 = _mm_load_si128(((const __m128i*)src) + 4); - c5 = _mm_load_si128(((const __m128i*)src) + 5); - c6 = _mm_load_si128(((const __m128i*)src) + 6); - c7 = _mm_load_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // source unaligned - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - - return destination; -} - - -#endif diff --git a/contrib/FastMemcpy/FastMemcpy_Avx.c b/contrib/FastMemcpy/FastMemcpy_Avx.c deleted file mode 100644 index 6538c6b21260..000000000000 --- a/contrib/FastMemcpy/FastMemcpy_Avx.c +++ /dev/null @@ -1,171 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9) -// -//===================================================================== -#include -#include -#include -#include -#include - -#if (defined(_WIN32) || defined(WIN32)) -#include -#include -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif -#elif defined(__unix) -#include -#include -#else -#error it can only be compiled under windows or unix -#endif - -#include "FastMemcpy_Avx.h" - - -unsigned int gettime() -{ - #if (defined(_WIN32) || defined(WIN32)) - return timeGetTime(); - #else - static struct timezone tz={ 0,0 }; - struct timeval time; - gettimeofday(&time,&tz); - return (time.tv_sec * 1000 + time.tv_usec / 1000); - #endif -} - -void sleepms(unsigned int millisec) -{ -#if defined(_WIN32) || defined(WIN32) - Sleep(millisec); -#else - usleep(millisec * 1000); -#endif -} - - - -void benchmark(int dstalign, int srcalign, size_t size, int times) -{ - char *DATA1 = (char*)malloc(size + 64); - char *DATA2 = (char*)malloc(size + 64); - size_t LINEAR1 = ((size_t)DATA1); - size_t LINEAR2 = ((size_t)DATA2); - char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); - char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); - char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); - char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); - unsigned int t1, t2; - int k; - - sleepms(100); - t1 = gettime(); - for (k = times; k > 0; k--) { - memcpy(dst, src, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (k = times; k > 0; k--) { - memcpy_fast(dst, src, size); - } - t2 = gettime() - t2; - - free(DATA1); - free(DATA2); - - printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", - dstalign? "aligned" : "unalign", - srcalign? "aligned" : "unalign", (int)t2, (int)t1); -} - - -void bench(int copysize, int times) -{ - printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); - benchmark(1, 1, copysize, times); - benchmark(1, 0, copysize, times); - benchmark(0, 1, copysize, times); - benchmark(0, 0, copysize, times); - printf("\n"); -} - - -void random_bench(int maxsize, int times) -{ - static char A[11 * 1024 * 1024 + 2]; - static char B[11 * 1024 * 1024 + 2]; - static int random_offsets[0x10000]; - static int random_sizes[0x8000]; - unsigned int i, p1, p2; - unsigned int t1, t2; - for (i = 0; i < 0x10000; i++) { // generate random offsets - random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); - } - for (i = 0; i < 0x8000; i++) { // generate random sizes - random_sizes[i] = 1 + rand() % maxsize; - } - sleepms(100); - t1 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy(A + offset1, B + offset2, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy_fast(A + offset1, B + offset2, size); - } - t2 = gettime() - t2; - printf("benchmark random access:\n"); - printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); -} - - -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif - -int main(void) -{ -#if 1 - bench(32, 0x1000000); - bench(64, 0x1000000); - bench(512, 0x800000); - bench(1024, 0x400000); -#endif - bench(4096, 0x80000); - bench(8192, 0x40000); -#if 1 - bench(1024 * 1024 * 1, 0x800); - bench(1024 * 1024 * 4, 0x200); -#endif - bench(1024 * 1024 * 8, 0x100); - - random_bench(2048, 8000000); - - return 0; -} - - - - -/* - -*/ - - - - diff --git a/contrib/FastMemcpy/FastMemcpy_Avx.h b/contrib/FastMemcpy/FastMemcpy_Avx.h deleted file mode 100644 index 8ba064b03508..000000000000 --- a/contrib/FastMemcpy/FastMemcpy_Avx.h +++ /dev/null @@ -1,492 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - - - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_avx_16(void *dst, const void *src) { -#if 1 - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -#else - *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0)); - *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8)); -#endif -} - -static INLINE void memcpy_avx_32(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); -} - -static INLINE void memcpy_avx_64(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); -} - -static INLINE void memcpy_avx_128(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); - _mm256_storeu_si256(((__m256i*)dst) + 2, m2); - _mm256_storeu_si256(((__m256i*)dst) + 3, m3); -} - -static INLINE void memcpy_avx_256(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - __m256i m4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - __m256i m5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - __m256i m6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - __m256i m7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); - _mm256_storeu_si256(((__m256i*)dst) + 2, m2); - _mm256_storeu_si256(((__m256i*)dst) + 3, m3); - _mm256_storeu_si256(((__m256i*)dst) + 4, m4); - _mm256_storeu_si256(((__m256i*)dst) + 5, m5); - _mm256_storeu_si256(((__m256i*)dst) + 6, m6); - _mm256_storeu_si256(((__m256i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 128: memcpy_avx_128(dd - 128, ss - 128); - case 0: break; - case 129: memcpy_avx_128(dd - 129, ss - 129); - case 1: dd[-1] = ss[-1]; break; - case 130: memcpy_avx_128(dd - 130, ss - 130); - case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 131: memcpy_avx_128(dd - 131, ss - 131); - case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break; - case 132: memcpy_avx_128(dd - 132, ss - 132); - case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 133: memcpy_avx_128(dd - 133, ss - 133); - case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break; - case 134: memcpy_avx_128(dd - 134, ss - 134); - case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 135: memcpy_avx_128(dd - 135, ss - 135); - case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 136: memcpy_avx_128(dd - 136, ss - 136); - case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 137: memcpy_avx_128(dd - 137, ss - 137); - case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break; - case 138: memcpy_avx_128(dd - 138, ss - 138); - case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 139: memcpy_avx_128(dd - 139, ss - 139); - case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 140: memcpy_avx_128(dd - 140, ss - 140); - case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 141: memcpy_avx_128(dd - 141, ss - 141); - case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 142: memcpy_avx_128(dd - 142, ss - 142); - case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 143: memcpy_avx_128(dd - 143, ss - 143); - case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 144: memcpy_avx_128(dd - 144, ss - 144); - case 16: memcpy_avx_16(dd - 16, ss - 16); break; - case 145: memcpy_avx_128(dd - 145, ss - 145); - case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; - case 146: memcpy_avx_128(dd - 146, ss - 146); - case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 147: memcpy_avx_128(dd - 147, ss - 147); - case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 148: memcpy_avx_128(dd - 148, ss - 148); - case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 149: memcpy_avx_128(dd - 149, ss - 149); - case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 150: memcpy_avx_128(dd - 150, ss - 150); - case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 151: memcpy_avx_128(dd - 151, ss - 151); - case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 152: memcpy_avx_128(dd - 152, ss - 152); - case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 153: memcpy_avx_128(dd - 153, ss - 153); - case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break; - case 154: memcpy_avx_128(dd - 154, ss - 154); - case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break; - case 155: memcpy_avx_128(dd - 155, ss - 155); - case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break; - case 156: memcpy_avx_128(dd - 156, ss - 156); - case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break; - case 157: memcpy_avx_128(dd - 157, ss - 157); - case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break; - case 158: memcpy_avx_128(dd - 158, ss - 158); - case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break; - case 159: memcpy_avx_128(dd - 159, ss - 159); - case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break; - case 160: memcpy_avx_128(dd - 160, ss - 160); - case 32: memcpy_avx_32(dd - 32, ss - 32); break; - case 161: memcpy_avx_128(dd - 161, ss - 161); - case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; - case 162: memcpy_avx_128(dd - 162, ss - 162); - case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 163: memcpy_avx_128(dd - 163, ss - 163); - case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 164: memcpy_avx_128(dd - 164, ss - 164); - case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 165: memcpy_avx_128(dd - 165, ss - 165); - case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 166: memcpy_avx_128(dd - 166, ss - 166); - case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 167: memcpy_avx_128(dd - 167, ss - 167); - case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 168: memcpy_avx_128(dd - 168, ss - 168); - case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 169: memcpy_avx_128(dd - 169, ss - 169); - case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break; - case 170: memcpy_avx_128(dd - 170, ss - 170); - case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break; - case 171: memcpy_avx_128(dd - 171, ss - 171); - case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break; - case 172: memcpy_avx_128(dd - 172, ss - 172); - case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break; - case 173: memcpy_avx_128(dd - 173, ss - 173); - case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break; - case 174: memcpy_avx_128(dd - 174, ss - 174); - case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break; - case 175: memcpy_avx_128(dd - 175, ss - 175); - case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break; - case 176: memcpy_avx_128(dd - 176, ss - 176); - case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break; - case 177: memcpy_avx_128(dd - 177, ss - 177); - case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break; - case 178: memcpy_avx_128(dd - 178, ss - 178); - case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break; - case 179: memcpy_avx_128(dd - 179, ss - 179); - case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break; - case 180: memcpy_avx_128(dd - 180, ss - 180); - case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break; - case 181: memcpy_avx_128(dd - 181, ss - 181); - case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break; - case 182: memcpy_avx_128(dd - 182, ss - 182); - case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break; - case 183: memcpy_avx_128(dd - 183, ss - 183); - case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break; - case 184: memcpy_avx_128(dd - 184, ss - 184); - case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break; - case 185: memcpy_avx_128(dd - 185, ss - 185); - case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break; - case 186: memcpy_avx_128(dd - 186, ss - 186); - case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break; - case 187: memcpy_avx_128(dd - 187, ss - 187); - case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break; - case 188: memcpy_avx_128(dd - 188, ss - 188); - case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break; - case 189: memcpy_avx_128(dd - 189, ss - 189); - case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break; - case 190: memcpy_avx_128(dd - 190, ss - 190); - case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break; - case 191: memcpy_avx_128(dd - 191, ss - 191); - case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break; - case 192: memcpy_avx_128(dd - 192, ss - 192); - case 64: memcpy_avx_64(dd - 64, ss - 64); break; - case 193: memcpy_avx_128(dd - 193, ss - 193); - case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break; - case 194: memcpy_avx_128(dd - 194, ss - 194); - case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 195: memcpy_avx_128(dd - 195, ss - 195); - case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 196: memcpy_avx_128(dd - 196, ss - 196); - case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 197: memcpy_avx_128(dd - 197, ss - 197); - case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 198: memcpy_avx_128(dd - 198, ss - 198); - case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 199: memcpy_avx_128(dd - 199, ss - 199); - case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 200: memcpy_avx_128(dd - 200, ss - 200); - case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 201: memcpy_avx_128(dd - 201, ss - 201); - case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break; - case 202: memcpy_avx_128(dd - 202, ss - 202); - case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break; - case 203: memcpy_avx_128(dd - 203, ss - 203); - case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break; - case 204: memcpy_avx_128(dd - 204, ss - 204); - case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break; - case 205: memcpy_avx_128(dd - 205, ss - 205); - case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break; - case 206: memcpy_avx_128(dd - 206, ss - 206); - case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break; - case 207: memcpy_avx_128(dd - 207, ss - 207); - case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break; - case 208: memcpy_avx_128(dd - 208, ss - 208); - case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break; - case 209: memcpy_avx_128(dd - 209, ss - 209); - case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break; - case 210: memcpy_avx_128(dd - 210, ss - 210); - case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break; - case 211: memcpy_avx_128(dd - 211, ss - 211); - case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break; - case 212: memcpy_avx_128(dd - 212, ss - 212); - case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break; - case 213: memcpy_avx_128(dd - 213, ss - 213); - case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break; - case 214: memcpy_avx_128(dd - 214, ss - 214); - case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break; - case 215: memcpy_avx_128(dd - 215, ss - 215); - case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break; - case 216: memcpy_avx_128(dd - 216, ss - 216); - case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break; - case 217: memcpy_avx_128(dd - 217, ss - 217); - case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break; - case 218: memcpy_avx_128(dd - 218, ss - 218); - case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break; - case 219: memcpy_avx_128(dd - 219, ss - 219); - case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break; - case 220: memcpy_avx_128(dd - 220, ss - 220); - case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break; - case 221: memcpy_avx_128(dd - 221, ss - 221); - case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break; - case 222: memcpy_avx_128(dd - 222, ss - 222); - case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break; - case 223: memcpy_avx_128(dd - 223, ss - 223); - case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break; - case 224: memcpy_avx_128(dd - 224, ss - 224); - case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break; - case 225: memcpy_avx_128(dd - 225, ss - 225); - case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break; - case 226: memcpy_avx_128(dd - 226, ss - 226); - case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break; - case 227: memcpy_avx_128(dd - 227, ss - 227); - case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break; - case 228: memcpy_avx_128(dd - 228, ss - 228); - case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break; - case 229: memcpy_avx_128(dd - 229, ss - 229); - case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break; - case 230: memcpy_avx_128(dd - 230, ss - 230); - case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break; - case 231: memcpy_avx_128(dd - 231, ss - 231); - case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break; - case 232: memcpy_avx_128(dd - 232, ss - 232); - case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break; - case 233: memcpy_avx_128(dd - 233, ss - 233); - case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break; - case 234: memcpy_avx_128(dd - 234, ss - 234); - case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break; - case 235: memcpy_avx_128(dd - 235, ss - 235); - case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break; - case 236: memcpy_avx_128(dd - 236, ss - 236); - case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break; - case 237: memcpy_avx_128(dd - 237, ss - 237); - case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break; - case 238: memcpy_avx_128(dd - 238, ss - 238); - case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break; - case 239: memcpy_avx_128(dd - 239, ss - 239); - case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break; - case 240: memcpy_avx_128(dd - 240, ss - 240); - case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break; - case 241: memcpy_avx_128(dd - 241, ss - 241); - case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break; - case 242: memcpy_avx_128(dd - 242, ss - 242); - case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break; - case 243: memcpy_avx_128(dd - 243, ss - 243); - case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break; - case 244: memcpy_avx_128(dd - 244, ss - 244); - case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break; - case 245: memcpy_avx_128(dd - 245, ss - 245); - case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break; - case 246: memcpy_avx_128(dd - 246, ss - 246); - case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break; - case 247: memcpy_avx_128(dd - 247, ss - 247); - case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break; - case 248: memcpy_avx_128(dd - 248, ss - 248); - case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break; - case 249: memcpy_avx_128(dd - 249, ss - 249); - case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break; - case 250: memcpy_avx_128(dd - 250, ss - 250); - case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break; - case 251: memcpy_avx_128(dd - 251, ss - 251); - case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break; - case 252: memcpy_avx_128(dd - 252, ss - 252); - case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break; - case 253: memcpy_avx_128(dd - 253, ss - 253); - case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break; - case 254: memcpy_avx_128(dd - 254, ss - 254); - case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break; - case 255: memcpy_avx_128(dd - 255, ss - 255); - case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break; - case 256: memcpy_avx_256(dd - 256, ss - 256); break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L3-cache size - size_t padding; - - // small memory copy - if (size <= 256) { - memcpy_tiny(dst, src, size); - _mm256_zeroupper(); - return destination; - } - - // align destination to 16 bytes boundary - padding = (32 - (((size_t)dst) & 31)) & 31; - -#if 0 - if (padding > 0) { - __m256i head = _mm256_loadu_si256((const __m256i*)src); - _mm256_storeu_si256((__m256i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } -#else - __m256i head = _mm256_loadu_si256((const __m256i*)src); - _mm256_storeu_si256((__m256i*)dst, head); - dst += padding; - src += padding; - size -= padding; -#endif - - // medium size copy - if (size <= cachesize) { - __m256i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 256; size -= 256) { - c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_storeu_si256((((__m256i*)dst) + 0), c0); - _mm256_storeu_si256((((__m256i*)dst) + 1), c1); - _mm256_storeu_si256((((__m256i*)dst) + 2), c2); - _mm256_storeu_si256((((__m256i*)dst) + 3), c3); - _mm256_storeu_si256((((__m256i*)dst) + 4), c4); - _mm256_storeu_si256((((__m256i*)dst) + 5), c5); - _mm256_storeu_si256((((__m256i*)dst) + 6), c6); - _mm256_storeu_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - else { // big memory copy - __m256i c0, c1, c2, c3, c4, c5, c6, c7; - /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 31) == 0) { // source aligned - for (; size >= 256; size -= 256) { - c0 = _mm256_load_si256(((const __m256i*)src) + 0); - c1 = _mm256_load_si256(((const __m256i*)src) + 1); - c2 = _mm256_load_si256(((const __m256i*)src) + 2); - c3 = _mm256_load_si256(((const __m256i*)src) + 3); - c4 = _mm256_load_si256(((const __m256i*)src) + 4); - c5 = _mm256_load_si256(((const __m256i*)src) + 5); - c6 = _mm256_load_si256(((const __m256i*)src) + 6); - c7 = _mm256_load_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_stream_si256((((__m256i*)dst) + 0), c0); - _mm256_stream_si256((((__m256i*)dst) + 1), c1); - _mm256_stream_si256((((__m256i*)dst) + 2), c2); - _mm256_stream_si256((((__m256i*)dst) + 3), c3); - _mm256_stream_si256((((__m256i*)dst) + 4), c4); - _mm256_stream_si256((((__m256i*)dst) + 5), c5); - _mm256_stream_si256((((__m256i*)dst) + 6), c6); - _mm256_stream_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - else { // source unaligned - for (; size >= 256; size -= 256) { - c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_stream_si256((((__m256i*)dst) + 0), c0); - _mm256_stream_si256((((__m256i*)dst) + 1), c1); - _mm256_stream_si256((((__m256i*)dst) + 2), c2); - _mm256_stream_si256((((__m256i*)dst) + 3), c3); - _mm256_stream_si256((((__m256i*)dst) + 4), c4); - _mm256_stream_si256((((__m256i*)dst) + 5), c5); - _mm256_stream_si256((((__m256i*)dst) + 6), c6); - _mm256_stream_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - _mm256_zeroupper(); - - return destination; -} - - -#endif - - - diff --git a/contrib/FastMemcpy/LICENSE b/contrib/FastMemcpy/LICENSE deleted file mode 100644 index c449da6aa8ac..000000000000 --- a/contrib/FastMemcpy/LICENSE +++ /dev/null @@ -1,22 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2015 Linwei - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - diff --git a/contrib/FastMemcpy/README.md b/contrib/FastMemcpy/README.md deleted file mode 100644 index e253f6bf5ddf..000000000000 --- a/contrib/FastMemcpy/README.md +++ /dev/null @@ -1,20 +0,0 @@ -Internal implementation of `memcpy` function. - -It has the following advantages over `libc`-supplied implementation: -- it is linked statically, so the function is called directly, not through a `PLT` (procedure lookup table of shared library); -- it is linked statically, so the function can have position-dependent code; -- your binaries will not depend on `glibc`'s memcpy, that forces dependency on specific symbol version like `memcpy@@GLIBC_2.14` and consequently on specific version of `glibc` library; -- you can include `memcpy.h` directly and the function has the chance to be inlined, which is beneficial for small but unknown at compile time sizes of memory regions; -- this version of `memcpy` pretend to be faster (in our benchmarks, the difference is within few percents). - -Currently it uses the implementation from **Linwei** (skywind3000@163.com). -Look at https://www.zhihu.com/question/35172305 for discussion. - -Drawbacks: -- only use SSE 2, doesn't use wider (AVX, AVX 512) vector registers when available; -- no CPU dispatching; doesn't take into account actual cache size. - -Also worth to look at: -- simple implementation from Facebook: https://github.com/facebook/folly/blob/master/folly/memcpy.S -- implementation from Agner Fog: http://www.agner.org/optimize/ -- glibc source code. diff --git a/contrib/FastMemcpy/memcpy_wrapper.c b/contrib/FastMemcpy/memcpy_wrapper.c deleted file mode 100644 index 1f57345980ad..000000000000 --- a/contrib/FastMemcpy/memcpy_wrapper.c +++ /dev/null @@ -1,6 +0,0 @@ -#include "FastMemcpy.h" - -void * memcpy(void * __restrict destination, const void * __restrict source, size_t size) -{ - return memcpy_fast(destination, source, size); -} From d1b3258ae7a01fdaa8daa15733351880d6e5f3d9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 3 Mar 2021 00:16:34 +0300 Subject: [PATCH 02/22] Try without AVX --- base/glibc-compatibility/memcpy/memcpy.S | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/base/glibc-compatibility/memcpy/memcpy.S b/base/glibc-compatibility/memcpy/memcpy.S index c1439c3c8594..502fcc22b08e 100644 --- a/base/glibc-compatibility/memcpy/memcpy.S +++ b/base/glibc-compatibility/memcpy/memcpy.S @@ -46,19 +46,6 @@ MemCpy: mov $.Lmemcpytab.size,%ecx cmovb %rdx,%rcx jmp *memcpytab(,%rcx,8) .Lanchorpoint: -.L32r: cmp $1024,%rdx - jae .Lerms -.L32: vmovdqu -32(%rsi,%rdx),%ymm4 - mov $32,%rcx -0: add $32,%rcx - vmovdqu -64(%rsi,%rcx),%ymm3 - vmovdqu %ymm3,-64(%rdi,%rcx) - cmp %rcx,%rdx - ja 0b - vmovdqu %ymm4,-32(%rdi,%rdx) - vxorps %ymm4,%ymm4,%ymm4 - vxorps %ymm3,%ymm3,%ymm3 - jmp .L0 .L16r: cmp $1024,%rdx jae .Lerms .L16: movdqu -16(%rsi,%rdx),%xmm4 @@ -146,6 +133,6 @@ memcpytab: .quad .L16 .endr .equ .Lmemcpytab.size,(.-memcpytab)/8 - .quad .L32r # AVX + ERMS + .quad .L16r # SSE + ERMS + NTS .type memcpytab,@object .previous From aecdadd02e7921ce5cd06614cd94ca5798b92d60 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 3 Mar 2021 02:33:17 +0300 Subject: [PATCH 03/22] Add missing files --- base/glibc-compatibility/tests/CMakeLists.txt | 2 ++ .../glibc-compatibility/tests/memcpy_test.cpp | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 base/glibc-compatibility/tests/CMakeLists.txt create mode 100644 base/glibc-compatibility/tests/memcpy_test.cpp diff --git a/base/glibc-compatibility/tests/CMakeLists.txt b/base/glibc-compatibility/tests/CMakeLists.txt new file mode 100644 index 000000000000..f2978a86664a --- /dev/null +++ b/base/glibc-compatibility/tests/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(memcpy_test memcpy_test.cpp) +target_link_libraries(memcpy_test common memcpy) diff --git a/base/glibc-compatibility/tests/memcpy_test.cpp b/base/glibc-compatibility/tests/memcpy_test.cpp new file mode 100644 index 000000000000..a1f782380b03 --- /dev/null +++ b/base/glibc-compatibility/tests/memcpy_test.cpp @@ -0,0 +1,26 @@ +#include +#include +#include + +__attribute__((__noinline__)) void memcpy_noinline(void * __restrict dst, const void * __restrict src, size_t size) +{ + memcpy(dst, src, size); +} + + +int main(int, char **) +{ + constexpr size_t buf_size = 100; + char buf[buf_size]{}; + memcpy_noinline(buf, "abc", 3); + + size_t bytes_to_copy = 3; + while (bytes_to_copy * 2 < buf_size) + { + memcpy_noinline(&buf[bytes_to_copy], buf, bytes_to_copy); + bytes_to_copy *= 2; + } + + std::cerr << buf << "\n"; + return 0; +} From cc2837485fecb8ce7f1cb7b31a23c273d25fa3fc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 8 Mar 2021 09:25:02 +0300 Subject: [PATCH 04/22] Remove prefetch --- contrib/FastMemcpy/FastMemcpy.h | 1448 ++++++++++++++------------- contrib/FastMemcpy/FastMemcpy_Avx.h | 972 +++++++++--------- 2 files changed, 1234 insertions(+), 1186 deletions(-) diff --git a/contrib/FastMemcpy/FastMemcpy.h b/contrib/FastMemcpy/FastMemcpy.h index 5dcbfcf16566..046ed3fc6708 100644 --- a/contrib/FastMemcpy/FastMemcpy.h +++ b/contrib/FastMemcpy/FastMemcpy.h @@ -1,694 +1,754 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - -typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t; -typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t; -typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -} - -static INLINE void memcpy_sse2_32(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); -} - -static INLINE void memcpy_sse2_64(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); -} - -static INLINE void memcpy_sse2_128(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); - __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); - __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); - __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); - __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); - __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); - __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); - _mm_storeu_si128(((__m128i*)dst) + 1, m1); - _mm_storeu_si128(((__m128i*)dst) + 2, m2); - _mm_storeu_si128(((__m128i*)dst) + 3, m3); - _mm_storeu_si128(((__m128i*)dst) + 4, m4); - _mm_storeu_si128(((__m128i*)dst) + 5, m5); - _mm_storeu_si128(((__m128i*)dst) + 6, m6); - _mm_storeu_si128(((__m128i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -/// Attribute is used to avoid an error with undefined behaviour sanitizer -/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer -/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. -__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 64: - memcpy_sse2_64(dd - 64, ss - 64); - case 0: - break; - - case 65: - memcpy_sse2_64(dd - 65, ss - 65); - case 1: - dd[-1] = ss[-1]; - break; - - case 66: - memcpy_sse2_64(dd - 66, ss - 66); - case 2: - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 67: - memcpy_sse2_64(dd - 67, ss - 67); - case 3: - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 68: - memcpy_sse2_64(dd - 68, ss - 68); - case 4: - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 69: - memcpy_sse2_64(dd - 69, ss - 69); - case 5: - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 70: - memcpy_sse2_64(dd - 70, ss - 70); - case 6: - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 71: - memcpy_sse2_64(dd - 71, ss - 71); - case 7: - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 72: - memcpy_sse2_64(dd - 72, ss - 72); - case 8: - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 73: - memcpy_sse2_64(dd - 73, ss - 73); - case 9: - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 74: - memcpy_sse2_64(dd - 74, ss - 74); - case 10: - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 75: - memcpy_sse2_64(dd - 75, ss - 75); - case 11: - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 76: - memcpy_sse2_64(dd - 76, ss - 76); - case 12: - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 77: - memcpy_sse2_64(dd - 77, ss - 77); - case 13: - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 78: - memcpy_sse2_64(dd - 78, ss - 78); - case 14: - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 79: - memcpy_sse2_64(dd - 79, ss - 79); - case 15: - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 80: - memcpy_sse2_64(dd - 80, ss - 80); - case 16: - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 81: - memcpy_sse2_64(dd - 81, ss - 81); - case 17: - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 82: - memcpy_sse2_64(dd - 82, ss - 82); - case 18: - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 83: - memcpy_sse2_64(dd - 83, ss - 83); - case 19: - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 84: - memcpy_sse2_64(dd - 84, ss - 84); - case 20: - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 85: - memcpy_sse2_64(dd - 85, ss - 85); - case 21: - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 86: - memcpy_sse2_64(dd - 86, ss - 86); - case 22: - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 87: - memcpy_sse2_64(dd - 87, ss - 87); - case 23: - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 88: - memcpy_sse2_64(dd - 88, ss - 88); - case 24: - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 89: - memcpy_sse2_64(dd - 89, ss - 89); - case 25: - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 90: - memcpy_sse2_64(dd - 90, ss - 90); - case 26: - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 91: - memcpy_sse2_64(dd - 91, ss - 91); - case 27: - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 92: - memcpy_sse2_64(dd - 92, ss - 92); - case 28: - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 93: - memcpy_sse2_64(dd - 93, ss - 93); - case 29: - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 94: - memcpy_sse2_64(dd - 94, ss - 94); - case 30: - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 95: - memcpy_sse2_64(dd - 95, ss - 95); - case 31: - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 96: - memcpy_sse2_64(dd - 96, ss - 96); - case 32: - memcpy_sse2_32(dd - 32, ss - 32); - break; - - case 97: - memcpy_sse2_64(dd - 97, ss - 97); - case 33: - memcpy_sse2_32(dd - 33, ss - 33); - dd[-1] = ss[-1]; - break; - - case 98: - memcpy_sse2_64(dd - 98, ss - 98); - case 34: - memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 99: - memcpy_sse2_64(dd - 99, ss - 99); - case 35: - memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 100: - memcpy_sse2_64(dd - 100, ss - 100); - case 36: - memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 101: - memcpy_sse2_64(dd - 101, ss - 101); - case 37: - memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 102: - memcpy_sse2_64(dd - 102, ss - 102); - case 38: - memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 103: - memcpy_sse2_64(dd - 103, ss - 103); - case 39: - memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 104: - memcpy_sse2_64(dd - 104, ss - 104); - case 40: - memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 105: - memcpy_sse2_64(dd - 105, ss - 105); - case 41: - memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 106: - memcpy_sse2_64(dd - 106, ss - 106); - case 42: - memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 107: - memcpy_sse2_64(dd - 107, ss - 107); - case 43: - memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 108: - memcpy_sse2_64(dd - 108, ss - 108); - case 44: - memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 109: - memcpy_sse2_64(dd - 109, ss - 109); - case 45: - memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 110: - memcpy_sse2_64(dd - 110, ss - 110); - case 46: - memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 111: - memcpy_sse2_64(dd - 111, ss - 111); - case 47: - memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); - break; - - case 112: - memcpy_sse2_64(dd - 112, ss - 112); - case 48: - memcpy_sse2_32(dd - 48, ss - 48); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 113: - memcpy_sse2_64(dd - 113, ss - 113); - case 49: - memcpy_sse2_32(dd - 49, ss - 49); - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 114: - memcpy_sse2_64(dd - 114, ss - 114); - case 50: - memcpy_sse2_32(dd - 50, ss - 50); - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 115: - memcpy_sse2_64(dd - 115, ss - 115); - case 51: - memcpy_sse2_32(dd - 51, ss - 51); - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 116: - memcpy_sse2_64(dd - 116, ss - 116); - case 52: - memcpy_sse2_32(dd - 52, ss - 52); - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 117: - memcpy_sse2_64(dd - 117, ss - 117); - case 53: - memcpy_sse2_32(dd - 53, ss - 53); - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 118: - memcpy_sse2_64(dd - 118, ss - 118); - case 54: - memcpy_sse2_32(dd - 54, ss - 54); - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); - break; - - case 119: - memcpy_sse2_64(dd - 119, ss - 119); - case 55: - memcpy_sse2_32(dd - 55, ss - 55); - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); - break; - - case 120: - memcpy_sse2_64(dd - 120, ss - 120); - case 56: - memcpy_sse2_32(dd - 56, ss - 56); - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 121: - memcpy_sse2_64(dd - 121, ss - 121); - case 57: - memcpy_sse2_32(dd - 57, ss - 57); - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 122: - memcpy_sse2_64(dd - 122, ss - 122); - case 58: - memcpy_sse2_32(dd - 58, ss - 58); - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 123: - memcpy_sse2_64(dd - 123, ss - 123); - case 59: - memcpy_sse2_32(dd - 59, ss - 59); - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 124: - memcpy_sse2_64(dd - 124, ss - 124); - case 60: - memcpy_sse2_32(dd - 60, ss - 60); - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 125: - memcpy_sse2_64(dd - 125, ss - 125); - case 61: - memcpy_sse2_32(dd - 61, ss - 61); - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 126: - memcpy_sse2_64(dd - 126, ss - 126); - case 62: - memcpy_sse2_32(dd - 62, ss - 62); - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 127: - memcpy_sse2_64(dd - 127, ss - 127); - case 63: - memcpy_sse2_32(dd - 63, ss - 63); - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 128: - memcpy_sse2_128(dd - 128, ss - 128); - break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size - size_t padding; - - // small memory copy - if (size <= 128) { - return memcpy_tiny(dst, src, size); - } - - // align destination to 16 bytes boundary - padding = (16 - (((size_t)dst) & 15)) & 15; - - if (padding > 0) { - __m128i head = _mm_loadu_si128((const __m128i*)src); - _mm_storeu_si128((__m128i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } - - // medium size copy - if (size <= cachesize) { - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_store_si128((((__m128i*)dst) + 0), c0); - _mm_store_si128((((__m128i*)dst) + 1), c1); - _mm_store_si128((((__m128i*)dst) + 2), c2); - _mm_store_si128((((__m128i*)dst) + 3), c3); - _mm_store_si128((((__m128i*)dst) + 4), c4); - _mm_store_si128((((__m128i*)dst) + 5), c5); - _mm_store_si128((((__m128i*)dst) + 6), c6); - _mm_store_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // big memory copy - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { - c0 = _mm_load_si128(((const __m128i*)src) + 0); - c1 = _mm_load_si128(((const __m128i*)src) + 1); - c2 = _mm_load_si128(((const __m128i*)src) + 2); - c3 = _mm_load_si128(((const __m128i*)src) + 3); - c4 = _mm_load_si128(((const __m128i*)src) + 4); - c5 = _mm_load_si128(((const __m128i*)src) + 5); - c6 = _mm_load_si128(((const __m128i*)src) + 6); - c7 = _mm_load_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - else { // source unaligned - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128(((const __m128i*)src) + 0); - c1 = _mm_loadu_si128(((const __m128i*)src) + 1); - c2 = _mm_loadu_si128(((const __m128i*)src) + 2); - c3 = _mm_loadu_si128(((const __m128i*)src) + 3); - c4 = _mm_loadu_si128(((const __m128i*)src) + 4); - c5 = _mm_loadu_si128(((const __m128i*)src) + 5); - c6 = _mm_loadu_si128(((const __m128i*)src) + 6); - c7 = _mm_loadu_si128(((const __m128i*)src) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128((((__m128i*)dst) + 0), c0); - _mm_stream_si128((((__m128i*)dst) + 1), c1); - _mm_stream_si128((((__m128i*)dst) + 2), c2); - _mm_stream_si128((((__m128i*)dst) + 3), c3); - _mm_stream_si128((((__m128i*)dst) + 4), c4); - _mm_stream_si128((((__m128i*)dst) + 5), c5); - _mm_stream_si128((((__m128i*)dst) + 6), c6); - _mm_stream_si128((((__m128i*)dst) + 7), c7); - dst += 128; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - - return destination; -} - - -#endif +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== +#pragma once + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + +typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t; +typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t; +typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); +} + +static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); +} + +static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); +} + +static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + __m128i m4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + __m128i m5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + __m128i m6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + __m128i m7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +/// Attribute is used to avoid an error with undefined behaviour sanitizer +/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer +/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. +__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size) { + unsigned char *dd = ((unsigned char*)dst) + size; + const unsigned char *ss = ((const unsigned char*)src) + size; + + switch (size) { + case 64: + memcpy_sse2_64(dd - 64, ss - 64); + [[fallthrough]]; + case 0: + break; + + case 65: + memcpy_sse2_64(dd - 65, ss - 65); + [[fallthrough]]; + case 1: + dd[-1] = ss[-1]; + break; + + case 66: + memcpy_sse2_64(dd - 66, ss - 66); + [[fallthrough]]; + case 2: + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 67: + memcpy_sse2_64(dd - 67, ss - 67); + [[fallthrough]]; + case 3: + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 68: + memcpy_sse2_64(dd - 68, ss - 68); + [[fallthrough]]; + case 4: + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 69: + memcpy_sse2_64(dd - 69, ss - 69); + [[fallthrough]]; + case 5: + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 70: + memcpy_sse2_64(dd - 70, ss - 70); + [[fallthrough]]; + case 6: + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 71: + memcpy_sse2_64(dd - 71, ss - 71); + [[fallthrough]]; + case 7: + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 72: + memcpy_sse2_64(dd - 72, ss - 72); + [[fallthrough]]; + case 8: + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 73: + memcpy_sse2_64(dd - 73, ss - 73); + [[fallthrough]]; + case 9: + *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 74: + memcpy_sse2_64(dd - 74, ss - 74); + [[fallthrough]]; + case 10: + *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 75: + memcpy_sse2_64(dd - 75, ss - 75); + [[fallthrough]]; + case 11: + *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 76: + memcpy_sse2_64(dd - 76, ss - 76); + [[fallthrough]]; + case 12: + *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 77: + memcpy_sse2_64(dd - 77, ss - 77); + [[fallthrough]]; + case 13: + *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 78: + memcpy_sse2_64(dd - 78, ss - 78); + [[fallthrough]]; + case 14: + *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 79: + memcpy_sse2_64(dd - 79, ss - 79); + [[fallthrough]]; + case 15: + *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 80: + memcpy_sse2_64(dd - 80, ss - 80); + [[fallthrough]]; + case 16: + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 81: + memcpy_sse2_64(dd - 81, ss - 81); + [[fallthrough]]; + case 17: + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 82: + memcpy_sse2_64(dd - 82, ss - 82); + [[fallthrough]]; + case 18: + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 83: + memcpy_sse2_64(dd - 83, ss - 83); + [[fallthrough]]; + case 19: + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 84: + memcpy_sse2_64(dd - 84, ss - 84); + [[fallthrough]]; + case 20: + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 85: + memcpy_sse2_64(dd - 85, ss - 85); + [[fallthrough]]; + case 21: + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 86: + memcpy_sse2_64(dd - 86, ss - 86); + [[fallthrough]]; + case 22: + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 87: + memcpy_sse2_64(dd - 87, ss - 87); + [[fallthrough]]; + case 23: + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 88: + memcpy_sse2_64(dd - 88, ss - 88); + [[fallthrough]]; + case 24: + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 89: + memcpy_sse2_64(dd - 89, ss - 89); + [[fallthrough]]; + case 25: + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 90: + memcpy_sse2_64(dd - 90, ss - 90); + [[fallthrough]]; + case 26: + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 91: + memcpy_sse2_64(dd - 91, ss - 91); + [[fallthrough]]; + case 27: + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 92: + memcpy_sse2_64(dd - 92, ss - 92); + [[fallthrough]]; + case 28: + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 93: + memcpy_sse2_64(dd - 93, ss - 93); + [[fallthrough]]; + case 29: + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 94: + memcpy_sse2_64(dd - 94, ss - 94); + [[fallthrough]]; + case 30: + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 95: + memcpy_sse2_64(dd - 95, ss - 95); + [[fallthrough]]; + case 31: + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 96: + memcpy_sse2_64(dd - 96, ss - 96); + [[fallthrough]]; + case 32: + memcpy_sse2_32(dd - 32, ss - 32); + break; + + case 97: + memcpy_sse2_64(dd - 97, ss - 97); + [[fallthrough]]; + case 33: + memcpy_sse2_32(dd - 33, ss - 33); + dd[-1] = ss[-1]; + break; + + case 98: + memcpy_sse2_64(dd - 98, ss - 98); + [[fallthrough]]; + case 34: + memcpy_sse2_32(dd - 34, ss - 34); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 99: + memcpy_sse2_64(dd - 99, ss - 99); + [[fallthrough]]; + case 35: + memcpy_sse2_32(dd - 35, ss - 35); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 100: + memcpy_sse2_64(dd - 100, ss - 100); + [[fallthrough]]; + case 36: + memcpy_sse2_32(dd - 36, ss - 36); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 101: + memcpy_sse2_64(dd - 101, ss - 101); + [[fallthrough]]; + case 37: + memcpy_sse2_32(dd - 37, ss - 37); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 102: + memcpy_sse2_64(dd - 102, ss - 102); + [[fallthrough]]; + case 38: + memcpy_sse2_32(dd - 38, ss - 38); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 103: + memcpy_sse2_64(dd - 103, ss - 103); + [[fallthrough]]; + case 39: + memcpy_sse2_32(dd - 39, ss - 39); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 104: + memcpy_sse2_64(dd - 104, ss - 104); + [[fallthrough]]; + case 40: + memcpy_sse2_32(dd - 40, ss - 40); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 105: + memcpy_sse2_64(dd - 105, ss - 105); + [[fallthrough]]; + case 41: + memcpy_sse2_32(dd - 41, ss - 41); + *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 106: + memcpy_sse2_64(dd - 106, ss - 106); + [[fallthrough]]; + case 42: + memcpy_sse2_32(dd - 42, ss - 42); + *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 107: + memcpy_sse2_64(dd - 107, ss - 107); + [[fallthrough]]; + case 43: + memcpy_sse2_32(dd - 43, ss - 43); + *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 108: + memcpy_sse2_64(dd - 108, ss - 108); + [[fallthrough]]; + case 44: + memcpy_sse2_32(dd - 44, ss - 44); + *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 109: + memcpy_sse2_64(dd - 109, ss - 109); + [[fallthrough]]; + case 45: + memcpy_sse2_32(dd - 45, ss - 45); + *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 110: + memcpy_sse2_64(dd - 110, ss - 110); + [[fallthrough]]; + case 46: + memcpy_sse2_32(dd - 46, ss - 46); + *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 111: + memcpy_sse2_64(dd - 111, ss - 111); + [[fallthrough]]; + case 47: + memcpy_sse2_32(dd - 47, ss - 47); + *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 112: + memcpy_sse2_64(dd - 112, ss - 112); + [[fallthrough]]; + case 48: + memcpy_sse2_32(dd - 48, ss - 48); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 113: + memcpy_sse2_64(dd - 113, ss - 113); + [[fallthrough]]; + case 49: + memcpy_sse2_32(dd - 49, ss - 49); + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 114: + memcpy_sse2_64(dd - 114, ss - 114); + [[fallthrough]]; + case 50: + memcpy_sse2_32(dd - 50, ss - 50); + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 115: + memcpy_sse2_64(dd - 115, ss - 115); + [[fallthrough]]; + case 51: + memcpy_sse2_32(dd - 51, ss - 51); + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 116: + memcpy_sse2_64(dd - 116, ss - 116); + [[fallthrough]]; + case 52: + memcpy_sse2_32(dd - 52, ss - 52); + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 117: + memcpy_sse2_64(dd - 117, ss - 117); + [[fallthrough]]; + case 53: + memcpy_sse2_32(dd - 53, ss - 53); + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 118: + memcpy_sse2_64(dd - 118, ss - 118); + [[fallthrough]]; + case 54: + memcpy_sse2_32(dd - 54, ss - 54); + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 119: + memcpy_sse2_64(dd - 119, ss - 119); + [[fallthrough]]; + case 55: + memcpy_sse2_32(dd - 55, ss - 55); + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 120: + memcpy_sse2_64(dd - 120, ss - 120); + [[fallthrough]]; + case 56: + memcpy_sse2_32(dd - 56, ss - 56); + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 121: + memcpy_sse2_64(dd - 121, ss - 121); + [[fallthrough]]; + case 57: + memcpy_sse2_32(dd - 57, ss - 57); + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 122: + memcpy_sse2_64(dd - 122, ss - 122); + [[fallthrough]]; + case 58: + memcpy_sse2_32(dd - 58, ss - 58); + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 123: + memcpy_sse2_64(dd - 123, ss - 123); + [[fallthrough]]; + case 59: + memcpy_sse2_32(dd - 59, ss - 59); + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 124: + memcpy_sse2_64(dd - 124, ss - 124); + [[fallthrough]]; + case 60: + memcpy_sse2_32(dd - 60, ss - 60); + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 125: + memcpy_sse2_64(dd - 125, ss - 125); + [[fallthrough]]; + case 61: + memcpy_sse2_32(dd - 61, ss - 61); + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 126: + memcpy_sse2_64(dd - 126, ss - 126); + [[fallthrough]]; + case 62: + memcpy_sse2_32(dd - 62, ss - 62); + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 127: + memcpy_sse2_64(dd - 127, ss - 127); + [[fallthrough]]; + case 63: + memcpy_sse2_32(dd - 63, ss - 63); + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 128: + memcpy_sse2_128(dd - 128, ss - 128); + break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +void* memcpy_fast(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = (unsigned char*)destination; + const unsigned char *src = (const unsigned char*)source; + static size_t cachesize = 0x200000; // L2-cache size + size_t padding; + + // small memory copy + if (size <= 128) { + return memcpy_tiny(dst, src, size); + } + + // align destination to 16 bytes boundary + padding = (16 - (((size_t)dst) & 15)) & 15; + + if (padding > 0) { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + if (size <= cachesize) { + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + else { // big memory copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + _mm_prefetch((const char*)(src), _MM_HINT_NTA); + + if ((((size_t)src) & 15) == 0) { // source aligned + for (; size >= 128; size -= 128) { + c0 = _mm_load_si128((reinterpret_cast(src)) + 0); + c1 = _mm_load_si128((reinterpret_cast(src)) + 1); + c2 = _mm_load_si128((reinterpret_cast(src)) + 2); + c3 = _mm_load_si128((reinterpret_cast(src)) + 3); + c4 = _mm_load_si128((reinterpret_cast(src)) + 4); + c5 = _mm_load_si128((reinterpret_cast(src)) + 5); + c6 = _mm_load_si128((reinterpret_cast(src)) + 6); + c7 = _mm_load_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + else { // source unaligned + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + _mm_sfence(); + } + + memcpy_tiny(dst, src, size); + + return destination; +} diff --git a/contrib/FastMemcpy/FastMemcpy_Avx.h b/contrib/FastMemcpy/FastMemcpy_Avx.h index 8ba064b03508..b209442e7c5b 100644 --- a/contrib/FastMemcpy/FastMemcpy_Avx.h +++ b/contrib/FastMemcpy/FastMemcpy_Avx.h @@ -1,492 +1,480 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) -// -//===================================================================== -#ifndef __FAST_MEMCPY_H__ -#define __FAST_MEMCPY_H__ - -#include -#include -#include - - -//--------------------------------------------------------------------- -// force inline for compilers -//--------------------------------------------------------------------- -#ifndef INLINE -#ifdef __GNUC__ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - #define INLINE __inline__ __attribute__((always_inline)) -#else - #define INLINE __inline__ -#endif -#elif defined(_MSC_VER) - #define INLINE __forceinline -#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) - #define INLINE __inline -#else - #define INLINE -#endif -#endif - - - -//--------------------------------------------------------------------- -// fast copy for different sizes -//--------------------------------------------------------------------- -static INLINE void memcpy_avx_16(void *dst, const void *src) { -#if 1 - __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); - _mm_storeu_si128(((__m128i*)dst) + 0, m0); -#else - *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0)); - *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8)); -#endif -} - -static INLINE void memcpy_avx_32(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); -} - -static INLINE void memcpy_avx_64(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); -} - -static INLINE void memcpy_avx_128(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); - _mm256_storeu_si256(((__m256i*)dst) + 2, m2); - _mm256_storeu_si256(((__m256i*)dst) + 3, m3); -} - -static INLINE void memcpy_avx_256(void *dst, const void *src) { - __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - __m256i m4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - __m256i m5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - __m256i m6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - __m256i m7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm256_storeu_si256(((__m256i*)dst) + 0, m0); - _mm256_storeu_si256(((__m256i*)dst) + 1, m1); - _mm256_storeu_si256(((__m256i*)dst) + 2, m2); - _mm256_storeu_si256(((__m256i*)dst) + 3, m3); - _mm256_storeu_si256(((__m256i*)dst) + 4, m4); - _mm256_storeu_si256(((__m256i*)dst) + 5, m5); - _mm256_storeu_si256(((__m256i*)dst) + 6, m6); - _mm256_storeu_si256(((__m256i*)dst) + 7, m7); -} - - -//--------------------------------------------------------------------- -// tiny memory copy with jump table optimized -//--------------------------------------------------------------------- -static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; - - switch (size) { - case 128: memcpy_avx_128(dd - 128, ss - 128); - case 0: break; - case 129: memcpy_avx_128(dd - 129, ss - 129); - case 1: dd[-1] = ss[-1]; break; - case 130: memcpy_avx_128(dd - 130, ss - 130); - case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 131: memcpy_avx_128(dd - 131, ss - 131); - case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break; - case 132: memcpy_avx_128(dd - 132, ss - 132); - case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 133: memcpy_avx_128(dd - 133, ss - 133); - case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break; - case 134: memcpy_avx_128(dd - 134, ss - 134); - case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 135: memcpy_avx_128(dd - 135, ss - 135); - case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 136: memcpy_avx_128(dd - 136, ss - 136); - case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 137: memcpy_avx_128(dd - 137, ss - 137); - case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break; - case 138: memcpy_avx_128(dd - 138, ss - 138); - case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 139: memcpy_avx_128(dd - 139, ss - 139); - case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 140: memcpy_avx_128(dd - 140, ss - 140); - case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 141: memcpy_avx_128(dd - 141, ss - 141); - case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 142: memcpy_avx_128(dd - 142, ss - 142); - case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 143: memcpy_avx_128(dd - 143, ss - 143); - case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 144: memcpy_avx_128(dd - 144, ss - 144); - case 16: memcpy_avx_16(dd - 16, ss - 16); break; - case 145: memcpy_avx_128(dd - 145, ss - 145); - case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; - case 146: memcpy_avx_128(dd - 146, ss - 146); - case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 147: memcpy_avx_128(dd - 147, ss - 147); - case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 148: memcpy_avx_128(dd - 148, ss - 148); - case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 149: memcpy_avx_128(dd - 149, ss - 149); - case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 150: memcpy_avx_128(dd - 150, ss - 150); - case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 151: memcpy_avx_128(dd - 151, ss - 151); - case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 152: memcpy_avx_128(dd - 152, ss - 152); - case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 153: memcpy_avx_128(dd - 153, ss - 153); - case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break; - case 154: memcpy_avx_128(dd - 154, ss - 154); - case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break; - case 155: memcpy_avx_128(dd - 155, ss - 155); - case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break; - case 156: memcpy_avx_128(dd - 156, ss - 156); - case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break; - case 157: memcpy_avx_128(dd - 157, ss - 157); - case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break; - case 158: memcpy_avx_128(dd - 158, ss - 158); - case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break; - case 159: memcpy_avx_128(dd - 159, ss - 159); - case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break; - case 160: memcpy_avx_128(dd - 160, ss - 160); - case 32: memcpy_avx_32(dd - 32, ss - 32); break; - case 161: memcpy_avx_128(dd - 161, ss - 161); - case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; - case 162: memcpy_avx_128(dd - 162, ss - 162); - case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 163: memcpy_avx_128(dd - 163, ss - 163); - case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 164: memcpy_avx_128(dd - 164, ss - 164); - case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 165: memcpy_avx_128(dd - 165, ss - 165); - case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 166: memcpy_avx_128(dd - 166, ss - 166); - case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 167: memcpy_avx_128(dd - 167, ss - 167); - case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 168: memcpy_avx_128(dd - 168, ss - 168); - case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 169: memcpy_avx_128(dd - 169, ss - 169); - case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break; - case 170: memcpy_avx_128(dd - 170, ss - 170); - case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break; - case 171: memcpy_avx_128(dd - 171, ss - 171); - case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break; - case 172: memcpy_avx_128(dd - 172, ss - 172); - case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break; - case 173: memcpy_avx_128(dd - 173, ss - 173); - case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break; - case 174: memcpy_avx_128(dd - 174, ss - 174); - case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break; - case 175: memcpy_avx_128(dd - 175, ss - 175); - case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break; - case 176: memcpy_avx_128(dd - 176, ss - 176); - case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break; - case 177: memcpy_avx_128(dd - 177, ss - 177); - case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break; - case 178: memcpy_avx_128(dd - 178, ss - 178); - case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break; - case 179: memcpy_avx_128(dd - 179, ss - 179); - case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break; - case 180: memcpy_avx_128(dd - 180, ss - 180); - case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break; - case 181: memcpy_avx_128(dd - 181, ss - 181); - case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break; - case 182: memcpy_avx_128(dd - 182, ss - 182); - case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break; - case 183: memcpy_avx_128(dd - 183, ss - 183); - case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break; - case 184: memcpy_avx_128(dd - 184, ss - 184); - case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break; - case 185: memcpy_avx_128(dd - 185, ss - 185); - case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break; - case 186: memcpy_avx_128(dd - 186, ss - 186); - case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break; - case 187: memcpy_avx_128(dd - 187, ss - 187); - case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break; - case 188: memcpy_avx_128(dd - 188, ss - 188); - case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break; - case 189: memcpy_avx_128(dd - 189, ss - 189); - case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break; - case 190: memcpy_avx_128(dd - 190, ss - 190); - case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break; - case 191: memcpy_avx_128(dd - 191, ss - 191); - case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break; - case 192: memcpy_avx_128(dd - 192, ss - 192); - case 64: memcpy_avx_64(dd - 64, ss - 64); break; - case 193: memcpy_avx_128(dd - 193, ss - 193); - case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break; - case 194: memcpy_avx_128(dd - 194, ss - 194); - case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; - case 195: memcpy_avx_128(dd - 195, ss - 195); - case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 196: memcpy_avx_128(dd - 196, ss - 196); - case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; - case 197: memcpy_avx_128(dd - 197, ss - 197); - case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 198: memcpy_avx_128(dd - 198, ss - 198); - case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 199: memcpy_avx_128(dd - 199, ss - 199); - case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 200: memcpy_avx_128(dd - 200, ss - 200); - case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; - case 201: memcpy_avx_128(dd - 201, ss - 201); - case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break; - case 202: memcpy_avx_128(dd - 202, ss - 202); - case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break; - case 203: memcpy_avx_128(dd - 203, ss - 203); - case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break; - case 204: memcpy_avx_128(dd - 204, ss - 204); - case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break; - case 205: memcpy_avx_128(dd - 205, ss - 205); - case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break; - case 206: memcpy_avx_128(dd - 206, ss - 206); - case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break; - case 207: memcpy_avx_128(dd - 207, ss - 207); - case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break; - case 208: memcpy_avx_128(dd - 208, ss - 208); - case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break; - case 209: memcpy_avx_128(dd - 209, ss - 209); - case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break; - case 210: memcpy_avx_128(dd - 210, ss - 210); - case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break; - case 211: memcpy_avx_128(dd - 211, ss - 211); - case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break; - case 212: memcpy_avx_128(dd - 212, ss - 212); - case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break; - case 213: memcpy_avx_128(dd - 213, ss - 213); - case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break; - case 214: memcpy_avx_128(dd - 214, ss - 214); - case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break; - case 215: memcpy_avx_128(dd - 215, ss - 215); - case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break; - case 216: memcpy_avx_128(dd - 216, ss - 216); - case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break; - case 217: memcpy_avx_128(dd - 217, ss - 217); - case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break; - case 218: memcpy_avx_128(dd - 218, ss - 218); - case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break; - case 219: memcpy_avx_128(dd - 219, ss - 219); - case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break; - case 220: memcpy_avx_128(dd - 220, ss - 220); - case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break; - case 221: memcpy_avx_128(dd - 221, ss - 221); - case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break; - case 222: memcpy_avx_128(dd - 222, ss - 222); - case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break; - case 223: memcpy_avx_128(dd - 223, ss - 223); - case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break; - case 224: memcpy_avx_128(dd - 224, ss - 224); - case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break; - case 225: memcpy_avx_128(dd - 225, ss - 225); - case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break; - case 226: memcpy_avx_128(dd - 226, ss - 226); - case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break; - case 227: memcpy_avx_128(dd - 227, ss - 227); - case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break; - case 228: memcpy_avx_128(dd - 228, ss - 228); - case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break; - case 229: memcpy_avx_128(dd - 229, ss - 229); - case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break; - case 230: memcpy_avx_128(dd - 230, ss - 230); - case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break; - case 231: memcpy_avx_128(dd - 231, ss - 231); - case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break; - case 232: memcpy_avx_128(dd - 232, ss - 232); - case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break; - case 233: memcpy_avx_128(dd - 233, ss - 233); - case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break; - case 234: memcpy_avx_128(dd - 234, ss - 234); - case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break; - case 235: memcpy_avx_128(dd - 235, ss - 235); - case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break; - case 236: memcpy_avx_128(dd - 236, ss - 236); - case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break; - case 237: memcpy_avx_128(dd - 237, ss - 237); - case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break; - case 238: memcpy_avx_128(dd - 238, ss - 238); - case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break; - case 239: memcpy_avx_128(dd - 239, ss - 239); - case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break; - case 240: memcpy_avx_128(dd - 240, ss - 240); - case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break; - case 241: memcpy_avx_128(dd - 241, ss - 241); - case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break; - case 242: memcpy_avx_128(dd - 242, ss - 242); - case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break; - case 243: memcpy_avx_128(dd - 243, ss - 243); - case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break; - case 244: memcpy_avx_128(dd - 244, ss - 244); - case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break; - case 245: memcpy_avx_128(dd - 245, ss - 245); - case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break; - case 246: memcpy_avx_128(dd - 246, ss - 246); - case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break; - case 247: memcpy_avx_128(dd - 247, ss - 247); - case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break; - case 248: memcpy_avx_128(dd - 248, ss - 248); - case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break; - case 249: memcpy_avx_128(dd - 249, ss - 249); - case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break; - case 250: memcpy_avx_128(dd - 250, ss - 250); - case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break; - case 251: memcpy_avx_128(dd - 251, ss - 251); - case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break; - case 252: memcpy_avx_128(dd - 252, ss - 252); - case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break; - case 253: memcpy_avx_128(dd - 253, ss - 253); - case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break; - case 254: memcpy_avx_128(dd - 254, ss - 254); - case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break; - case 255: memcpy_avx_128(dd - 255, ss - 255); - case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break; - case 256: memcpy_avx_256(dd - 256, ss - 256); break; - } - - return dst; -} - - -//--------------------------------------------------------------------- -// main routine -//--------------------------------------------------------------------- -static void* memcpy_fast(void *destination, const void *source, size_t size) -{ - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L3-cache size - size_t padding; - - // small memory copy - if (size <= 256) { - memcpy_tiny(dst, src, size); - _mm256_zeroupper(); - return destination; - } - - // align destination to 16 bytes boundary - padding = (32 - (((size_t)dst) & 31)) & 31; - -#if 0 - if (padding > 0) { - __m256i head = _mm256_loadu_si256((const __m256i*)src); - _mm256_storeu_si256((__m256i*)dst, head); - dst += padding; - src += padding; - size -= padding; - } -#else - __m256i head = _mm256_loadu_si256((const __m256i*)src); - _mm256_storeu_si256((__m256i*)dst, head); - dst += padding; - src += padding; - size -= padding; -#endif - - // medium size copy - if (size <= cachesize) { - __m256i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 256; size -= 256) { - c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_storeu_si256((((__m256i*)dst) + 0), c0); - _mm256_storeu_si256((((__m256i*)dst) + 1), c1); - _mm256_storeu_si256((((__m256i*)dst) + 2), c2); - _mm256_storeu_si256((((__m256i*)dst) + 3), c3); - _mm256_storeu_si256((((__m256i*)dst) + 4), c4); - _mm256_storeu_si256((((__m256i*)dst) + 5), c5); - _mm256_storeu_si256((((__m256i*)dst) + 6), c6); - _mm256_storeu_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - else { // big memory copy - __m256i c0, c1, c2, c3, c4, c5, c6, c7; - /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 31) == 0) { // source aligned - for (; size >= 256; size -= 256) { - c0 = _mm256_load_si256(((const __m256i*)src) + 0); - c1 = _mm256_load_si256(((const __m256i*)src) + 1); - c2 = _mm256_load_si256(((const __m256i*)src) + 2); - c3 = _mm256_load_si256(((const __m256i*)src) + 3); - c4 = _mm256_load_si256(((const __m256i*)src) + 4); - c5 = _mm256_load_si256(((const __m256i*)src) + 5); - c6 = _mm256_load_si256(((const __m256i*)src) + 6); - c7 = _mm256_load_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_stream_si256((((__m256i*)dst) + 0), c0); - _mm256_stream_si256((((__m256i*)dst) + 1), c1); - _mm256_stream_si256((((__m256i*)dst) + 2), c2); - _mm256_stream_si256((((__m256i*)dst) + 3), c3); - _mm256_stream_si256((((__m256i*)dst) + 4), c4); - _mm256_stream_si256((((__m256i*)dst) + 5), c5); - _mm256_stream_si256((((__m256i*)dst) + 6), c6); - _mm256_stream_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - else { // source unaligned - for (; size >= 256; size -= 256) { - c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); - c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); - c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); - c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); - c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); - c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); - c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); - c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); - _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); - src += 256; - _mm256_stream_si256((((__m256i*)dst) + 0), c0); - _mm256_stream_si256((((__m256i*)dst) + 1), c1); - _mm256_stream_si256((((__m256i*)dst) + 2), c2); - _mm256_stream_si256((((__m256i*)dst) + 3), c3); - _mm256_stream_si256((((__m256i*)dst) + 4), c4); - _mm256_stream_si256((((__m256i*)dst) + 5), c5); - _mm256_stream_si256((((__m256i*)dst) + 6), c6); - _mm256_stream_si256((((__m256i*)dst) + 7), c7); - dst += 256; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - _mm256_zeroupper(); - - return destination; -} - - -#endif - - - +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== +#pragma once + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + + + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_avx_16(void * __restrict dst, const void * __restrict src) { +#if 1 + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); +#else + *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0)); + *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8)); +#endif +} + +static INLINE void memcpy_avx_32(void *dst, const void *src) { + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); +} + +static INLINE void memcpy_avx_64(void *dst, const void *src) { + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); +} + +static INLINE void memcpy_avx_128(void *dst, const void *src) { + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + __m256i m2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + __m256i m3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3); +} + +static INLINE void memcpy_avx_256(void *dst, const void *src) { + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + __m256i m2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + __m256i m3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + __m256i m4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + __m256i m5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + __m256i m6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + __m256i m7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 4, m4); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 5, m5); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 6, m6); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +static INLINE void *memcpy_tiny_avx(void * __restrict dst, const void * __restrict src, size_t size) { + unsigned char *dd = reinterpret_cast(dst) + size; + const unsigned char *ss = reinterpret_cast(src) + size; + + switch (size) { + case 128: memcpy_avx_128(dd - 128, ss - 128); [[fallthrough]]; + case 0: break; + case 129: memcpy_avx_128(dd - 129, ss - 129); [[fallthrough]]; + case 1: dd[-1] = ss[-1]; break; + case 130: memcpy_avx_128(dd - 130, ss - 130); [[fallthrough]]; + case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 131: memcpy_avx_128(dd - 131, ss - 131); [[fallthrough]]; + case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break; + case 132: memcpy_avx_128(dd - 132, ss - 132); [[fallthrough]]; + case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 133: memcpy_avx_128(dd - 133, ss - 133); [[fallthrough]]; + case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break; + case 134: memcpy_avx_128(dd - 134, ss - 134); [[fallthrough]]; + case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 135: memcpy_avx_128(dd - 135, ss - 135); [[fallthrough]]; + case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 136: memcpy_avx_128(dd - 136, ss - 136); [[fallthrough]]; + case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 137: memcpy_avx_128(dd - 137, ss - 137); [[fallthrough]]; + case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break; + case 138: memcpy_avx_128(dd - 138, ss - 138); [[fallthrough]]; + case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 139: memcpy_avx_128(dd - 139, ss - 139); [[fallthrough]]; + case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 140: memcpy_avx_128(dd - 140, ss - 140); [[fallthrough]]; + case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 141: memcpy_avx_128(dd - 141, ss - 141); [[fallthrough]]; + case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 142: memcpy_avx_128(dd - 142, ss - 142); [[fallthrough]]; + case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 143: memcpy_avx_128(dd - 143, ss - 143); [[fallthrough]]; + case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 144: memcpy_avx_128(dd - 144, ss - 144); [[fallthrough]]; + case 16: memcpy_avx_16(dd - 16, ss - 16); break; + case 145: memcpy_avx_128(dd - 145, ss - 145); [[fallthrough]]; + case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; + case 146: memcpy_avx_128(dd - 146, ss - 146); [[fallthrough]]; + case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 147: memcpy_avx_128(dd - 147, ss - 147); [[fallthrough]]; + case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 148: memcpy_avx_128(dd - 148, ss - 148); [[fallthrough]]; + case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 149: memcpy_avx_128(dd - 149, ss - 149); [[fallthrough]]; + case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 150: memcpy_avx_128(dd - 150, ss - 150); [[fallthrough]]; + case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 151: memcpy_avx_128(dd - 151, ss - 151); [[fallthrough]]; + case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 152: memcpy_avx_128(dd - 152, ss - 152); [[fallthrough]]; + case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 153: memcpy_avx_128(dd - 153, ss - 153); [[fallthrough]]; + case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break; + case 154: memcpy_avx_128(dd - 154, ss - 154); [[fallthrough]]; + case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break; + case 155: memcpy_avx_128(dd - 155, ss - 155); [[fallthrough]]; + case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break; + case 156: memcpy_avx_128(dd - 156, ss - 156); [[fallthrough]]; + case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break; + case 157: memcpy_avx_128(dd - 157, ss - 157); [[fallthrough]]; + case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break; + case 158: memcpy_avx_128(dd - 158, ss - 158); [[fallthrough]]; + case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break; + case 159: memcpy_avx_128(dd - 159, ss - 159); [[fallthrough]]; + case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break; + case 160: memcpy_avx_128(dd - 160, ss - 160); [[fallthrough]]; + case 32: memcpy_avx_32(dd - 32, ss - 32); break; + case 161: memcpy_avx_128(dd - 161, ss - 161); [[fallthrough]]; + case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; + case 162: memcpy_avx_128(dd - 162, ss - 162); [[fallthrough]]; + case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 163: memcpy_avx_128(dd - 163, ss - 163); [[fallthrough]]; + case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 164: memcpy_avx_128(dd - 164, ss - 164); [[fallthrough]]; + case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 165: memcpy_avx_128(dd - 165, ss - 165); [[fallthrough]]; + case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 166: memcpy_avx_128(dd - 166, ss - 166); [[fallthrough]]; + case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 167: memcpy_avx_128(dd - 167, ss - 167); [[fallthrough]]; + case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 168: memcpy_avx_128(dd - 168, ss - 168); [[fallthrough]]; + case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 169: memcpy_avx_128(dd - 169, ss - 169); [[fallthrough]]; + case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break; + case 170: memcpy_avx_128(dd - 170, ss - 170); [[fallthrough]]; + case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break; + case 171: memcpy_avx_128(dd - 171, ss - 171); [[fallthrough]]; + case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break; + case 172: memcpy_avx_128(dd - 172, ss - 172); [[fallthrough]]; + case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break; + case 173: memcpy_avx_128(dd - 173, ss - 173); [[fallthrough]]; + case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break; + case 174: memcpy_avx_128(dd - 174, ss - 174); [[fallthrough]]; + case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break; + case 175: memcpy_avx_128(dd - 175, ss - 175); [[fallthrough]]; + case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break; + case 176: memcpy_avx_128(dd - 176, ss - 176); [[fallthrough]]; + case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break; + case 177: memcpy_avx_128(dd - 177, ss - 177); [[fallthrough]]; + case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break; + case 178: memcpy_avx_128(dd - 178, ss - 178); [[fallthrough]]; + case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break; + case 179: memcpy_avx_128(dd - 179, ss - 179); [[fallthrough]]; + case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break; + case 180: memcpy_avx_128(dd - 180, ss - 180); [[fallthrough]]; + case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break; + case 181: memcpy_avx_128(dd - 181, ss - 181); [[fallthrough]]; + case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break; + case 182: memcpy_avx_128(dd - 182, ss - 182); [[fallthrough]]; + case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break; + case 183: memcpy_avx_128(dd - 183, ss - 183); [[fallthrough]]; + case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break; + case 184: memcpy_avx_128(dd - 184, ss - 184); [[fallthrough]]; + case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break; + case 185: memcpy_avx_128(dd - 185, ss - 185); [[fallthrough]]; + case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break; + case 186: memcpy_avx_128(dd - 186, ss - 186); [[fallthrough]]; + case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break; + case 187: memcpy_avx_128(dd - 187, ss - 187); [[fallthrough]]; + case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break; + case 188: memcpy_avx_128(dd - 188, ss - 188); [[fallthrough]]; + case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break; + case 189: memcpy_avx_128(dd - 189, ss - 189); [[fallthrough]]; + case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break; + case 190: memcpy_avx_128(dd - 190, ss - 190); [[fallthrough]]; + case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break; + case 191: memcpy_avx_128(dd - 191, ss - 191); [[fallthrough]]; + case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break; + case 192: memcpy_avx_128(dd - 192, ss - 192); [[fallthrough]]; + case 64: memcpy_avx_64(dd - 64, ss - 64); break; + case 193: memcpy_avx_128(dd - 193, ss - 193); [[fallthrough]]; + case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break; + case 194: memcpy_avx_128(dd - 194, ss - 194); [[fallthrough]]; + case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 195: memcpy_avx_128(dd - 195, ss - 195); [[fallthrough]]; + case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 196: memcpy_avx_128(dd - 196, ss - 196); [[fallthrough]]; + case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 197: memcpy_avx_128(dd - 197, ss - 197); [[fallthrough]]; + case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 198: memcpy_avx_128(dd - 198, ss - 198); [[fallthrough]]; + case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 199: memcpy_avx_128(dd - 199, ss - 199); [[fallthrough]]; + case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 200: memcpy_avx_128(dd - 200, ss - 200); [[fallthrough]]; + case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 201: memcpy_avx_128(dd - 201, ss - 201); [[fallthrough]]; + case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break; + case 202: memcpy_avx_128(dd - 202, ss - 202); [[fallthrough]]; + case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break; + case 203: memcpy_avx_128(dd - 203, ss - 203); [[fallthrough]]; + case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break; + case 204: memcpy_avx_128(dd - 204, ss - 204); [[fallthrough]]; + case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break; + case 205: memcpy_avx_128(dd - 205, ss - 205); [[fallthrough]]; + case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break; + case 206: memcpy_avx_128(dd - 206, ss - 206); [[fallthrough]]; + case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break; + case 207: memcpy_avx_128(dd - 207, ss - 207); [[fallthrough]]; + case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break; + case 208: memcpy_avx_128(dd - 208, ss - 208); [[fallthrough]]; + case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break; + case 209: memcpy_avx_128(dd - 209, ss - 209); [[fallthrough]]; + case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break; + case 210: memcpy_avx_128(dd - 210, ss - 210); [[fallthrough]]; + case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break; + case 211: memcpy_avx_128(dd - 211, ss - 211); [[fallthrough]]; + case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break; + case 212: memcpy_avx_128(dd - 212, ss - 212); [[fallthrough]]; + case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break; + case 213: memcpy_avx_128(dd - 213, ss - 213); [[fallthrough]]; + case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break; + case 214: memcpy_avx_128(dd - 214, ss - 214); [[fallthrough]]; + case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break; + case 215: memcpy_avx_128(dd - 215, ss - 215); [[fallthrough]]; + case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break; + case 216: memcpy_avx_128(dd - 216, ss - 216); [[fallthrough]]; + case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break; + case 217: memcpy_avx_128(dd - 217, ss - 217); [[fallthrough]]; + case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break; + case 218: memcpy_avx_128(dd - 218, ss - 218); [[fallthrough]]; + case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break; + case 219: memcpy_avx_128(dd - 219, ss - 219); [[fallthrough]]; + case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break; + case 220: memcpy_avx_128(dd - 220, ss - 220); [[fallthrough]]; + case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break; + case 221: memcpy_avx_128(dd - 221, ss - 221); [[fallthrough]]; + case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break; + case 222: memcpy_avx_128(dd - 222, ss - 222); [[fallthrough]]; + case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break; + case 223: memcpy_avx_128(dd - 223, ss - 223); [[fallthrough]]; + case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break; + case 224: memcpy_avx_128(dd - 224, ss - 224); [[fallthrough]]; + case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break; + case 225: memcpy_avx_128(dd - 225, ss - 225); [[fallthrough]]; + case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break; + case 226: memcpy_avx_128(dd - 226, ss - 226); [[fallthrough]]; + case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break; + case 227: memcpy_avx_128(dd - 227, ss - 227); [[fallthrough]]; + case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break; + case 228: memcpy_avx_128(dd - 228, ss - 228); [[fallthrough]]; + case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break; + case 229: memcpy_avx_128(dd - 229, ss - 229); [[fallthrough]]; + case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break; + case 230: memcpy_avx_128(dd - 230, ss - 230); [[fallthrough]]; + case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break; + case 231: memcpy_avx_128(dd - 231, ss - 231); [[fallthrough]]; + case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break; + case 232: memcpy_avx_128(dd - 232, ss - 232); [[fallthrough]]; + case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break; + case 233: memcpy_avx_128(dd - 233, ss - 233); [[fallthrough]]; + case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break; + case 234: memcpy_avx_128(dd - 234, ss - 234); [[fallthrough]]; + case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break; + case 235: memcpy_avx_128(dd - 235, ss - 235); [[fallthrough]]; + case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break; + case 236: memcpy_avx_128(dd - 236, ss - 236); [[fallthrough]]; + case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break; + case 237: memcpy_avx_128(dd - 237, ss - 237); [[fallthrough]]; + case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break; + case 238: memcpy_avx_128(dd - 238, ss - 238); [[fallthrough]]; + case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break; + case 239: memcpy_avx_128(dd - 239, ss - 239); [[fallthrough]]; + case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break; + case 240: memcpy_avx_128(dd - 240, ss - 240); [[fallthrough]]; + case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break; + case 241: memcpy_avx_128(dd - 241, ss - 241); [[fallthrough]]; + case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break; + case 242: memcpy_avx_128(dd - 242, ss - 242); [[fallthrough]]; + case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break; + case 243: memcpy_avx_128(dd - 243, ss - 243); [[fallthrough]]; + case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break; + case 244: memcpy_avx_128(dd - 244, ss - 244); [[fallthrough]]; + case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break; + case 245: memcpy_avx_128(dd - 245, ss - 245); [[fallthrough]]; + case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break; + case 246: memcpy_avx_128(dd - 246, ss - 246); [[fallthrough]]; + case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break; + case 247: memcpy_avx_128(dd - 247, ss - 247); [[fallthrough]]; + case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break; + case 248: memcpy_avx_128(dd - 248, ss - 248); [[fallthrough]]; + case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break; + case 249: memcpy_avx_128(dd - 249, ss - 249); [[fallthrough]]; + case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break; + case 250: memcpy_avx_128(dd - 250, ss - 250); [[fallthrough]]; + case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break; + case 251: memcpy_avx_128(dd - 251, ss - 251); [[fallthrough]]; + case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break; + case 252: memcpy_avx_128(dd - 252, ss - 252); [[fallthrough]]; + case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break; + case 253: memcpy_avx_128(dd - 253, ss - 253); [[fallthrough]]; + case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break; + case 254: memcpy_avx_128(dd - 254, ss - 254); [[fallthrough]]; + case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break; + case 255: memcpy_avx_128(dd - 255, ss - 255); [[fallthrough]]; + case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break; + case 256: memcpy_avx_256(dd - 256, ss - 256); break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +void* memcpy_fast_avx(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + static size_t cachesize = 0x200000; // L3-cache size + size_t padding; + + // small memory copy + if (size <= 256) { + memcpy_tiny_avx(dst, src, size); + _mm256_zeroupper(); + return destination; + } + + // align destination to 16 bytes boundary + padding = (32 - (((size_t)dst) & 31)) & 31; + +#if 0 + if (padding > 0) { + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256((__m256i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } +#else + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256((__m256i*)dst, head); + dst += padding; + src += padding; + size -= padding; +#endif + + // medium size copy + if (size <= cachesize) { + __m256i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 256; size -= 256) { + c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + else { // big memory copy + __m256i c0, c1, c2, c3, c4, c5, c6, c7; + /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ + + if ((((size_t)src) & 31) == 0) { // source aligned + for (; size >= 256; size -= 256) { + c0 = _mm256_load_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_load_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_load_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_load_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_load_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_load_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_load_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_load_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + else { // source unaligned + for (; size >= 256; size -= 256) { + c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + _mm_sfence(); + } + + memcpy_tiny_avx(dst, src, size); + _mm256_zeroupper(); + + return destination; +} From 2c86bc4ea3adc699960272d5b3b4335823fad8d4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 8 Mar 2021 10:00:54 +0300 Subject: [PATCH 05/22] Addition to prev. revision --- base/glibc-compatibility/CMakeLists.txt | 2 +- contrib/FastMemcpy/CMakeLists.txt | 4 +- contrib/FastMemcpy/FastMemcpy.c | 220 ------- contrib/FastMemcpy/memcpy_wrapper.c | 6 - contrib/FastMemcpy/memcpy_wrapper.cpp | 6 + utils/memcpy-bench/FastMemcpy.h | 754 ++++++++++++++++++++++++ 6 files changed, 762 insertions(+), 230 deletions(-) delete mode 100644 contrib/FastMemcpy/FastMemcpy.c delete mode 100644 contrib/FastMemcpy/memcpy_wrapper.c create mode 100644 contrib/FastMemcpy/memcpy_wrapper.cpp create mode 100644 utils/memcpy-bench/FastMemcpy.h diff --git a/base/glibc-compatibility/CMakeLists.txt b/base/glibc-compatibility/CMakeLists.txt index 684c6162941d..3f1cb4742b21 100644 --- a/base/glibc-compatibility/CMakeLists.txt +++ b/base/glibc-compatibility/CMakeLists.txt @@ -30,7 +30,7 @@ if (GLIBC_COMPATIBILITY) if (NOT ARCH_ARM) # clickhouse_memcpy don't support ARCH_ARM, see https://github.com/ClickHouse/ClickHouse/issues/18951 add_library (clickhouse_memcpy OBJECT - ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.c + ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy/memcpy_wrapper.cpp ) endif() diff --git a/contrib/FastMemcpy/CMakeLists.txt b/contrib/FastMemcpy/CMakeLists.txt index 8efe6d45dff9..7de638282b95 100644 --- a/contrib/FastMemcpy/CMakeLists.txt +++ b/contrib/FastMemcpy/CMakeLists.txt @@ -8,9 +8,7 @@ if (ENABLE_FASTMEMCPY) set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/FastMemcpy) set (SRCS - ${LIBRARY_DIR}/FastMemcpy.c - - memcpy_wrapper.c + memcpy_wrapper.cpp ) add_library (FastMemcpy ${SRCS}) diff --git a/contrib/FastMemcpy/FastMemcpy.c b/contrib/FastMemcpy/FastMemcpy.c deleted file mode 100644 index 5021bcc7d165..000000000000 --- a/contrib/FastMemcpy/FastMemcpy.c +++ /dev/null @@ -1,220 +0,0 @@ -//===================================================================== -// -// FastMemcpy.c - skywind3000@163.com, 2015 -// -// feature: -// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9) -// -//===================================================================== -#include -#include -#include -#include - -#if (defined(_WIN32) || defined(WIN32)) -#include -#include -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif -#elif defined(__unix) -#include -#include -#else -#error it can only be compiled under windows or unix -#endif - -#include "FastMemcpy.h" - -unsigned int gettime() -{ - #if (defined(_WIN32) || defined(WIN32)) - return timeGetTime(); - #else - static struct timezone tz={ 0,0 }; - struct timeval time; - gettimeofday(&time,&tz); - return (time.tv_sec * 1000 + time.tv_usec / 1000); - #endif -} - -void sleepms(unsigned int millisec) -{ -#if defined(_WIN32) || defined(WIN32) - Sleep(millisec); -#else - usleep(millisec * 1000); -#endif -} - - -void benchmark(int dstalign, int srcalign, size_t size, int times) -{ - char *DATA1 = (char*)malloc(size + 64); - char *DATA2 = (char*)malloc(size + 64); - size_t LINEAR1 = ((size_t)DATA1); - size_t LINEAR2 = ((size_t)DATA2); - char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); - char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); - char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); - char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); - unsigned int t1, t2; - int k; - - sleepms(100); - t1 = gettime(); - for (k = times; k > 0; k--) { - memcpy(dst, src, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (k = times; k > 0; k--) { - memcpy_fast(dst, src, size); - } - t2 = gettime() - t2; - - free(DATA1); - free(DATA2); - - printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", - dstalign? "aligned" : "unalign", - srcalign? "aligned" : "unalign", (int)t2, (int)t1); -} - - -void bench(int copysize, int times) -{ - printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); - benchmark(1, 1, copysize, times); - benchmark(1, 0, copysize, times); - benchmark(0, 1, copysize, times); - benchmark(0, 0, copysize, times); - printf("\n"); -} - - -void random_bench(int maxsize, int times) -{ - static char A[11 * 1024 * 1024 + 2]; - static char B[11 * 1024 * 1024 + 2]; - static int random_offsets[0x10000]; - static int random_sizes[0x8000]; - unsigned int i, p1, p2; - unsigned int t1, t2; - for (i = 0; i < 0x10000; i++) { // generate random offsets - random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); - } - for (i = 0; i < 0x8000; i++) { // generate random sizes - random_sizes[i] = 1 + rand() % maxsize; - } - sleepms(100); - t1 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy(A + offset1, B + offset2, size); - } - t1 = gettime() - t1; - sleepms(100); - t2 = gettime(); - for (p1 = 0, p2 = 0, i = 0; i < times; i++) { - int offset1 = random_offsets[(p1++) & 0xffff]; - int offset2 = random_offsets[(p1++) & 0xffff]; - int size = random_sizes[(p2++) & 0x7fff]; - memcpy_fast(A + offset1, B + offset2, size); - } - t2 = gettime() - t2; - printf("benchmark random access:\n"); - printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); -} - - -#ifdef _MSC_VER -#pragma comment(lib, "winmm.lib") -#endif - -int main(void) -{ - bench(32, 0x1000000); - bench(64, 0x1000000); - bench(512, 0x800000); - bench(1024, 0x400000); - bench(4096, 0x80000); - bench(8192, 0x40000); - bench(1024 * 1024 * 1, 0x800); - bench(1024 * 1024 * 4, 0x200); - bench(1024 * 1024 * 8, 0x100); - - random_bench(2048, 8000000); - - return 0; -} - - - - -/* -benchmark(size=32 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms -result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms - -benchmark(size=64 bytes, times=16777216): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms -result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms -result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms - -benchmark(size=512 bytes, times=8388608): -result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms -result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms -result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms - -benchmark(size=1024 bytes, times=4194304): -result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms -result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms -result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms -result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms - -benchmark(size=4096 bytes, times=524288): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms - -benchmark(size=8192 bytes, times=262144): -result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms -result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms -result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms -result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms - -benchmark(size=1048576 bytes, times=2048): -result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms -result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms -result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms -result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms - -benchmark(size=4194304 bytes, times=512): -result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms -result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms -result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms -result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms - -benchmark(size=8388608 bytes, times=256): -result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms -result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms -result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms -result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms - -benchmark random access: -memcpy_fast=515ms memcpy=1014ms - -*/ - - - - diff --git a/contrib/FastMemcpy/memcpy_wrapper.c b/contrib/FastMemcpy/memcpy_wrapper.c deleted file mode 100644 index 1f57345980ad..000000000000 --- a/contrib/FastMemcpy/memcpy_wrapper.c +++ /dev/null @@ -1,6 +0,0 @@ -#include "FastMemcpy.h" - -void * memcpy(void * __restrict destination, const void * __restrict source, size_t size) -{ - return memcpy_fast(destination, source, size); -} diff --git a/contrib/FastMemcpy/memcpy_wrapper.cpp b/contrib/FastMemcpy/memcpy_wrapper.cpp new file mode 100644 index 000000000000..8fa6cf291ecb --- /dev/null +++ b/contrib/FastMemcpy/memcpy_wrapper.cpp @@ -0,0 +1,6 @@ +#include "FastMemcpy.h" + +extern "C" void * memcpy(void * __restrict destination, const void * __restrict source, size_t size) +{ + return memcpy_fast(destination, source, size); +} diff --git a/utils/memcpy-bench/FastMemcpy.h b/utils/memcpy-bench/FastMemcpy.h new file mode 100644 index 000000000000..f071e2bed626 --- /dev/null +++ b/utils/memcpy-bench/FastMemcpy.h @@ -0,0 +1,754 @@ +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== +#pragma once + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + +typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t; +typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t; +typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_sse2_16(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); +} + +static INLINE void memcpy_sse2_32(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); +} + +static INLINE void memcpy_sse2_64(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); +} + +static INLINE void memcpy_sse2_128(void *dst, const void *src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + __m128i m4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + __m128i m5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + __m128i m6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + __m128i m7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +/// Attribute is used to avoid an error with undefined behaviour sanitizer +/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer +/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. +__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { + unsigned char *dd = ((unsigned char*)dst) + size; + const unsigned char *ss = ((const unsigned char*)src) + size; + + switch (size) { + case 64: + memcpy_sse2_64(dd - 64, ss - 64); + [[fallthrough]]; + case 0: + break; + + case 65: + memcpy_sse2_64(dd - 65, ss - 65); + [[fallthrough]]; + case 1: + dd[-1] = ss[-1]; + break; + + case 66: + memcpy_sse2_64(dd - 66, ss - 66); + [[fallthrough]]; + case 2: + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 67: + memcpy_sse2_64(dd - 67, ss - 67); + [[fallthrough]]; + case 3: + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 68: + memcpy_sse2_64(dd - 68, ss - 68); + [[fallthrough]]; + case 4: + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 69: + memcpy_sse2_64(dd - 69, ss - 69); + [[fallthrough]]; + case 5: + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 70: + memcpy_sse2_64(dd - 70, ss - 70); + [[fallthrough]]; + case 6: + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 71: + memcpy_sse2_64(dd - 71, ss - 71); + [[fallthrough]]; + case 7: + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 72: + memcpy_sse2_64(dd - 72, ss - 72); + [[fallthrough]]; + case 8: + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 73: + memcpy_sse2_64(dd - 73, ss - 73); + [[fallthrough]]; + case 9: + *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 74: + memcpy_sse2_64(dd - 74, ss - 74); + [[fallthrough]]; + case 10: + *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 75: + memcpy_sse2_64(dd - 75, ss - 75); + [[fallthrough]]; + case 11: + *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 76: + memcpy_sse2_64(dd - 76, ss - 76); + [[fallthrough]]; + case 12: + *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 77: + memcpy_sse2_64(dd - 77, ss - 77); + [[fallthrough]]; + case 13: + *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 78: + memcpy_sse2_64(dd - 78, ss - 78); + [[fallthrough]]; + case 14: + *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 79: + memcpy_sse2_64(dd - 79, ss - 79); + [[fallthrough]]; + case 15: + *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 80: + memcpy_sse2_64(dd - 80, ss - 80); + [[fallthrough]]; + case 16: + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 81: + memcpy_sse2_64(dd - 81, ss - 81); + [[fallthrough]]; + case 17: + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 82: + memcpy_sse2_64(dd - 82, ss - 82); + [[fallthrough]]; + case 18: + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 83: + memcpy_sse2_64(dd - 83, ss - 83); + [[fallthrough]]; + case 19: + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 84: + memcpy_sse2_64(dd - 84, ss - 84); + [[fallthrough]]; + case 20: + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 85: + memcpy_sse2_64(dd - 85, ss - 85); + [[fallthrough]]; + case 21: + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 86: + memcpy_sse2_64(dd - 86, ss - 86); + [[fallthrough]]; + case 22: + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 87: + memcpy_sse2_64(dd - 87, ss - 87); + [[fallthrough]]; + case 23: + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 88: + memcpy_sse2_64(dd - 88, ss - 88); + [[fallthrough]]; + case 24: + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 89: + memcpy_sse2_64(dd - 89, ss - 89); + [[fallthrough]]; + case 25: + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 90: + memcpy_sse2_64(dd - 90, ss - 90); + [[fallthrough]]; + case 26: + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 91: + memcpy_sse2_64(dd - 91, ss - 91); + [[fallthrough]]; + case 27: + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 92: + memcpy_sse2_64(dd - 92, ss - 92); + [[fallthrough]]; + case 28: + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 93: + memcpy_sse2_64(dd - 93, ss - 93); + [[fallthrough]]; + case 29: + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 94: + memcpy_sse2_64(dd - 94, ss - 94); + [[fallthrough]]; + case 30: + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 95: + memcpy_sse2_64(dd - 95, ss - 95); + [[fallthrough]]; + case 31: + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 96: + memcpy_sse2_64(dd - 96, ss - 96); + [[fallthrough]]; + case 32: + memcpy_sse2_32(dd - 32, ss - 32); + break; + + case 97: + memcpy_sse2_64(dd - 97, ss - 97); + [[fallthrough]]; + case 33: + memcpy_sse2_32(dd - 33, ss - 33); + dd[-1] = ss[-1]; + break; + + case 98: + memcpy_sse2_64(dd - 98, ss - 98); + [[fallthrough]]; + case 34: + memcpy_sse2_32(dd - 34, ss - 34); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 99: + memcpy_sse2_64(dd - 99, ss - 99); + [[fallthrough]]; + case 35: + memcpy_sse2_32(dd - 35, ss - 35); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 100: + memcpy_sse2_64(dd - 100, ss - 100); + [[fallthrough]]; + case 36: + memcpy_sse2_32(dd - 36, ss - 36); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 101: + memcpy_sse2_64(dd - 101, ss - 101); + [[fallthrough]]; + case 37: + memcpy_sse2_32(dd - 37, ss - 37); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 102: + memcpy_sse2_64(dd - 102, ss - 102); + [[fallthrough]]; + case 38: + memcpy_sse2_32(dd - 38, ss - 38); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 103: + memcpy_sse2_64(dd - 103, ss - 103); + [[fallthrough]]; + case 39: + memcpy_sse2_32(dd - 39, ss - 39); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 104: + memcpy_sse2_64(dd - 104, ss - 104); + [[fallthrough]]; + case 40: + memcpy_sse2_32(dd - 40, ss - 40); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 105: + memcpy_sse2_64(dd - 105, ss - 105); + [[fallthrough]]; + case 41: + memcpy_sse2_32(dd - 41, ss - 41); + *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 106: + memcpy_sse2_64(dd - 106, ss - 106); + [[fallthrough]]; + case 42: + memcpy_sse2_32(dd - 42, ss - 42); + *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 107: + memcpy_sse2_64(dd - 107, ss - 107); + [[fallthrough]]; + case 43: + memcpy_sse2_32(dd - 43, ss - 43); + *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 108: + memcpy_sse2_64(dd - 108, ss - 108); + [[fallthrough]]; + case 44: + memcpy_sse2_32(dd - 44, ss - 44); + *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 109: + memcpy_sse2_64(dd - 109, ss - 109); + [[fallthrough]]; + case 45: + memcpy_sse2_32(dd - 45, ss - 45); + *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 110: + memcpy_sse2_64(dd - 110, ss - 110); + [[fallthrough]]; + case 46: + memcpy_sse2_32(dd - 46, ss - 46); + *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 111: + memcpy_sse2_64(dd - 111, ss - 111); + [[fallthrough]]; + case 47: + memcpy_sse2_32(dd - 47, ss - 47); + *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 112: + memcpy_sse2_64(dd - 112, ss - 112); + [[fallthrough]]; + case 48: + memcpy_sse2_32(dd - 48, ss - 48); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 113: + memcpy_sse2_64(dd - 113, ss - 113); + [[fallthrough]]; + case 49: + memcpy_sse2_32(dd - 49, ss - 49); + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 114: + memcpy_sse2_64(dd - 114, ss - 114); + [[fallthrough]]; + case 50: + memcpy_sse2_32(dd - 50, ss - 50); + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 115: + memcpy_sse2_64(dd - 115, ss - 115); + [[fallthrough]]; + case 51: + memcpy_sse2_32(dd - 51, ss - 51); + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 116: + memcpy_sse2_64(dd - 116, ss - 116); + [[fallthrough]]; + case 52: + memcpy_sse2_32(dd - 52, ss - 52); + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 117: + memcpy_sse2_64(dd - 117, ss - 117); + [[fallthrough]]; + case 53: + memcpy_sse2_32(dd - 53, ss - 53); + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 118: + memcpy_sse2_64(dd - 118, ss - 118); + [[fallthrough]]; + case 54: + memcpy_sse2_32(dd - 54, ss - 54); + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 119: + memcpy_sse2_64(dd - 119, ss - 119); + [[fallthrough]]; + case 55: + memcpy_sse2_32(dd - 55, ss - 55); + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 120: + memcpy_sse2_64(dd - 120, ss - 120); + [[fallthrough]]; + case 56: + memcpy_sse2_32(dd - 56, ss - 56); + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 121: + memcpy_sse2_64(dd - 121, ss - 121); + [[fallthrough]]; + case 57: + memcpy_sse2_32(dd - 57, ss - 57); + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 122: + memcpy_sse2_64(dd - 122, ss - 122); + [[fallthrough]]; + case 58: + memcpy_sse2_32(dd - 58, ss - 58); + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 123: + memcpy_sse2_64(dd - 123, ss - 123); + [[fallthrough]]; + case 59: + memcpy_sse2_32(dd - 59, ss - 59); + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 124: + memcpy_sse2_64(dd - 124, ss - 124); + [[fallthrough]]; + case 60: + memcpy_sse2_32(dd - 60, ss - 60); + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 125: + memcpy_sse2_64(dd - 125, ss - 125); + [[fallthrough]]; + case 61: + memcpy_sse2_32(dd - 61, ss - 61); + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 126: + memcpy_sse2_64(dd - 126, ss - 126); + [[fallthrough]]; + case 62: + memcpy_sse2_32(dd - 62, ss - 62); + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 127: + memcpy_sse2_64(dd - 127, ss - 127); + [[fallthrough]]; + case 63: + memcpy_sse2_32(dd - 63, ss - 63); + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 128: + memcpy_sse2_128(dd - 128, ss - 128); + break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +void* memcpy_fast(void *destination, const void *source, size_t size) +{ + unsigned char *dst = (unsigned char*)destination; + const unsigned char *src = (const unsigned char*)source; + static size_t cachesize = 0x200000; // L2-cache size + size_t padding; + + // small memory copy + if (size <= 128) { + return memcpy_tiny(dst, src, size); + } + + // align destination to 16 bytes boundary + padding = (16 - (((size_t)dst) & 15)) & 15; + + if (padding > 0) { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + if (size <= cachesize) { + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + else { // big memory copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + _mm_prefetch((const char*)(src), _MM_HINT_NTA); + + if ((((size_t)src) & 15) == 0) { // source aligned + for (; size >= 128; size -= 128) { + c0 = _mm_load_si128((reinterpret_cast(src)) + 0); + c1 = _mm_load_si128((reinterpret_cast(src)) + 1); + c2 = _mm_load_si128((reinterpret_cast(src)) + 2); + c3 = _mm_load_si128((reinterpret_cast(src)) + 3); + c4 = _mm_load_si128((reinterpret_cast(src)) + 4); + c5 = _mm_load_si128((reinterpret_cast(src)) + 5); + c6 = _mm_load_si128((reinterpret_cast(src)) + 6); + c7 = _mm_load_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + else { // source unaligned + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + _mm_sfence(); + } + + memcpy_tiny(dst, src, size); + + return destination; +} From 940ce5884e97019c5a388d4624a28d22daf43a34 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 8 Mar 2021 12:04:52 +0300 Subject: [PATCH 06/22] Add memcpy-bench tool --- utils/CMakeLists.txt | 1 + utils/memcpy-bench/CMakeLists.txt | 5 + utils/memcpy-bench/FastMemcpy.h | 1288 +++++++++++++-------------- utils/memcpy-bench/FastMemcpy_Avx.h | 480 ++++++++++ utils/memcpy-bench/memcpy-bench.cpp | 610 +++++++++++++ utils/memcpy-bench/memcpy_jart.S | 138 +++ 6 files changed, 1878 insertions(+), 644 deletions(-) create mode 100644 utils/memcpy-bench/CMakeLists.txt create mode 100644 utils/memcpy-bench/FastMemcpy_Avx.h create mode 100644 utils/memcpy-bench/memcpy-bench.cpp create mode 100644 utils/memcpy-bench/memcpy_jart.S diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 8a39d5916126..d38b34f34193 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -32,6 +32,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (db-generator) add_subdirectory (wal-dump) add_subdirectory (check-mysql-binlog) + add_subdirectory (memcpy-bench) endif () if (ENABLE_CODE_QUALITY) diff --git a/utils/memcpy-bench/CMakeLists.txt b/utils/memcpy-bench/CMakeLists.txt new file mode 100644 index 000000000000..54dd0398912a --- /dev/null +++ b/utils/memcpy-bench/CMakeLists.txt @@ -0,0 +1,5 @@ +enable_language(ASM) +add_executable (memcpy-bench memcpy-bench.cpp memcpy_jart.S) +#target_compile_options(memcpy-bench PRIVATE -mavx) +target_link_libraries(memcpy-bench PRIVATE dbms) + diff --git a/utils/memcpy-bench/FastMemcpy.h b/utils/memcpy-bench/FastMemcpy.h index f071e2bed626..3f2278eac33f 100644 --- a/utils/memcpy-bench/FastMemcpy.h +++ b/utils/memcpy-bench/FastMemcpy.h @@ -24,7 +24,7 @@ #define INLINE __inline__ #endif #elif defined(_MSC_VER) - #define INLINE __forceinline + #define INLINE __forceinline #elif (defined(__BORLANDC__) || defined(__WATCOMC__)) #define INLINE __inline #else @@ -39,46 +39,46 @@ typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; //--------------------------------------------------------------------- // fast copy for different sizes //--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); +static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); } -static INLINE void memcpy_sse2_32(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); - __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); +static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); } -static INLINE void memcpy_sse2_64(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); - __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); - __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); - __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); +static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); } -static INLINE void memcpy_sse2_128(void *dst, const void *src) { - __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); - __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); - __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); - __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); - __m128i m4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); - __m128i m5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); - __m128i m6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); - __m128i m7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6); - _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7); +static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src) { + __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + __m128i m3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + __m128i m4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + __m128i m5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + __m128i m6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + __m128i m7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6); + _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7); } @@ -88,667 +88,667 @@ static INLINE void memcpy_sse2_128(void *dst, const void *src) { /// Attribute is used to avoid an error with undefined behaviour sanitizer /// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer /// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. -__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { - unsigned char *dd = ((unsigned char*)dst) + size; - const unsigned char *ss = ((const unsigned char*)src) + size; +__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size) { + unsigned char *dd = ((unsigned char*)dst) + size; + const unsigned char *ss = ((const unsigned char*)src) + size; - switch (size) { - case 64: - memcpy_sse2_64(dd - 64, ss - 64); + switch (size) { + case 64: + memcpy_sse2_64(dd - 64, ss - 64); [[fallthrough]]; - case 0: - break; + case 0: + break; - case 65: - memcpy_sse2_64(dd - 65, ss - 65); + case 65: + memcpy_sse2_64(dd - 65, ss - 65); [[fallthrough]]; - case 1: - dd[-1] = ss[-1]; - break; + case 1: + dd[-1] = ss[-1]; + break; - case 66: - memcpy_sse2_64(dd - 66, ss - 66); + case 66: + memcpy_sse2_64(dd - 66, ss - 66); [[fallthrough]]; - case 2: - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; + case 2: + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; - case 67: - memcpy_sse2_64(dd - 67, ss - 67); + case 67: + memcpy_sse2_64(dd - 67, ss - 67); [[fallthrough]]; - case 3: - *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; + case 3: + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; - case 68: - memcpy_sse2_64(dd - 68, ss - 68); + case 68: + memcpy_sse2_64(dd - 68, ss - 68); [[fallthrough]]; - case 4: - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; + case 4: + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; - case 69: - memcpy_sse2_64(dd - 69, ss - 69); + case 69: + memcpy_sse2_64(dd - 69, ss - 69); [[fallthrough]]; - case 5: - *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; + case 5: + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; - case 70: - memcpy_sse2_64(dd - 70, ss - 70); + case 70: + memcpy_sse2_64(dd - 70, ss - 70); [[fallthrough]]; - case 6: - *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; + case 6: + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; - case 71: - memcpy_sse2_64(dd - 71, ss - 71); + case 71: + memcpy_sse2_64(dd - 71, ss - 71); [[fallthrough]]; - case 7: - *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; + case 7: + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; - case 72: - memcpy_sse2_64(dd - 72, ss - 72); + case 72: + memcpy_sse2_64(dd - 72, ss - 72); [[fallthrough]]; - case 8: - *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); - break; + case 8: + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; - case 73: - memcpy_sse2_64(dd - 73, ss - 73); + case 73: + memcpy_sse2_64(dd - 73, ss - 73); [[fallthrough]]; - case 9: - *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; + case 9: + *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; - case 74: - memcpy_sse2_64(dd - 74, ss - 74); + case 74: + memcpy_sse2_64(dd - 74, ss - 74); [[fallthrough]]; - case 10: - *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; + case 10: + *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; - case 75: - memcpy_sse2_64(dd - 75, ss - 75); + case 75: + memcpy_sse2_64(dd - 75, ss - 75); [[fallthrough]]; - case 11: - *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; + case 11: + *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; - case 76: - memcpy_sse2_64(dd - 76, ss - 76); + case 76: + memcpy_sse2_64(dd - 76, ss - 76); [[fallthrough]]; - case 12: - *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; + case 12: + *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; - case 77: - memcpy_sse2_64(dd - 77, ss - 77); + case 77: + memcpy_sse2_64(dd - 77, ss - 77); [[fallthrough]]; - case 13: - *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; + case 13: + *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; - case 78: - memcpy_sse2_64(dd - 78, ss - 78); + case 78: + memcpy_sse2_64(dd - 78, ss - 78); [[fallthrough]]; - case 14: - *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); - break; + case 14: + *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; - case 79: - memcpy_sse2_64(dd - 79, ss - 79); + case 79: + memcpy_sse2_64(dd - 79, ss - 79); [[fallthrough]]; - case 15: - *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); - break; + case 15: + *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; - case 80: - memcpy_sse2_64(dd - 80, ss - 80); + case 80: + memcpy_sse2_64(dd - 80, ss - 80); [[fallthrough]]; - case 16: - memcpy_sse2_16(dd - 16, ss - 16); - break; + case 16: + memcpy_sse2_16(dd - 16, ss - 16); + break; - case 81: - memcpy_sse2_64(dd - 81, ss - 81); + case 81: + memcpy_sse2_64(dd - 81, ss - 81); [[fallthrough]]; - case 17: - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; + case 17: + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; - case 82: - memcpy_sse2_64(dd - 82, ss - 82); + case 82: + memcpy_sse2_64(dd - 82, ss - 82); [[fallthrough]]; - case 18: - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; + case 18: + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; - case 83: - memcpy_sse2_64(dd - 83, ss - 83); + case 83: + memcpy_sse2_64(dd - 83, ss - 83); [[fallthrough]]; - case 19: - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 84: - memcpy_sse2_64(dd - 84, ss - 84); + case 19: + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 84: + memcpy_sse2_64(dd - 84, ss - 84); [[fallthrough]]; - case 20: - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; - - case 85: - memcpy_sse2_64(dd - 85, ss - 85); + case 20: + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 85: + memcpy_sse2_64(dd - 85, ss - 85); [[fallthrough]]; - case 21: - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; + case 21: + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; - case 86: - memcpy_sse2_64(dd - 86, ss - 86); - [[fallthrough]]; - case 22: - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; + case 86: + memcpy_sse2_64(dd - 86, ss - 86); + [[fallthrough]]; + case 22: + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; - case 87: - memcpy_sse2_64(dd - 87, ss - 87); - [[fallthrough]]; - case 23: - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; + case 87: + memcpy_sse2_64(dd - 87, ss - 87); + [[fallthrough]]; + case 23: + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; - case 88: - memcpy_sse2_64(dd - 88, ss - 88); - [[fallthrough]]; - case 24: - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; + case 88: + memcpy_sse2_64(dd - 88, ss - 88); + [[fallthrough]]; + case 24: + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; - case 89: - memcpy_sse2_64(dd - 89, ss - 89); - [[fallthrough]]; - case 25: - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; + case 89: + memcpy_sse2_64(dd - 89, ss - 89); + [[fallthrough]]; + case 25: + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; - case 90: - memcpy_sse2_64(dd - 90, ss - 90); - [[fallthrough]]; - case 26: - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 91: - memcpy_sse2_64(dd - 91, ss - 91); - [[fallthrough]]; - case 27: - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 92: - memcpy_sse2_64(dd - 92, ss - 92); - [[fallthrough]]; - case 28: - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 93: - memcpy_sse2_64(dd - 93, ss - 93); - [[fallthrough]]; - case 29: - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 94: - memcpy_sse2_64(dd - 94, ss - 94); - [[fallthrough]]; - case 30: - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 95: - memcpy_sse2_64(dd - 95, ss - 95); - [[fallthrough]]; - case 31: - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 96: - memcpy_sse2_64(dd - 96, ss - 96); - [[fallthrough]]; - case 32: - memcpy_sse2_32(dd - 32, ss - 32); - break; - - case 97: - memcpy_sse2_64(dd - 97, ss - 97); - [[fallthrough]]; - case 33: - memcpy_sse2_32(dd - 33, ss - 33); - dd[-1] = ss[-1]; - break; - - case 98: - memcpy_sse2_64(dd - 98, ss - 98); - [[fallthrough]]; - case 34: - memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; - - case 99: - memcpy_sse2_64(dd - 99, ss - 99); - [[fallthrough]]; - case 35: - memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 100: - memcpy_sse2_64(dd - 100, ss - 100); - [[fallthrough]]; - case 36: - memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; - - case 101: - memcpy_sse2_64(dd - 101, ss - 101); - [[fallthrough]]; - case 37: - memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 102: - memcpy_sse2_64(dd - 102, ss - 102); - [[fallthrough]]; - case 38: - memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; - - case 103: - memcpy_sse2_64(dd - 103, ss - 103); - [[fallthrough]]; - case 39: - memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; - - case 104: - memcpy_sse2_64(dd - 104, ss - 104); - [[fallthrough]]; - case 40: - memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); - break; - - case 105: - memcpy_sse2_64(dd - 105, ss - 105); - [[fallthrough]]; - case 41: - memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); - dd[-1] = ss[-1]; - break; - - case 106: - memcpy_sse2_64(dd - 106, ss - 106); - [[fallthrough]]; - case 42: - memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; - - case 107: - memcpy_sse2_64(dd - 107, ss - 107); - [[fallthrough]]; - case 43: - memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; - - case 108: - memcpy_sse2_64(dd - 108, ss - 108); - [[fallthrough]]; - case 44: - memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; - - case 109: - memcpy_sse2_64(dd - 109, ss - 109); - [[fallthrough]]; - case 45: - memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); - *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 110: - memcpy_sse2_64(dd - 110, ss - 110); - [[fallthrough]]; - case 46: - memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); - *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); - break; - - case 111: - memcpy_sse2_64(dd - 111, ss - 111); - [[fallthrough]]; - case 47: - memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); - *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); - break; - - case 112: - memcpy_sse2_64(dd - 112, ss - 112); - [[fallthrough]]; - case 48: - memcpy_sse2_32(dd - 48, ss - 48); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 113: - memcpy_sse2_64(dd - 113, ss - 113); - [[fallthrough]]; - case 49: - memcpy_sse2_32(dd - 49, ss - 49); - memcpy_sse2_16(dd - 17, ss - 17); - dd[-1] = ss[-1]; - break; - - case 114: - memcpy_sse2_64(dd - 114, ss - 114); - [[fallthrough]]; - case 50: - memcpy_sse2_32(dd - 50, ss - 50); - memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; - - case 115: - memcpy_sse2_64(dd - 115, ss - 115); - [[fallthrough]]; - case 51: - memcpy_sse2_32(dd - 51, ss - 51); - memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); - dd[-1] = ss[-1]; - break; - - case 116: - memcpy_sse2_64(dd - 116, ss - 116); - [[fallthrough]]; - case 52: - memcpy_sse2_32(dd - 52, ss - 52); - memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; - - case 117: - memcpy_sse2_64(dd - 117, ss - 117); - [[fallthrough]]; - case 53: - memcpy_sse2_32(dd - 53, ss - 53); - memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); - dd[-1] = ss[-1]; - break; - - case 118: - memcpy_sse2_64(dd - 118, ss - 118); - [[fallthrough]]; - case 54: - memcpy_sse2_32(dd - 54, ss - 54); - memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); - *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); - break; - - case 119: - memcpy_sse2_64(dd - 119, ss - 119); - [[fallthrough]]; - case 55: - memcpy_sse2_32(dd - 55, ss - 55); - memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); - *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); - break; - - case 120: - memcpy_sse2_64(dd - 120, ss - 120); - [[fallthrough]]; - case 56: - memcpy_sse2_32(dd - 56, ss - 56); - memcpy_sse2_16(dd - 24, ss - 24); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 121: - memcpy_sse2_64(dd - 121, ss - 121); - [[fallthrough]]; - case 57: - memcpy_sse2_32(dd - 57, ss - 57); - memcpy_sse2_16(dd - 25, ss - 25); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 122: - memcpy_sse2_64(dd - 122, ss - 122); - [[fallthrough]]; - case 58: - memcpy_sse2_32(dd - 58, ss - 58); - memcpy_sse2_16(dd - 26, ss - 26); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 123: - memcpy_sse2_64(dd - 123, ss - 123); - [[fallthrough]]; - case 59: - memcpy_sse2_32(dd - 59, ss - 59); - memcpy_sse2_16(dd - 27, ss - 27); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 124: - memcpy_sse2_64(dd - 124, ss - 124); - [[fallthrough]]; - case 60: - memcpy_sse2_32(dd - 60, ss - 60); - memcpy_sse2_16(dd - 28, ss - 28); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 125: - memcpy_sse2_64(dd - 125, ss - 125); - [[fallthrough]]; - case 61: - memcpy_sse2_32(dd - 61, ss - 61); - memcpy_sse2_16(dd - 29, ss - 29); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 126: - memcpy_sse2_64(dd - 126, ss - 126); - [[fallthrough]]; - case 62: - memcpy_sse2_32(dd - 62, ss - 62); - memcpy_sse2_16(dd - 30, ss - 30); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 127: - memcpy_sse2_64(dd - 127, ss - 127); - [[fallthrough]]; - case 63: - memcpy_sse2_32(dd - 63, ss - 63); - memcpy_sse2_16(dd - 31, ss - 31); - memcpy_sse2_16(dd - 16, ss - 16); - break; - - case 128: - memcpy_sse2_128(dd - 128, ss - 128); - break; - } - - return dst; + case 90: + memcpy_sse2_64(dd - 90, ss - 90); + [[fallthrough]]; + case 26: + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 91: + memcpy_sse2_64(dd - 91, ss - 91); + [[fallthrough]]; + case 27: + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 92: + memcpy_sse2_64(dd - 92, ss - 92); + [[fallthrough]]; + case 28: + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 93: + memcpy_sse2_64(dd - 93, ss - 93); + [[fallthrough]]; + case 29: + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 94: + memcpy_sse2_64(dd - 94, ss - 94); + [[fallthrough]]; + case 30: + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 95: + memcpy_sse2_64(dd - 95, ss - 95); + [[fallthrough]]; + case 31: + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 96: + memcpy_sse2_64(dd - 96, ss - 96); + [[fallthrough]]; + case 32: + memcpy_sse2_32(dd - 32, ss - 32); + break; + + case 97: + memcpy_sse2_64(dd - 97, ss - 97); + [[fallthrough]]; + case 33: + memcpy_sse2_32(dd - 33, ss - 33); + dd[-1] = ss[-1]; + break; + + case 98: + memcpy_sse2_64(dd - 98, ss - 98); + [[fallthrough]]; + case 34: + memcpy_sse2_32(dd - 34, ss - 34); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 99: + memcpy_sse2_64(dd - 99, ss - 99); + [[fallthrough]]; + case 35: + memcpy_sse2_32(dd - 35, ss - 35); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 100: + memcpy_sse2_64(dd - 100, ss - 100); + [[fallthrough]]; + case 36: + memcpy_sse2_32(dd - 36, ss - 36); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 101: + memcpy_sse2_64(dd - 101, ss - 101); + [[fallthrough]]; + case 37: + memcpy_sse2_32(dd - 37, ss - 37); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 102: + memcpy_sse2_64(dd - 102, ss - 102); + [[fallthrough]]; + case 38: + memcpy_sse2_32(dd - 38, ss - 38); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 103: + memcpy_sse2_64(dd - 103, ss - 103); + [[fallthrough]]; + case 39: + memcpy_sse2_32(dd - 39, ss - 39); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 104: + memcpy_sse2_64(dd - 104, ss - 104); + [[fallthrough]]; + case 40: + memcpy_sse2_32(dd - 40, ss - 40); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 105: + memcpy_sse2_64(dd - 105, ss - 105); + [[fallthrough]]; + case 41: + memcpy_sse2_32(dd - 41, ss - 41); + *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); + dd[-1] = ss[-1]; + break; + + case 106: + memcpy_sse2_64(dd - 106, ss - 106); + [[fallthrough]]; + case 42: + memcpy_sse2_32(dd - 42, ss - 42); + *((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 107: + memcpy_sse2_64(dd - 107, ss - 107); + [[fallthrough]]; + case 43: + memcpy_sse2_32(dd - 43, ss - 43); + *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 108: + memcpy_sse2_64(dd - 108, ss - 108); + [[fallthrough]]; + case 44: + memcpy_sse2_32(dd - 44, ss - 44); + *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 109: + memcpy_sse2_64(dd - 109, ss - 109); + [[fallthrough]]; + case 45: + memcpy_sse2_32(dd - 45, ss - 45); + *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 110: + memcpy_sse2_64(dd - 110, ss - 110); + [[fallthrough]]; + case 46: + memcpy_sse2_32(dd - 46, ss - 46); + *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 111: + memcpy_sse2_64(dd - 111, ss - 111); + [[fallthrough]]; + case 47: + memcpy_sse2_32(dd - 47, ss - 47); + *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); + break; + + case 112: + memcpy_sse2_64(dd - 112, ss - 112); + [[fallthrough]]; + case 48: + memcpy_sse2_32(dd - 48, ss - 48); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 113: + memcpy_sse2_64(dd - 113, ss - 113); + [[fallthrough]]; + case 49: + memcpy_sse2_32(dd - 49, ss - 49); + memcpy_sse2_16(dd - 17, ss - 17); + dd[-1] = ss[-1]; + break; + + case 114: + memcpy_sse2_64(dd - 114, ss - 114); + [[fallthrough]]; + case 50: + memcpy_sse2_32(dd - 50, ss - 50); + memcpy_sse2_16(dd - 18, ss - 18); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 115: + memcpy_sse2_64(dd - 115, ss - 115); + [[fallthrough]]; + case 51: + memcpy_sse2_32(dd - 51, ss - 51); + memcpy_sse2_16(dd - 19, ss - 19); + *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); + dd[-1] = ss[-1]; + break; + + case 116: + memcpy_sse2_64(dd - 116, ss - 116); + [[fallthrough]]; + case 52: + memcpy_sse2_32(dd - 52, ss - 52); + memcpy_sse2_16(dd - 20, ss - 20); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 117: + memcpy_sse2_64(dd - 117, ss - 117); + [[fallthrough]]; + case 53: + memcpy_sse2_32(dd - 53, ss - 53); + memcpy_sse2_16(dd - 21, ss - 21); + *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); + dd[-1] = ss[-1]; + break; + + case 118: + memcpy_sse2_64(dd - 118, ss - 118); + [[fallthrough]]; + case 54: + memcpy_sse2_32(dd - 54, ss - 54); + memcpy_sse2_16(dd - 22, ss - 22); + *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); + break; + + case 119: + memcpy_sse2_64(dd - 119, ss - 119); + [[fallthrough]]; + case 55: + memcpy_sse2_32(dd - 55, ss - 55); + memcpy_sse2_16(dd - 23, ss - 23); + *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); + break; + + case 120: + memcpy_sse2_64(dd - 120, ss - 120); + [[fallthrough]]; + case 56: + memcpy_sse2_32(dd - 56, ss - 56); + memcpy_sse2_16(dd - 24, ss - 24); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 121: + memcpy_sse2_64(dd - 121, ss - 121); + [[fallthrough]]; + case 57: + memcpy_sse2_32(dd - 57, ss - 57); + memcpy_sse2_16(dd - 25, ss - 25); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 122: + memcpy_sse2_64(dd - 122, ss - 122); + [[fallthrough]]; + case 58: + memcpy_sse2_32(dd - 58, ss - 58); + memcpy_sse2_16(dd - 26, ss - 26); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 123: + memcpy_sse2_64(dd - 123, ss - 123); + [[fallthrough]]; + case 59: + memcpy_sse2_32(dd - 59, ss - 59); + memcpy_sse2_16(dd - 27, ss - 27); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 124: + memcpy_sse2_64(dd - 124, ss - 124); + [[fallthrough]]; + case 60: + memcpy_sse2_32(dd - 60, ss - 60); + memcpy_sse2_16(dd - 28, ss - 28); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 125: + memcpy_sse2_64(dd - 125, ss - 125); + [[fallthrough]]; + case 61: + memcpy_sse2_32(dd - 61, ss - 61); + memcpy_sse2_16(dd - 29, ss - 29); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 126: + memcpy_sse2_64(dd - 126, ss - 126); + [[fallthrough]]; + case 62: + memcpy_sse2_32(dd - 62, ss - 62); + memcpy_sse2_16(dd - 30, ss - 30); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 127: + memcpy_sse2_64(dd - 127, ss - 127); + [[fallthrough]]; + case 63: + memcpy_sse2_32(dd - 63, ss - 63); + memcpy_sse2_16(dd - 31, ss - 31); + memcpy_sse2_16(dd - 16, ss - 16); + break; + + case 128: + memcpy_sse2_128(dd - 128, ss - 128); + break; + } + + return dst; } //--------------------------------------------------------------------- // main routine //--------------------------------------------------------------------- -void* memcpy_fast(void *destination, const void *source, size_t size) +void* memcpy_fast_sse(void * __restrict destination, const void * __restrict source, size_t size) { - unsigned char *dst = (unsigned char*)destination; - const unsigned char *src = (const unsigned char*)source; - static size_t cachesize = 0x200000; // L2-cache size - size_t padding; - - // small memory copy - if (size <= 128) { - return memcpy_tiny(dst, src, size); - } - - // align destination to 16 bytes boundary - padding = (16 - (((size_t)dst) & 15)) & 15; - - if (padding > 0) { - __m128i head = _mm_loadu_si128(reinterpret_cast(src)); - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); - dst += padding; - src += padding; - size -= padding; - } - - // medium size copy - if (size <= cachesize) { - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); - c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); - c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); - c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); - c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); - c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); - c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); - c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); - _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); - _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); - _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); - _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); - _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); - _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); - _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); - dst += 128; - } - } - else { // big memory copy - __m128i c0, c1, c2, c3, c4, c5, c6, c7; - - _mm_prefetch((const char*)(src), _MM_HINT_NTA); - - if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { - c0 = _mm_load_si128((reinterpret_cast(src)) + 0); - c1 = _mm_load_si128((reinterpret_cast(src)) + 1); - c2 = _mm_load_si128((reinterpret_cast(src)) + 2); - c3 = _mm_load_si128((reinterpret_cast(src)) + 3); - c4 = _mm_load_si128((reinterpret_cast(src)) + 4); - c5 = _mm_load_si128((reinterpret_cast(src)) + 5); - c6 = _mm_load_si128((reinterpret_cast(src)) + 6); - c7 = _mm_load_si128((reinterpret_cast(src)) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); - dst += 128; - } - } - else { // source unaligned - for (; size >= 128; size -= 128) { - c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); - c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); - c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); - c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); - c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); - c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); - c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); - c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); - _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); - src += 128; - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); - _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); - dst += 128; - } - } - _mm_sfence(); - } - - memcpy_tiny(dst, src, size); - - return destination; + unsigned char *dst = (unsigned char*)destination; + const unsigned char *src = (const unsigned char*)source; + static size_t cachesize = 0x200000; // L2-cache size + size_t padding; + + // small memory copy + if (size <= 128) { + return memcpy_tiny(dst, src, size); + } + + // align destination to 16 bytes boundary + padding = (16 - (((size_t)dst) & 15)) & 15; + + if (padding > 0) { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + if (size <= cachesize) { + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + else { // big memory copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + _mm_prefetch((const char*)(src), _MM_HINT_NTA); + + if ((((size_t)src) & 15) == 0) { // source aligned + for (; size >= 128; size -= 128) { + c0 = _mm_load_si128((reinterpret_cast(src)) + 0); + c1 = _mm_load_si128((reinterpret_cast(src)) + 1); + c2 = _mm_load_si128((reinterpret_cast(src)) + 2); + c3 = _mm_load_si128((reinterpret_cast(src)) + 3); + c4 = _mm_load_si128((reinterpret_cast(src)) + 4); + c5 = _mm_load_si128((reinterpret_cast(src)) + 5); + c6 = _mm_load_si128((reinterpret_cast(src)) + 6); + c7 = _mm_load_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + else { // source unaligned + for (; size >= 128; size -= 128) { + c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); + c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); + c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); + c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); + c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); + c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); + c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); + c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); + _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); + src += 128; + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); + _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); + dst += 128; + } + } + _mm_sfence(); + } + + memcpy_tiny(dst, src, size); + + return destination; } diff --git a/utils/memcpy-bench/FastMemcpy_Avx.h b/utils/memcpy-bench/FastMemcpy_Avx.h new file mode 100644 index 000000000000..a36964eb013d --- /dev/null +++ b/utils/memcpy-bench/FastMemcpy_Avx.h @@ -0,0 +1,480 @@ +//===================================================================== +// +// FastMemcpy.c - skywind3000@163.com, 2015 +// +// feature: +// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) +// +//===================================================================== +#pragma once + +#include +#include +#include + + +//--------------------------------------------------------------------- +// force inline for compilers +//--------------------------------------------------------------------- +#ifndef INLINE +#ifdef __GNUC__ +#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) + #define INLINE __inline__ __attribute__((always_inline)) +#else + #define INLINE __inline__ +#endif +#elif defined(_MSC_VER) + #define INLINE __forceinline +#elif (defined(__BORLANDC__) || defined(__WATCOMC__)) + #define INLINE __inline +#else + #define INLINE +#endif +#endif + + + +//--------------------------------------------------------------------- +// fast copy for different sizes +//--------------------------------------------------------------------- +static INLINE void memcpy_avx_16(void * __restrict dst, const void * __restrict src) { +#if 1 + __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); + _mm_storeu_si128(((__m128i*)dst) + 0, m0); +#else + *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0)); + *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8)); +#endif +} + +static INLINE void memcpy_avx_32(void *dst, const void *src) { + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); +} + +static INLINE void memcpy_avx_64(void *dst, const void *src) { + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); +} + +static INLINE void memcpy_avx_128(void *dst, const void *src) { + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + __m256i m2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + __m256i m3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3); +} + +static INLINE void memcpy_avx_256(void *dst, const void *src) { + __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + __m256i m2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + __m256i m3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + __m256i m4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + __m256i m5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + __m256i m6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + __m256i m7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 2, m2); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 4, m4); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 5, m5); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 6, m6); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 7, m7); +} + + +//--------------------------------------------------------------------- +// tiny memory copy with jump table optimized +//--------------------------------------------------------------------- +static INLINE void *memcpy_tiny_avx(void * __restrict dst, const void * __restrict src, size_t size) { + unsigned char *dd = reinterpret_cast(dst) + size; + const unsigned char *ss = reinterpret_cast(src) + size; + + switch (size) { + case 128: memcpy_avx_128(dd - 128, ss - 128); [[fallthrough]]; + case 0: break; + case 129: memcpy_avx_128(dd - 129, ss - 129); [[fallthrough]]; + case 1: dd[-1] = ss[-1]; break; + case 130: memcpy_avx_128(dd - 130, ss - 130); [[fallthrough]]; + case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 131: memcpy_avx_128(dd - 131, ss - 131); [[fallthrough]]; + case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break; + case 132: memcpy_avx_128(dd - 132, ss - 132); [[fallthrough]]; + case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 133: memcpy_avx_128(dd - 133, ss - 133); [[fallthrough]]; + case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break; + case 134: memcpy_avx_128(dd - 134, ss - 134); [[fallthrough]]; + case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 135: memcpy_avx_128(dd - 135, ss - 135); [[fallthrough]]; + case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 136: memcpy_avx_128(dd - 136, ss - 136); [[fallthrough]]; + case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 137: memcpy_avx_128(dd - 137, ss - 137); [[fallthrough]]; + case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break; + case 138: memcpy_avx_128(dd - 138, ss - 138); [[fallthrough]]; + case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 139: memcpy_avx_128(dd - 139, ss - 139); [[fallthrough]]; + case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 140: memcpy_avx_128(dd - 140, ss - 140); [[fallthrough]]; + case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 141: memcpy_avx_128(dd - 141, ss - 141); [[fallthrough]]; + case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 142: memcpy_avx_128(dd - 142, ss - 142); [[fallthrough]]; + case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 143: memcpy_avx_128(dd - 143, ss - 143); [[fallthrough]]; + case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 144: memcpy_avx_128(dd - 144, ss - 144); [[fallthrough]]; + case 16: memcpy_avx_16(dd - 16, ss - 16); break; + case 145: memcpy_avx_128(dd - 145, ss - 145); [[fallthrough]]; + case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; + case 146: memcpy_avx_128(dd - 146, ss - 146); [[fallthrough]]; + case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 147: memcpy_avx_128(dd - 147, ss - 147); [[fallthrough]]; + case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 148: memcpy_avx_128(dd - 148, ss - 148); [[fallthrough]]; + case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 149: memcpy_avx_128(dd - 149, ss - 149); [[fallthrough]]; + case 21: memcpy_avx_16(dd - 21, ss - 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 150: memcpy_avx_128(dd - 150, ss - 150); [[fallthrough]]; + case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 151: memcpy_avx_128(dd - 151, ss - 151); [[fallthrough]]; + case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 152: memcpy_avx_128(dd - 152, ss - 152); [[fallthrough]]; + case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 153: memcpy_avx_128(dd - 153, ss - 153); [[fallthrough]]; + case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break; + case 154: memcpy_avx_128(dd - 154, ss - 154); [[fallthrough]]; + case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break; + case 155: memcpy_avx_128(dd - 155, ss - 155); [[fallthrough]]; + case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break; + case 156: memcpy_avx_128(dd - 156, ss - 156); [[fallthrough]]; + case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break; + case 157: memcpy_avx_128(dd - 157, ss - 157); [[fallthrough]]; + case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break; + case 158: memcpy_avx_128(dd - 158, ss - 158); [[fallthrough]]; + case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break; + case 159: memcpy_avx_128(dd - 159, ss - 159); [[fallthrough]]; + case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break; + case 160: memcpy_avx_128(dd - 160, ss - 160); [[fallthrough]]; + case 32: memcpy_avx_32(dd - 32, ss - 32); break; + case 161: memcpy_avx_128(dd - 161, ss - 161); [[fallthrough]]; + case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; + case 162: memcpy_avx_128(dd - 162, ss - 162); [[fallthrough]]; + case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 163: memcpy_avx_128(dd - 163, ss - 163); [[fallthrough]]; + case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 164: memcpy_avx_128(dd - 164, ss - 164); [[fallthrough]]; + case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 165: memcpy_avx_128(dd - 165, ss - 165); [[fallthrough]]; + case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 166: memcpy_avx_128(dd - 166, ss - 166); [[fallthrough]]; + case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 167: memcpy_avx_128(dd - 167, ss - 167); [[fallthrough]]; + case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 168: memcpy_avx_128(dd - 168, ss - 168); [[fallthrough]]; + case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 169: memcpy_avx_128(dd - 169, ss - 169); [[fallthrough]]; + case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break; + case 170: memcpy_avx_128(dd - 170, ss - 170); [[fallthrough]]; + case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break; + case 171: memcpy_avx_128(dd - 171, ss - 171); [[fallthrough]]; + case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break; + case 172: memcpy_avx_128(dd - 172, ss - 172); [[fallthrough]]; + case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break; + case 173: memcpy_avx_128(dd - 173, ss - 173); [[fallthrough]]; + case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break; + case 174: memcpy_avx_128(dd - 174, ss - 174); [[fallthrough]]; + case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break; + case 175: memcpy_avx_128(dd - 175, ss - 175); [[fallthrough]]; + case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break; + case 176: memcpy_avx_128(dd - 176, ss - 176); [[fallthrough]]; + case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break; + case 177: memcpy_avx_128(dd - 177, ss - 177); [[fallthrough]]; + case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break; + case 178: memcpy_avx_128(dd - 178, ss - 178); [[fallthrough]]; + case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break; + case 179: memcpy_avx_128(dd - 179, ss - 179); [[fallthrough]]; + case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break; + case 180: memcpy_avx_128(dd - 180, ss - 180); [[fallthrough]]; + case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break; + case 181: memcpy_avx_128(dd - 181, ss - 181); [[fallthrough]]; + case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break; + case 182: memcpy_avx_128(dd - 182, ss - 182); [[fallthrough]]; + case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break; + case 183: memcpy_avx_128(dd - 183, ss - 183); [[fallthrough]]; + case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break; + case 184: memcpy_avx_128(dd - 184, ss - 184); [[fallthrough]]; + case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break; + case 185: memcpy_avx_128(dd - 185, ss - 185); [[fallthrough]]; + case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break; + case 186: memcpy_avx_128(dd - 186, ss - 186); [[fallthrough]]; + case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break; + case 187: memcpy_avx_128(dd - 187, ss - 187); [[fallthrough]]; + case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break; + case 188: memcpy_avx_128(dd - 188, ss - 188); [[fallthrough]]; + case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break; + case 189: memcpy_avx_128(dd - 189, ss - 189); [[fallthrough]]; + case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break; + case 190: memcpy_avx_128(dd - 190, ss - 190); [[fallthrough]]; + case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break; + case 191: memcpy_avx_128(dd - 191, ss - 191); [[fallthrough]]; + case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break; + case 192: memcpy_avx_128(dd - 192, ss - 192); [[fallthrough]]; + case 64: memcpy_avx_64(dd - 64, ss - 64); break; + case 193: memcpy_avx_128(dd - 193, ss - 193); [[fallthrough]]; + case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break; + case 194: memcpy_avx_128(dd - 194, ss - 194); [[fallthrough]]; + case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; + case 195: memcpy_avx_128(dd - 195, ss - 195); [[fallthrough]]; + case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 196: memcpy_avx_128(dd - 196, ss - 196); [[fallthrough]]; + case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; + case 197: memcpy_avx_128(dd - 197, ss - 197); [[fallthrough]]; + case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 198: memcpy_avx_128(dd - 198, ss - 198); [[fallthrough]]; + case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 199: memcpy_avx_128(dd - 199, ss - 199); [[fallthrough]]; + case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 200: memcpy_avx_128(dd - 200, ss - 200); [[fallthrough]]; + case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; + case 201: memcpy_avx_128(dd - 201, ss - 201); [[fallthrough]]; + case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break; + case 202: memcpy_avx_128(dd - 202, ss - 202); [[fallthrough]]; + case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break; + case 203: memcpy_avx_128(dd - 203, ss - 203); [[fallthrough]]; + case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break; + case 204: memcpy_avx_128(dd - 204, ss - 204); [[fallthrough]]; + case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break; + case 205: memcpy_avx_128(dd - 205, ss - 205); [[fallthrough]]; + case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break; + case 206: memcpy_avx_128(dd - 206, ss - 206); [[fallthrough]]; + case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break; + case 207: memcpy_avx_128(dd - 207, ss - 207); [[fallthrough]]; + case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break; + case 208: memcpy_avx_128(dd - 208, ss - 208); [[fallthrough]]; + case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break; + case 209: memcpy_avx_128(dd - 209, ss - 209); [[fallthrough]]; + case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break; + case 210: memcpy_avx_128(dd - 210, ss - 210); [[fallthrough]]; + case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break; + case 211: memcpy_avx_128(dd - 211, ss - 211); [[fallthrough]]; + case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break; + case 212: memcpy_avx_128(dd - 212, ss - 212); [[fallthrough]]; + case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break; + case 213: memcpy_avx_128(dd - 213, ss - 213); [[fallthrough]]; + case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break; + case 214: memcpy_avx_128(dd - 214, ss - 214); [[fallthrough]]; + case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break; + case 215: memcpy_avx_128(dd - 215, ss - 215); [[fallthrough]]; + case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break; + case 216: memcpy_avx_128(dd - 216, ss - 216); [[fallthrough]]; + case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break; + case 217: memcpy_avx_128(dd - 217, ss - 217); [[fallthrough]]; + case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break; + case 218: memcpy_avx_128(dd - 218, ss - 218); [[fallthrough]]; + case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break; + case 219: memcpy_avx_128(dd - 219, ss - 219); [[fallthrough]]; + case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break; + case 220: memcpy_avx_128(dd - 220, ss - 220); [[fallthrough]]; + case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break; + case 221: memcpy_avx_128(dd - 221, ss - 221); [[fallthrough]]; + case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break; + case 222: memcpy_avx_128(dd - 222, ss - 222); [[fallthrough]]; + case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break; + case 223: memcpy_avx_128(dd - 223, ss - 223); [[fallthrough]]; + case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break; + case 224: memcpy_avx_128(dd - 224, ss - 224); [[fallthrough]]; + case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break; + case 225: memcpy_avx_128(dd - 225, ss - 225); [[fallthrough]]; + case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break; + case 226: memcpy_avx_128(dd - 226, ss - 226); [[fallthrough]]; + case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break; + case 227: memcpy_avx_128(dd - 227, ss - 227); [[fallthrough]]; + case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break; + case 228: memcpy_avx_128(dd - 228, ss - 228); [[fallthrough]]; + case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break; + case 229: memcpy_avx_128(dd - 229, ss - 229); [[fallthrough]]; + case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break; + case 230: memcpy_avx_128(dd - 230, ss - 230); [[fallthrough]]; + case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break; + case 231: memcpy_avx_128(dd - 231, ss - 231); [[fallthrough]]; + case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break; + case 232: memcpy_avx_128(dd - 232, ss - 232); [[fallthrough]]; + case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break; + case 233: memcpy_avx_128(dd - 233, ss - 233); [[fallthrough]]; + case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break; + case 234: memcpy_avx_128(dd - 234, ss - 234); [[fallthrough]]; + case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break; + case 235: memcpy_avx_128(dd - 235, ss - 235); [[fallthrough]]; + case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break; + case 236: memcpy_avx_128(dd - 236, ss - 236); [[fallthrough]]; + case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break; + case 237: memcpy_avx_128(dd - 237, ss - 237); [[fallthrough]]; + case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break; + case 238: memcpy_avx_128(dd - 238, ss - 238); [[fallthrough]]; + case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break; + case 239: memcpy_avx_128(dd - 239, ss - 239); [[fallthrough]]; + case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break; + case 240: memcpy_avx_128(dd - 240, ss - 240); [[fallthrough]]; + case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break; + case 241: memcpy_avx_128(dd - 241, ss - 241); [[fallthrough]]; + case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break; + case 242: memcpy_avx_128(dd - 242, ss - 242); [[fallthrough]]; + case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break; + case 243: memcpy_avx_128(dd - 243, ss - 243); [[fallthrough]]; + case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break; + case 244: memcpy_avx_128(dd - 244, ss - 244); [[fallthrough]]; + case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break; + case 245: memcpy_avx_128(dd - 245, ss - 245); [[fallthrough]]; + case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break; + case 246: memcpy_avx_128(dd - 246, ss - 246); [[fallthrough]]; + case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break; + case 247: memcpy_avx_128(dd - 247, ss - 247); [[fallthrough]]; + case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break; + case 248: memcpy_avx_128(dd - 248, ss - 248); [[fallthrough]]; + case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break; + case 249: memcpy_avx_128(dd - 249, ss - 249); [[fallthrough]]; + case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break; + case 250: memcpy_avx_128(dd - 250, ss - 250); [[fallthrough]]; + case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break; + case 251: memcpy_avx_128(dd - 251, ss - 251); [[fallthrough]]; + case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break; + case 252: memcpy_avx_128(dd - 252, ss - 252); [[fallthrough]]; + case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break; + case 253: memcpy_avx_128(dd - 253, ss - 253); [[fallthrough]]; + case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break; + case 254: memcpy_avx_128(dd - 254, ss - 254); [[fallthrough]]; + case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break; + case 255: memcpy_avx_128(dd - 255, ss - 255); [[fallthrough]]; + case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break; + case 256: memcpy_avx_256(dd - 256, ss - 256); break; + } + + return dst; +} + + +//--------------------------------------------------------------------- +// main routine +//--------------------------------------------------------------------- +void* memcpy_fast_avx(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + static size_t cachesize = 0x200000; // L3-cache size + size_t padding; + + // small memory copy + if (size <= 256) { + memcpy_tiny_avx(dst, src, size); + _mm256_zeroupper(); + return destination; + } + + // align destination to 16 bytes boundary + padding = (32 - (((size_t)dst) & 31)) & 31; + +#if 0 + if (padding > 0) { + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256((__m256i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } +#else + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256((__m256i*)dst, head); + dst += padding; + src += padding; + size -= padding; +#endif + + // medium size copy + if (size <= cachesize) { + __m256i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 256; size -= 256) { + c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_storeu_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + else { // big memory copy + __m256i c0, c1, c2, c3, c4, c5, c6, c7; + /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ + + if ((((size_t)src) & 31) == 0) { // source aligned + for (; size >= 256; size -= 256) { + c0 = _mm256_load_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_load_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_load_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_load_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_load_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_load_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_load_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_load_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + else { // source unaligned + for (; size >= 256; size -= 256) { + c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_stream_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + } + } + _mm_sfence(); + } + + memcpy_tiny_avx(dst, src, size); + _mm256_zeroupper(); + + return destination; +} diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp new file mode 100644 index 000000000000..d4f886398daa --- /dev/null +++ b/utils/memcpy-bench/memcpy-bench.cpp @@ -0,0 +1,610 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#pragma GCC diagnostic ignored "-Wold-style-cast" +#pragma GCC diagnostic ignored "-Wcast-align" +#pragma GCC diagnostic ignored "-Wcast-qual" +#include "FastMemcpy.h" +//#include "FastMemcpy_Avx.h" + +#include +#include + + + +template +void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl) +{ + while (size) + { + size_t bytes_to_copy = std::min(size, chunk_size_distribution()); + + impl(dst, src, bytes_to_copy); + + dst += bytes_to_copy; + src += bytes_to_copy; + size -= bytes_to_copy; + } +} + + +using RNG = pcg32_fast; + +template +size_t generatorUniform(RNG & rng) { return rng() % N; }; + + +template +void test(uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator, MemcpyImpl && impl) +{ + Stopwatch watch; + + std::vector threads; + threads.reserve(num_threads); + + for (size_t thread_num = 0; thread_num < num_threads; ++thread_num) + { + size_t begin = size * thread_num / num_threads; + size_t end = size * (thread_num + 1) / num_threads; + + threads.emplace_back([begin, end, iterations, &src, &dst, &generator, &impl] + { + for (size_t iteration = 0; iteration < iterations; ++iteration) + { + loop( + iteration % 2 ? &src[begin] : &dst[begin], + iteration % 2 ? &dst[begin] : &src[begin], + end - begin, + [rng = RNG(), &generator]() mutable { return generator(rng); }, + std::forward(impl)); + } + }); + } + + for (auto & thread : threads) + thread.join(); + + double elapsed_ns = watch.elapsed(); + + /// Validation + size_t sum = 0; + for (size_t i = 0; i < size; ++i) + sum += dst[i]; + + std::cerr << std::fixed << std::setprecision(3) + << "Processed in " << (elapsed_ns / 1e9) << "sec, " << (size * iterations * 1.0 / elapsed_ns) << " GB/sec (sum = " << sum << ")\n"; +} + + +using memcpy_type = void * (*)(const void * __restrict, void * __restrict, size_t); + + +static void * memcpy_erms(void * dst, const void * src, size_t size) +{ + asm volatile ( + "rep movsb" + : "=D"(dst), "=S"(src), "=c"(size) + : "0"(dst), "1"(src), "2"(size) + : "memory"); + return dst; +} + +extern "C" void * memcpy_jart(void * dst, const void * src, size_t size); +extern "C" void MemCpy(void * dst, const void * src, size_t size); + + +static void * memcpySSE2(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + size_t padding; + + // small memory copy + if (size <= 16) + return memcpy_tiny(dst, src, size); + + // align destination to 16 bytes boundary + padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + __m128i c0; + + for (; size >= 16; size -= 16) + { + c0 = _mm_loadu_si128(reinterpret_cast(src)); + src += 16; + _mm_store_si128((reinterpret_cast<__m128i*>(dst)), c0); + dst += 16; + } + + memcpy_tiny(dst, src, size); + return destination; +} + +static void * memcpySSE2Unrolled2(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + size_t padding; + + // small memory copy + if (size <= 32) + return memcpy_tiny(dst, src, size); + + // align destination to 16 bytes boundary + padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + __m128i c0, c1; + + for (; size >= 32; size -= 32) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + src += 32; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + dst += 32; + } + + memcpy_tiny(dst, src, size); + return destination; +} + +static void * memcpySSE2Unrolled4(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + size_t padding; + + // small memory copy + if (size <= 64) + return memcpy_tiny(dst, src, size); + + // align destination to 16 bytes boundary + padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + __m128i c0, c1, c2, c3; + + for (; size >= 64; size -= 64) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + src += 64; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + dst += 64; + } + + memcpy_tiny(dst, src, size); + return destination; +} + + +static void * memcpySSE2Unrolled8(void * __restrict destination, const void * __restrict source, size_t size) +{ + unsigned char *dst = reinterpret_cast(destination); + const unsigned char *src = reinterpret_cast(source); + size_t padding; + + // small memory copy + if (size <= 128) + return memcpy_tiny(dst, src, size); + + // align destination to 16 bytes boundary + padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + // medium size copy + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + for (; size >= 128; size -= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + } + + memcpy_tiny(dst, src, size); + return destination; +} + + +//static __attribute__((__always_inline__, __target__("sse2"))) +__attribute__((__always_inline__)) +void memcpy_my_medium_sse(uint8_t * __restrict & dst, const uint8_t * __restrict & src, size_t & size) +{ + /// Align destination to 16 bytes boundary. + size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + /// Aligned unrolled copy. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + + size -= 128; + } +} + +__attribute__((__target__("avx"))) +void memcpy_my_medium_avx(uint8_t * __restrict & __restrict dst, const uint8_t * __restrict & __restrict src, size_t & __restrict size) +{ + size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; + + if (padding > 0) + { + __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); + _mm256_storeu_si256((__m256i*)dst, head); + dst += padding; + src += padding; + size -= padding; + } + + __m256i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 256) + { + c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); + c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); + c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); + c3 = _mm256_loadu_si256((reinterpret_cast(src)) + 3); + c4 = _mm256_loadu_si256((reinterpret_cast(src)) + 4); + c5 = _mm256_loadu_si256((reinterpret_cast(src)) + 5); + c6 = _mm256_loadu_si256((reinterpret_cast(src)) + 6); + c7 = _mm256_loadu_si256((reinterpret_cast(src)) + 7); + src += 256; + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 0), c0); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 1), c1); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 2), c2); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 3), c3); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 4), c4); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 5), c5); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 6), c6); + _mm256_store_si256(((reinterpret_cast<__m256i*>(dst)) + 7), c7); + dst += 256; + + size -= 256; + } +} + +bool have_avx = true; + +static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict src, size_t size) +{ + uint8_t * ret = dst; + +tail: + if (size <= 16) + { + if (size >= 8) + { + __builtin_memcpy(dst + size - 8, src + size - 8, 8); + __builtin_memcpy(dst, src, 8); + } + else if (size >= 4) + { + __builtin_memcpy(dst + size - 4, src + size - 4, 4); + __builtin_memcpy(dst, src, 4); + } + else if (size >= 2) + { + __builtin_memcpy(dst + size - 2, src + size - 2, 2); + __builtin_memcpy(dst, src, 2); + } + else if (size >= 1) + { + *dst = *src; + } + } + else if (have_avx) + { + if (size <= 256) + { + __asm__( + "vmovups -0x20(%[s],%[size],1), %%ymm0\n" + "vmovups %%ymm0, -0x20(%[d],%[size],1)\n" + : [d]"+r"(dst), [s]"+r"(src) + : [size]"r"(size) + : "ymm0", "memory"); + + while (size > 32) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups %%ymm0, (%[d])\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "memory"); + + dst += 32; + src += 32; + size -= 32; + } + } + else + { + size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; + + if (padding > 0) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups %%ymm0, (%[d])\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "memory"); + + dst += padding; + src += padding; + size -= padding; + } + + while (size >= 256) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups 0x20(%[s]), %%ymm1\n" + "vmovups 0x40(%[s]), %%ymm2\n" + "vmovups 0x60(%[s]), %%ymm3\n" + "vmovups 0x80(%[s]), %%ymm4\n" + "vmovups 0xa0(%[s]), %%ymm5\n" + "vmovups 0xc0(%[s]), %%ymm6\n" + "vmovups 0xe0(%[s]), %%ymm7\n" + "add $0x100,%[s]\n" + "vmovaps %%ymm0, (%[d])\n" + "vmovaps %%ymm1, 0x20(%[d])\n" + "vmovaps %%ymm2, 0x40(%[d])\n" + "vmovaps %%ymm3, 0x60(%[d])\n" + "vmovaps %%ymm4, 0x80(%[d])\n" + "vmovaps %%ymm5, 0xa0(%[d])\n" + "vmovaps %%ymm6, 0xc0(%[d])\n" + "vmovaps %%ymm7, 0xe0(%[d])\n" + "add $0x100, %[d]\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory"); + + size -= 256; + } + + goto tail; + } + } + else + { + if (size <= 128) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); + + while (size > 16) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); + dst += 16; + src += 16; + size -= 16; + } + } + else + { + /// Align destination to 16 bytes boundary. + size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + /// Aligned unrolled copy. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + + size -= 128; + } + + goto tail; + } + } + + return ret; +} + + + +template +void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator) +{ + memcpy_type memcpy_libc = reinterpret_cast(dlsym(RTLD_NEXT, "memcpy")); + + if (memcpy_variant == 1) + test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy); + if (memcpy_variant == 2) + test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_libc); + if (memcpy_variant == 3) + test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_erms); + if (memcpy_variant == 4) + test(dst, src, size, iterations, num_threads, std::forward(generator), MemCpy); + if (memcpy_variant == 5) + test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2); + if (memcpy_variant == 6) + test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled2); + if (memcpy_variant == 7) + test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled4); + if (memcpy_variant == 8) + test(dst, src, size, iterations, num_threads, std::forward(generator), memcpySSE2Unrolled8); +// if (memcpy_variant == 9) +// test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_fast_avx); + if (memcpy_variant == 10) + test(dst, src, size, iterations, num_threads, std::forward(generator), memcpy_my); +} + +void dispatchVariants(size_t memcpy_variant, size_t generator_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads) +{ + if (generator_variant == 1) + dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<16>); + if (generator_variant == 2) + dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<256>); + if (generator_variant == 3) + dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<4096>); + if (generator_variant == 4) + dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<65536>); + if (generator_variant == 5) + dispatchMemcpyVariants(memcpy_variant, dst, src, size, iterations, num_threads, generatorUniform<1048576>); +} + + +int main(int argc, char ** argv) +{ + size_t size = 1000000000; + if (argc >= 2) + size = std::stoull(argv[1]); + + size_t iterations = 10; + if (argc >= 3) + iterations = std::stoull(argv[2]); + + size_t num_threads = 1; + if (argc >= 4) + num_threads = std::stoull(argv[3]); + + size_t memcpy_variant = 1; + if (argc >= 5) + memcpy_variant = std::stoull(argv[4]); + + size_t generator_variant = 1; + if (argc >= 6) + generator_variant = std::stoull(argv[5]); + + std::unique_ptr src(new uint8_t[size]); + std::unique_ptr dst(new uint8_t[size]); + + /// Fill src with some pattern for validation. + for (size_t i = 0; i < size; ++i) + src[i] = i; + + /// Fill dst to avoid page faults. + memset(dst.get(), 0, size); + + dispatchVariants(memcpy_variant, generator_variant, dst.get(), src.get(), size, iterations, num_threads); + + return 0; +} diff --git a/utils/memcpy-bench/memcpy_jart.S b/utils/memcpy-bench/memcpy_jart.S new file mode 100644 index 000000000000..50430d0abe07 --- /dev/null +++ b/utils/memcpy-bench/memcpy_jart.S @@ -0,0 +1,138 @@ +/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ +│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2020 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ + +// Copies memory. +// +// DEST and SRC must not overlap, unless DEST≤SRC. +// +// @param rdi is dest +// @param rsi is src +// @param rdx is number of bytes +// @return original rdi copied to rax +// @mode long +// @asyncsignalsafe +memcpy_jart: mov %rdi,%rax +// 𝑠𝑙𝑖𝑑𝑒 + .align 16 + .type memcpy_jart,@function + .size memcpy_jart,.-memcpy_jart + .globl memcpy_jart + +// Copies memory w/ minimal impact ABI. +// +// @param rdi is dest +// @param rsi is src +// @param rdx is number of bytes +// @clob flags,rcx,xmm3,xmm4 +// @mode long +MemCpy: mov $.Lmemcpytab.size,%ecx + cmp %rcx,%rdx + cmovb %rdx,%rcx + jmp *memcpytab(,%rcx,8) +.Lanchorpoint: +.L16r: cmp $1024,%rdx + jae .Lerms +.L16: movdqu -16(%rsi,%rdx),%xmm4 + mov $16,%rcx +0: add $16,%rcx + movdqu -32(%rsi,%rcx),%xmm3 + movdqu %xmm3,-32(%rdi,%rcx) + cmp %rcx,%rdx + ja 0b + movdqu %xmm4,-16(%rdi,%rdx) + pxor %xmm4,%xmm4 + pxor %xmm3,%xmm3 + jmp .L0 +.L8: push %rbx + mov (%rsi),%rcx + mov -8(%rsi,%rdx),%rbx + mov %rcx,(%rdi) + mov %rbx,-8(%rdi,%rdx) +1: pop %rbx +.L0: ret +.L4: push %rbx + mov (%rsi),%ecx + mov -4(%rsi,%rdx),%ebx + mov %ecx,(%rdi) + mov %ebx,-4(%rdi,%rdx) + jmp 1b +.L3: push %rbx + mov (%rsi),%cx + mov -2(%rsi,%rdx),%bx + mov %cx,(%rdi) + mov %bx,-2(%rdi,%rdx) + jmp 1b +.L2: mov (%rsi),%cx + mov %cx,(%rdi) + jmp .L0 +.L1: mov (%rsi),%cl + mov %cl,(%rdi) + jmp .L0 +.Lerms: cmp $1024*1024,%rdx + ja .Lnts + push %rdi + push %rsi + mov %rdx,%rcx + rep movsb + pop %rsi + pop %rdi + jmp .L0 +.Lnts: movdqu (%rsi),%xmm3 + movdqu %xmm3,(%rdi) + lea 16(%rdi),%rcx + and $-16,%rcx + sub %rdi,%rcx + add %rcx,%rdi + add %rcx,%rsi + sub %rcx,%rdx + mov $16,%rcx +0: add $16,%rcx + movdqu -32(%rsi,%rcx),%xmm3 + movntdq %xmm3,-32(%rdi,%rcx) + cmp %rcx,%rdx + ja 0b + sfence + movdqu -16(%rsi,%rdx),%xmm3 + movdqu %xmm3,-16(%rdi,%rdx) + pxor %xmm3,%xmm3 + jmp .L0 + .type MemCpy,@function + .size MemCpy,.-MemCpy + .globl MemCpy + + .section .rodata + .align 8 +memcpytab: + .quad .L0 + .quad .L1 + .quad .L2 + .quad .L3 + .rept 4 + .quad .L4 + .endr + .rept 8 + .quad .L8 + .endr + .rept 16 + .quad .L16 + .endr + .equ .Lmemcpytab.size,(.-memcpytab)/8 + .quad .L16r # SSE + ERMS + NTS + .type memcpytab,@object + .previous From f0342eda9c7d594746171dbf4f4b63b1d47b0c93 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 8 Mar 2021 13:05:18 +0300 Subject: [PATCH 07/22] Add experimental memcpy implementation --- base/glibc-compatibility/memcpy/CMakeLists.txt | 7 ++++--- programs/main.cpp | 4 ++++ utils/memcpy-bench/memcpy-bench.cpp | 12 ++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/base/glibc-compatibility/memcpy/CMakeLists.txt b/base/glibc-compatibility/memcpy/CMakeLists.txt index b51ebab1d50d..e8de76151bcd 100644 --- a/base/glibc-compatibility/memcpy/CMakeLists.txt +++ b/base/glibc-compatibility/memcpy/CMakeLists.txt @@ -1,5 +1,6 @@ -enable_language(ASM) -add_library(memcpy STATIC memcpy.S) +add_library(memcpy STATIC memcpy.cpp) -# We allow to include memcpy.h from user code for better inlining (less number of registers are clobbered?). +# We allow to include memcpy.h from user code for better inlining. target_include_directories(memcpy PUBLIC $) + +target_compile_options(memcpy PRIVATE -fno-builtin-memcpy) diff --git a/programs/main.cpp b/programs/main.cpp index cbb22b7a87b2..6f23edb7113f 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -23,6 +23,8 @@ #include #include +#include + /// Universal executable for various clickhouse applications #if ENABLE_CLICKHOUSE_SERVER @@ -342,6 +344,8 @@ int main(int argc_, char ** argv_) inside_main = true; SCOPE_EXIT({ inside_main = false; }); + init_memcpy(); + /// Reset new handler to default (that throws std::bad_alloc) /// It is needed because LLVM library clobbers it. std::set_new_handler(nullptr); diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp index d4f886398daa..05029ee5a694 100644 --- a/utils/memcpy-bench/memcpy-bench.cpp +++ b/utils/memcpy-bench/memcpy-bench.cpp @@ -394,6 +394,18 @@ static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict } else if (have_avx) { + if (size <= 32) + { + __builtin_memcpy(dst, src, 8); + __builtin_memcpy(dst + 8, src + 8, 8); + + dst += 16; + src += 16; + size -= 16; + + goto tail; + } + if (size <= 256) { __asm__( From 3fb5b247ed615963c9fedb21acd6caf7ed8e375e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 8 Mar 2021 13:19:25 +0300 Subject: [PATCH 08/22] Evaluate another memcpy --- base/glibc-compatibility/memcpy/memcpy.cpp | 22 +++ base/glibc-compatibility/memcpy/memcpy.h | 184 +++++++++++++++++++++ programs/main.cpp | 4 + 3 files changed, 210 insertions(+) create mode 100644 base/glibc-compatibility/memcpy/memcpy.cpp create mode 100644 base/glibc-compatibility/memcpy/memcpy.h diff --git a/base/glibc-compatibility/memcpy/memcpy.cpp b/base/glibc-compatibility/memcpy/memcpy.cpp new file mode 100644 index 000000000000..c67e4eb54cae --- /dev/null +++ b/base/glibc-compatibility/memcpy/memcpy.cpp @@ -0,0 +1,22 @@ +#include "memcpy.h" + +#include +#include + + +bool have_avx = false; + +void init_memcpy() +{ + uint32_t eax; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + __cpuid(1, eax, ebx, ecx, edx); + have_avx = (ecx >> 28) & 1; +} + +extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size) +{ + return inline_memcpy(reinterpret_cast(dst), reinterpret_cast(src), size); +} diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h new file mode 100644 index 000000000000..034df74d832e --- /dev/null +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -0,0 +1,184 @@ +#include + +#include + + +extern bool have_avx; +void init_memcpy(); + + +static inline char * inline_memcpy(char * __restrict dst, const char * __restrict src, size_t size) +{ + char * ret = dst; + +tail: + if (size <= 16) + { + if (size >= 8) + { + __builtin_memcpy(dst + size - 8, src + size - 8, 8); + __builtin_memcpy(dst, src, 8); + } + else if (size >= 4) + { + __builtin_memcpy(dst + size - 4, src + size - 4, 4); + __builtin_memcpy(dst, src, 4); + } + else if (size >= 2) + { + __builtin_memcpy(dst + size - 2, src + size - 2, 2); + __builtin_memcpy(dst, src, 2); + } + else if (size >= 1) + { + *dst = *src; + } + } + else if (have_avx) + { + if (size <= 32) + { + __builtin_memcpy(dst, src, 8); + __builtin_memcpy(dst + 8, src + 8, 8); + + dst += 16; + src += 16; + size -= 16; + + goto tail; + } + + if (size <= 256) + { + __asm__( + "vmovups -0x20(%[s],%[size],1), %%ymm0\n" + "vmovups %%ymm0, -0x20(%[d],%[size],1)\n" + : [d]"+r"(dst), [s]"+r"(src) + : [size]"r"(size) + : "ymm0", "memory"); + + while (size > 32) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups %%ymm0, (%[d])\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "memory"); + + dst += 32; + src += 32; + size -= 32; + } + } + else + { + size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; + + if (padding > 0) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups %%ymm0, (%[d])\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "memory"); + + dst += padding; + src += padding; + size -= padding; + } + + while (size >= 256) + { + __asm__( + "vmovups (%[s]), %%ymm0\n" + "vmovups 0x20(%[s]), %%ymm1\n" + "vmovups 0x40(%[s]), %%ymm2\n" + "vmovups 0x60(%[s]), %%ymm3\n" + "vmovups 0x80(%[s]), %%ymm4\n" + "vmovups 0xa0(%[s]), %%ymm5\n" + "vmovups 0xc0(%[s]), %%ymm6\n" + "vmovups 0xe0(%[s]), %%ymm7\n" + "add $0x100,%[s]\n" + "vmovaps %%ymm0, (%[d])\n" + "vmovaps %%ymm1, 0x20(%[d])\n" + "vmovaps %%ymm2, 0x40(%[d])\n" + "vmovaps %%ymm3, 0x60(%[d])\n" + "vmovaps %%ymm4, 0x80(%[d])\n" + "vmovaps %%ymm5, 0xa0(%[d])\n" + "vmovaps %%ymm6, 0xc0(%[d])\n" + "vmovaps %%ymm7, 0xe0(%[d])\n" + "add $0x100, %[d]\n" + : [d]"+r"(dst), [s]"+r"(src) + : + : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory"); + + size -= 256; + } + + goto tail; + } + } + else + { + if (size <= 128) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); + + while (size > 16) + { + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); + dst += 16; + src += 16; + size -= 16; + } + } + else + { + /// Align destination to 16 bytes boundary. + size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + + if (padding > 0) + { + __m128i head = _mm_loadu_si128(reinterpret_cast(src)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); + dst += padding; + src += padding; + size -= padding; + } + + /// Aligned unrolled copy. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7); + dst += 128; + + size -= 128; + } + + goto tail; + } + } + + return ret; +} + diff --git a/programs/main.cpp b/programs/main.cpp index 6f23edb7113f..54b36fb7781d 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -23,7 +23,11 @@ #include #include +#if defined(__x86_64__) #include +#else +void init_memcpy() {} +#endif /// Universal executable for various clickhouse applications From e02de2355ec0a8d9cfb127c223281db113550f70 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 8 Mar 2021 13:22:17 +0300 Subject: [PATCH 09/22] Remove useless files --- base/glibc-compatibility/CMakeLists.txt | 3 --- base/glibc-compatibility/tests/CMakeLists.txt | 2 -- .../glibc-compatibility/tests/memcpy_test.cpp | 26 ------------------- contrib/FastMemcpy/memcpy_wrapper.cpp | 6 ----- 4 files changed, 37 deletions(-) delete mode 100644 base/glibc-compatibility/tests/CMakeLists.txt delete mode 100644 base/glibc-compatibility/tests/memcpy_test.cpp delete mode 100644 contrib/FastMemcpy/memcpy_wrapper.cpp diff --git a/base/glibc-compatibility/CMakeLists.txt b/base/glibc-compatibility/CMakeLists.txt index 1fc537ded244..cdd5ec618339 100644 --- a/base/glibc-compatibility/CMakeLists.txt +++ b/base/glibc-compatibility/CMakeLists.txt @@ -54,9 +54,6 @@ if (GLIBC_COMPATIBILITY) message (STATUS "Some symbols from glibc will be replaced for compatibility") - if (ENABLE_TESTS) - add_subdirectory (tests) - endif () elseif (YANDEX_OFFICIAL_BUILD) message (WARNING "Option GLIBC_COMPATIBILITY must be turned on for production builds.") endif () diff --git a/base/glibc-compatibility/tests/CMakeLists.txt b/base/glibc-compatibility/tests/CMakeLists.txt deleted file mode 100644 index f2978a86664a..000000000000 --- a/base/glibc-compatibility/tests/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable(memcpy_test memcpy_test.cpp) -target_link_libraries(memcpy_test common memcpy) diff --git a/base/glibc-compatibility/tests/memcpy_test.cpp b/base/glibc-compatibility/tests/memcpy_test.cpp deleted file mode 100644 index a1f782380b03..000000000000 --- a/base/glibc-compatibility/tests/memcpy_test.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include -#include -#include - -__attribute__((__noinline__)) void memcpy_noinline(void * __restrict dst, const void * __restrict src, size_t size) -{ - memcpy(dst, src, size); -} - - -int main(int, char **) -{ - constexpr size_t buf_size = 100; - char buf[buf_size]{}; - memcpy_noinline(buf, "abc", 3); - - size_t bytes_to_copy = 3; - while (bytes_to_copy * 2 < buf_size) - { - memcpy_noinline(&buf[bytes_to_copy], buf, bytes_to_copy); - bytes_to_copy *= 2; - } - - std::cerr << buf << "\n"; - return 0; -} diff --git a/contrib/FastMemcpy/memcpy_wrapper.cpp b/contrib/FastMemcpy/memcpy_wrapper.cpp deleted file mode 100644 index 8fa6cf291ecb..000000000000 --- a/contrib/FastMemcpy/memcpy_wrapper.cpp +++ /dev/null @@ -1,6 +0,0 @@ -#include "FastMemcpy.h" - -extern "C" void * memcpy(void * __restrict destination, const void * __restrict source, size_t size) -{ - return memcpy_fast(destination, source, size); -} From 7664a3d53e1cf75e59ab251d390771fce3cc903c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 9 Mar 2021 01:29:32 +0300 Subject: [PATCH 10/22] Fix MSan --- base/glibc-compatibility/memcpy/memcpy.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index 034df74d832e..86ffe0c0d53b 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -2,6 +2,19 @@ #include +#if defined(__clang__) && defined(__has_feature) +# define ch_has_feature __has_feature +#endif +#if !defined(MEMORY_SANITIZER) +# if defined(ch_has_feature) +# if ch_has_feature(memory_sanitizer) +# define MEMORY_SANITIZER 1 +# endif +# elif defined(__MEMORY_SANITIZER__) +# define MEMORY_SANITIZER 1 +# endif +#endif + extern bool have_avx; void init_memcpy(); @@ -34,6 +47,7 @@ static inline char * inline_memcpy(char * __restrict dst, const char * __restric *dst = *src; } } +#if !defined(MEMORY_SANITIZER) /// Asm code is not instrumented by MSan, skip this branch else if (have_avx) { if (size <= 32) @@ -120,6 +134,7 @@ static inline char * inline_memcpy(char * __restrict dst, const char * __restric goto tail; } } +#endif else { if (size <= 128) From 5f65d469299fdd6c7958c2ef5ef2c8815fbc4035 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 9 Mar 2021 02:22:12 +0300 Subject: [PATCH 11/22] Fix Arcadia --- programs/main.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/programs/main.cpp b/programs/main.cpp index 54b36fb7781d..447a3518dcc5 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -23,8 +23,9 @@ #include #include -#if defined(__x86_64__) -#include +/// Custom memcpy - only for x86_64 and not for Arcadia. +#if defined(__x86_64__) && !defined(ARCADIA_BUILD) +#include // Y_IGNORE #else void init_memcpy() {} #endif From 39dbc11a3d408248327120567222158c9d80a4e2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 9 Mar 2021 03:21:38 +0300 Subject: [PATCH 12/22] Fix style check --- utils/memcpy-bench/FastMemcpy.h | 48 +++++++++++++++++--------- utils/memcpy-bench/FastMemcpy_Avx.h | 52 +++++++++++++++++++---------- utils/memcpy-bench/memcpy-bench.cpp | 2 -- 3 files changed, 66 insertions(+), 36 deletions(-) diff --git a/utils/memcpy-bench/FastMemcpy.h b/utils/memcpy-bench/FastMemcpy.h index 3f2278eac33f..9c37524443aa 100644 --- a/utils/memcpy-bench/FastMemcpy.h +++ b/utils/memcpy-bench/FastMemcpy.h @@ -1,3 +1,5 @@ +#pragma once + //===================================================================== // // FastMemcpy.c - skywind3000@163.com, 2015 @@ -6,7 +8,6 @@ // 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) // //===================================================================== -#pragma once #include #include @@ -39,19 +40,22 @@ typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t; //--------------------------------------------------------------------- // fast copy for different sizes //--------------------------------------------------------------------- -static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src) { +static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src) +{ __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); } -static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src) { +static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src) +{ __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0); _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1); } -static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src) { +static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src) +{ __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); @@ -62,7 +66,8 @@ static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3); } -static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src) { +static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src) +{ __m128i m0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); __m128i m1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); __m128i m2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); @@ -88,11 +93,13 @@ static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restric /// Attribute is used to avoid an error with undefined behaviour sanitizer /// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer /// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library. -__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size) { +__attribute__((__no_sanitize__("undefined"))) static INLINE void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size) +{ unsigned char *dd = ((unsigned char*)dst) + size; const unsigned char *ss = ((const unsigned char*)src) + size; - switch (size) { + switch (size) + { case 64: memcpy_sse2_64(dd - 64, ss - 64); [[fallthrough]]; @@ -653,14 +660,16 @@ void* memcpy_fast_sse(void * __restrict destination, const void * __restrict sou size_t padding; // small memory copy - if (size <= 128) { + if (size <= 128) +{ return memcpy_tiny(dst, src, size); } // align destination to 16 bytes boundary padding = (16 - (((size_t)dst) & 15)) & 15; - if (padding > 0) { + if (padding > 0) + { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; @@ -669,10 +678,12 @@ void* memcpy_fast_sse(void * __restrict destination, const void * __restrict sou } // medium size copy - if (size <= cachesize) { + if (size <= cachesize) + { __m128i c0, c1, c2, c3, c4, c5, c6, c7; - for (; size >= 128; size -= 128) { + for (; size >= 128; size -= 128) + { c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); @@ -694,13 +705,16 @@ void* memcpy_fast_sse(void * __restrict destination, const void * __restrict sou dst += 128; } } - else { // big memory copy + else + { // big memory copy __m128i c0, c1, c2, c3, c4, c5, c6, c7; _mm_prefetch((const char*)(src), _MM_HINT_NTA); - if ((((size_t)src) & 15) == 0) { // source aligned - for (; size >= 128; size -= 128) { + if ((((size_t)src) & 15) == 0) + { // source aligned + for (; size >= 128; size -= 128) + { c0 = _mm_load_si128((reinterpret_cast(src)) + 0); c1 = _mm_load_si128((reinterpret_cast(src)) + 1); c2 = _mm_load_si128((reinterpret_cast(src)) + 2); @@ -722,8 +736,10 @@ void* memcpy_fast_sse(void * __restrict destination, const void * __restrict sou dst += 128; } } - else { // source unaligned - for (; size >= 128; size -= 128) { + else + { // source unaligned + for (; size >= 128; size -= 128) + { c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); diff --git a/utils/memcpy-bench/FastMemcpy_Avx.h b/utils/memcpy-bench/FastMemcpy_Avx.h index a36964eb013d..ee7d4e19536f 100644 --- a/utils/memcpy-bench/FastMemcpy_Avx.h +++ b/utils/memcpy-bench/FastMemcpy_Avx.h @@ -1,3 +1,5 @@ +#pragma once + //===================================================================== // // FastMemcpy.c - skywind3000@163.com, 2015 @@ -6,7 +8,6 @@ // 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1) // //===================================================================== -#pragma once #include #include @@ -33,11 +34,11 @@ #endif - //--------------------------------------------------------------------- // fast copy for different sizes //--------------------------------------------------------------------- -static INLINE void memcpy_avx_16(void * __restrict dst, const void * __restrict src) { +static INLINE void memcpy_avx_16(void * __restrict dst, const void * __restrict src) +{ #if 1 __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); _mm_storeu_si128(((__m128i*)dst) + 0, m0); @@ -47,19 +48,22 @@ static INLINE void memcpy_avx_16(void * __restrict dst, const void * __restrict #endif } -static INLINE void memcpy_avx_32(void *dst, const void *src) { +static INLINE void memcpy_avx_32(void *dst, const void *src) +{ __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); } -static INLINE void memcpy_avx_64(void *dst, const void *src) { +static INLINE void memcpy_avx_64(void *dst, const void *src) +{ __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 0, m0); _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 1, m1); } -static INLINE void memcpy_avx_128(void *dst, const void *src) { +static INLINE void memcpy_avx_128(void *dst, const void *src) +{ __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); __m256i m2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); @@ -70,7 +74,8 @@ static INLINE void memcpy_avx_128(void *dst, const void *src) { _mm256_storeu_si256((reinterpret_cast<__m256i*>(dst)) + 3, m3); } -static INLINE void memcpy_avx_256(void *dst, const void *src) { +static INLINE void memcpy_avx_256(void *dst, const void *src) +{ __m256i m0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); __m256i m1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); __m256i m2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); @@ -93,11 +98,13 @@ static INLINE void memcpy_avx_256(void *dst, const void *src) { //--------------------------------------------------------------------- // tiny memory copy with jump table optimized //--------------------------------------------------------------------- -static INLINE void *memcpy_tiny_avx(void * __restrict dst, const void * __restrict src, size_t size) { +static INLINE void *memcpy_tiny_avx(void * __restrict dst, const void * __restrict src, size_t size) +{ unsigned char *dd = reinterpret_cast(dst) + size; const unsigned char *ss = reinterpret_cast(src) + size; - switch (size) { + switch (size) + { case 128: memcpy_avx_128(dd - 128, ss - 128); [[fallthrough]]; case 0: break; case 129: memcpy_avx_128(dd - 129, ss - 129); [[fallthrough]]; @@ -372,7 +379,8 @@ void* memcpy_fast_avx(void * __restrict destination, const void * __restrict sou size_t padding; // small memory copy - if (size <= 256) { + if (size <= 256) + { memcpy_tiny_avx(dst, src, size); _mm256_zeroupper(); return destination; @@ -382,7 +390,8 @@ void* memcpy_fast_avx(void * __restrict destination, const void * __restrict sou padding = (32 - (((size_t)dst) & 31)) & 31; #if 0 - if (padding > 0) { + if (padding > 0) + { __m256i head = _mm256_loadu_si256(reinterpret_cast(src)); _mm256_storeu_si256((__m256i*)dst, head); dst += padding; @@ -398,10 +407,12 @@ void* memcpy_fast_avx(void * __restrict destination, const void * __restrict sou #endif // medium size copy - if (size <= cachesize) { + if (size <= cachesize) + { __m256i c0, c1, c2, c3, c4, c5, c6, c7; - for (; size >= 256; size -= 256) { + for (; size >= 256; size -= 256) + { c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); @@ -422,12 +433,15 @@ void* memcpy_fast_avx(void * __restrict destination, const void * __restrict sou dst += 256; } } - else { // big memory copy + else + { // big memory copy __m256i c0, c1, c2, c3, c4, c5, c6, c7; /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ - if ((((size_t)src) & 31) == 0) { // source aligned - for (; size >= 256; size -= 256) { + if ((((size_t)src) & 31) == 0) + { // source aligned + for (; size >= 256; size -= 256) + { c0 = _mm256_load_si256((reinterpret_cast(src)) + 0); c1 = _mm256_load_si256((reinterpret_cast(src)) + 1); c2 = _mm256_load_si256((reinterpret_cast(src)) + 2); @@ -448,8 +462,10 @@ void* memcpy_fast_avx(void * __restrict destination, const void * __restrict sou dst += 256; } } - else { // source unaligned - for (; size >= 256; size -= 256) { + else + { // source unaligned + for (; size >= 256; size -= 256) + { c0 = _mm256_loadu_si256((reinterpret_cast(src)) + 0); c1 = _mm256_loadu_si256((reinterpret_cast(src)) + 1); c2 = _mm256_loadu_si256((reinterpret_cast(src)) + 2); diff --git a/utils/memcpy-bench/memcpy-bench.cpp b/utils/memcpy-bench/memcpy-bench.cpp index 05029ee5a694..2df72cb5ccbf 100644 --- a/utils/memcpy-bench/memcpy-bench.cpp +++ b/utils/memcpy-bench/memcpy-bench.cpp @@ -24,7 +24,6 @@ #include - template void NO_INLINE loop(uint8_t * dst, uint8_t * src, size_t size, F && chunk_size_distribution, MemcpyImpl && impl) { @@ -541,7 +540,6 @@ static uint8_t * memcpy_my(uint8_t * __restrict dst, const uint8_t * __restrict } - template void dispatchMemcpyVariants(size_t memcpy_variant, uint8_t * dst, uint8_t * src, size_t size, size_t iterations, size_t num_threads, F && generator) { From 8619665ef23af39d7f9deadd456e7fe2e5ddf6b2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 10 Mar 2021 10:04:33 +0300 Subject: [PATCH 13/22] Remove AVX --- base/glibc-compatibility/memcpy/memcpy.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index 86ffe0c0d53b..b651e6c3b62f 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -16,7 +16,7 @@ #endif -extern bool have_avx; +//extern bool have_avx; void init_memcpy(); @@ -47,6 +47,7 @@ static inline char * inline_memcpy(char * __restrict dst, const char * __restric *dst = *src; } } +/* #if !defined(MEMORY_SANITIZER) /// Asm code is not instrumented by MSan, skip this branch else if (have_avx) { @@ -134,7 +135,7 @@ static inline char * inline_memcpy(char * __restrict dst, const char * __restric goto tail; } } -#endif +#endif*/ else { if (size <= 128) From 6d91881f6af68efe6e9ceaa8f82b11ab518782fd Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Mar 2021 10:38:35 +0300 Subject: [PATCH 14/22] Remove currently unused code --- base/glibc-compatibility/memcpy/memcpy.h | 113 +---------------------- programs/main.cpp | 9 -- 2 files changed, 5 insertions(+), 117 deletions(-) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index b651e6c3b62f..3e1ef793f816 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -2,27 +2,13 @@ #include -#if defined(__clang__) && defined(__has_feature) -# define ch_has_feature __has_feature -#endif -#if !defined(MEMORY_SANITIZER) -# if defined(ch_has_feature) -# if ch_has_feature(memory_sanitizer) -# define MEMORY_SANITIZER 1 -# endif -# elif defined(__MEMORY_SANITIZER__) -# define MEMORY_SANITIZER 1 -# endif -#endif - -//extern bool have_avx; -void init_memcpy(); - - -static inline char * inline_memcpy(char * __restrict dst, const char * __restrict src, size_t size) +static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size) { - char * ret = dst; + char * __restrict dst = reinterpret_cast(dst_); + const char * __restrict src = reinterpret_cast(src_); + + void * ret = dst; tail: if (size <= 16) @@ -47,95 +33,6 @@ static inline char * inline_memcpy(char * __restrict dst, const char * __restric *dst = *src; } } -/* -#if !defined(MEMORY_SANITIZER) /// Asm code is not instrumented by MSan, skip this branch - else if (have_avx) - { - if (size <= 32) - { - __builtin_memcpy(dst, src, 8); - __builtin_memcpy(dst + 8, src + 8, 8); - - dst += 16; - src += 16; - size -= 16; - - goto tail; - } - - if (size <= 256) - { - __asm__( - "vmovups -0x20(%[s],%[size],1), %%ymm0\n" - "vmovups %%ymm0, -0x20(%[d],%[size],1)\n" - : [d]"+r"(dst), [s]"+r"(src) - : [size]"r"(size) - : "ymm0", "memory"); - - while (size > 32) - { - __asm__( - "vmovups (%[s]), %%ymm0\n" - "vmovups %%ymm0, (%[d])\n" - : [d]"+r"(dst), [s]"+r"(src) - : - : "ymm0", "memory"); - - dst += 32; - src += 32; - size -= 32; - } - } - else - { - size_t padding = (32 - (reinterpret_cast(dst) & 31)) & 31; - - if (padding > 0) - { - __asm__( - "vmovups (%[s]), %%ymm0\n" - "vmovups %%ymm0, (%[d])\n" - : [d]"+r"(dst), [s]"+r"(src) - : - : "ymm0", "memory"); - - dst += padding; - src += padding; - size -= padding; - } - - while (size >= 256) - { - __asm__( - "vmovups (%[s]), %%ymm0\n" - "vmovups 0x20(%[s]), %%ymm1\n" - "vmovups 0x40(%[s]), %%ymm2\n" - "vmovups 0x60(%[s]), %%ymm3\n" - "vmovups 0x80(%[s]), %%ymm4\n" - "vmovups 0xa0(%[s]), %%ymm5\n" - "vmovups 0xc0(%[s]), %%ymm6\n" - "vmovups 0xe0(%[s]), %%ymm7\n" - "add $0x100,%[s]\n" - "vmovaps %%ymm0, (%[d])\n" - "vmovaps %%ymm1, 0x20(%[d])\n" - "vmovaps %%ymm2, 0x40(%[d])\n" - "vmovaps %%ymm3, 0x60(%[d])\n" - "vmovaps %%ymm4, 0x80(%[d])\n" - "vmovaps %%ymm5, 0xa0(%[d])\n" - "vmovaps %%ymm6, 0xc0(%[d])\n" - "vmovaps %%ymm7, 0xe0(%[d])\n" - "add $0x100, %[d]\n" - : [d]"+r"(dst), [s]"+r"(src) - : - : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "memory"); - - size -= 256; - } - - goto tail; - } - } -#endif*/ else { if (size <= 128) diff --git a/programs/main.cpp b/programs/main.cpp index 447a3518dcc5..cbb22b7a87b2 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -23,13 +23,6 @@ #include #include -/// Custom memcpy - only for x86_64 and not for Arcadia. -#if defined(__x86_64__) && !defined(ARCADIA_BUILD) -#include // Y_IGNORE -#else -void init_memcpy() {} -#endif - /// Universal executable for various clickhouse applications #if ENABLE_CLICKHOUSE_SERVER @@ -349,8 +342,6 @@ int main(int argc_, char ** argv_) inside_main = true; SCOPE_EXIT({ inside_main = false; }); - init_memcpy(); - /// Reset new handler to default (that throws std::bad_alloc) /// It is needed because LLVM library clobbers it. std::set_new_handler(nullptr); From a446612e8aea837f0aa49dc20a2d8c1af5d575c9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Mar 2021 10:39:43 +0300 Subject: [PATCH 15/22] Remove currently unused code --- base/glibc-compatibility/memcpy/memcpy.cpp | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/base/glibc-compatibility/memcpy/memcpy.cpp b/base/glibc-compatibility/memcpy/memcpy.cpp index c67e4eb54cae..ec43a2c36496 100644 --- a/base/glibc-compatibility/memcpy/memcpy.cpp +++ b/base/glibc-compatibility/memcpy/memcpy.cpp @@ -1,22 +1,6 @@ #include "memcpy.h" -#include -#include - - -bool have_avx = false; - -void init_memcpy() -{ - uint32_t eax; - uint32_t ebx; - uint32_t ecx; - uint32_t edx; - __cpuid(1, eax, ebx, ecx, edx); - have_avx = (ecx >> 28) & 1; -} - extern "C" void * memcpy(void * __restrict dst, const void * __restrict src, size_t size) { - return inline_memcpy(reinterpret_cast(dst), reinterpret_cast(src), size); + return inline_memcpy(dst, src, size); } From 37cc2fe9770ad7850fe4e1f947f3b35653cf8f3c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Mar 2021 22:39:10 +0300 Subject: [PATCH 16/22] Use custom memcpy only for AArch64 --- base/glibc-compatibility/CMakeLists.txt | 7 +++++-- base/glibc-compatibility/memcpy/CMakeLists.txt | 10 ++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/base/glibc-compatibility/CMakeLists.txt b/base/glibc-compatibility/CMakeLists.txt index cdd5ec618339..e785e2ab2ce4 100644 --- a/base/glibc-compatibility/CMakeLists.txt +++ b/base/glibc-compatibility/CMakeLists.txt @@ -1,5 +1,8 @@ if (GLIBC_COMPATIBILITY) add_subdirectory(memcpy) + if(TARGET memcpy) + set(MEMCPY_LIBRARY memcpy) + endif() enable_language(ASM) include(CheckIncludeFile) @@ -44,10 +47,10 @@ if (GLIBC_COMPATIBILITY) target_compile_options(glibc-compatibility PRIVATE -fPIC) endif () - target_link_libraries(global-libs INTERFACE glibc-compatibility memcpy) + target_link_libraries(global-libs INTERFACE glibc-compatibility ${MEMCPY_LIBRARY}) install( - TARGETS glibc-compatibility memcpy + TARGETS glibc-compatibility ${MEMCPY_LIBRARY} EXPORT global ARCHIVE DESTINATION lib ) diff --git a/base/glibc-compatibility/memcpy/CMakeLists.txt b/base/glibc-compatibility/memcpy/CMakeLists.txt index e8de76151bcd..133995d9b964 100644 --- a/base/glibc-compatibility/memcpy/CMakeLists.txt +++ b/base/glibc-compatibility/memcpy/CMakeLists.txt @@ -1,6 +1,8 @@ -add_library(memcpy STATIC memcpy.cpp) +if (ARCH_AMD64) + add_library(memcpy STATIC memcpy.cpp) -# We allow to include memcpy.h from user code for better inlining. -target_include_directories(memcpy PUBLIC $) + # We allow to include memcpy.h from user code for better inlining. + target_include_directories(memcpy PUBLIC $) -target_compile_options(memcpy PRIVATE -fno-builtin-memcpy) + target_compile_options(memcpy PRIVATE -fno-builtin-memcpy) +endif () From 36a41220d3e9b9e052b3a26ce4538c73e47d7c6e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 00:46:47 +0300 Subject: [PATCH 17/22] Add comments and documentation --- base/glibc-compatibility/memcpy/memcpy.h | 118 ++++++++++++++++++++++- 1 file changed, 117 insertions(+), 1 deletion(-) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index 3e1ef793f816..b069bcf9653e 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -3,42 +3,152 @@ #include +/** Custom memcpy implementation for ClickHouse. + * It has the following benefits over using glibc's implementation: + * 1. Avoiding dependency on specific version of glibc's symbol, like memcpy@@GLIBC_2.14 for portability. + * 2. Avoiding indirect call via PLT due to shared linking, that can be less efficient. + * 3. It's possible to include this header and call inline_memcpy directly for better inlining or interprocedural analysis. + * 4. Better results on our performance tests on current CPUs: up to 25% on some queries and up to 0.7%..1% in average across all queries. + * + * Writing our own memcpy is extremely difficult for the following reasons: + * 1. The optimal variant depends on the specific CPU model. + * 2. The optimal variant depends on the distribution of size arguments. + * 3. It depends on the number of threads copying data concurrently. + * 4. It also depends on how the calling code is using the copied data and how the different memcpy calls are related to each other. + * Due to vast range of scenarios it makes proper testing especially difficult. + * When writing our own memcpy there is a risk to overoptimize it + * on non-representative microbenchmarks while making real-world use cases actually worse. + * + * Most of the benchmarks for memcpy on the internet are wrong. + * + * Let's look at the details: + * + * For small size, the order of branches in code is important. + * There are variants with specific order of branches (like here or in glibc) + * or with jump table (in asm code see example from Cosmopolitan libc: + * https://github.com/jart/cosmopolitan/blob/de09bec215675e9b0beb722df89c6f794da74f3f/libc/nexgen32e/memcpy.S#L61) + * or with Duff device in C (see https://github.com/skywind3000/FastMemcpy/) + * + * It's also important how to copy uneven sizes. + * Almost every implementation, including this, is using two overlapping movs. + * + * It is important to disable -ftree-loop-distribute-patterns when compiling memcpy implementation, + * otherwise the compiler can replace internal loops to a call to memcpy that will lead to infinite recursion. + * + * For larger sizes it's important to choose the instructions used: + * - SSE or AVX or AVX-512; + * - rep movsb; + * Performance will depend on the size threshold, on the CPU model, on the "erms" flag + * ("Enhansed Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes) + * + * Using AVX-512 can be bad due to throttling. + * Using AVX can be bad if most code is using SSE due to switching penalty + * (it also depends on the usage of "vzeroupper" instruction). + * But in some cases AVX gives a win. + * + * It also depends on how many times the loop will be unrolled. + * We are unrolling the loop 8 times (by the number of available registers), but it not always the best. + * + * It also depends on the usage of aligned or unaligned loads/stores. + * We are using unaligned loads and aligned stores. + * + * It also depends on the usage of prefetch instructions. It makes sense on some Intel CPUs but can slow down performance on AMD. + * Setting up correct offset for prefetching is non-obvious. + * + * Non-temporary (cache bypassing) stores can be used for very large sizes (more than a half of L3 cache). + * But the exact threshold is unclear - when doing memcpy from multiple threads the optimal threshold can be lower, + * because L3 cache is shared (and L2 cache is partially shared). + * + * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios, + * so we don't pay attention to it. + * + * On recent Intel CPUs, the presense of "erms" makes "rep movsb" the most benefitial, + * even comparing to non-temporary aligned unrolled stores even with the most wide registers. + * + * memcpy can be written in asm, C or C++. The latter can also use inline asm. + * The asm implementation can be better to make sure that compiler won't make the code worse, + * to ensure the order of branches, the code layout, the usage of all required registers. + * But if it is located in separate translation unit, inlining will not be possible + * (inline asm can be used to overcome this limitation). + * Sometimes C or C++ code can be further optimized by compiler. + * For example, clang is capable replacing SSE intrinsics to AVX code if -mavx is used. + * + * Please note that compiler can replace plain code to memcpy and vice versa. + * - memcpy with compile-time known small size is replaced to simple instructions without a call to memcpy; + * it is controlled by -fbuiltin-memcpy and can be manually ensured by calling __builtin_memcpy. + * This is often used to implement unaligned load/store without undefined behaviour in C++. + * - a loop with copying bytes can be recognized and replaced by a call to memcpy; + * it is controlled by -ftree-loop-distribute-patterns. + * - also note that a loop with copying bytes can be unrolled, peeled and vectorized that will give you + * inline code somewhat similar to a decent implementation of memcpy. + * + * This description is up to date as of Mar 2021. + * + * How to test the memcpy implementation for performance: + * 1. Test on real production workload. + * 2. For synthetic test, see utils/memcpy-bench, but make sure you will do the best to exhaust the wide range of scenarios. + */ + + static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size) { + /// We will use pointer arithmetic, so char pointer will be used. + /// Note that __restrict makes sense (otherwise compiler will reload data from memory + /// instead of using the value of registers due to possible aliasing). char * __restrict dst = reinterpret_cast(dst_); const char * __restrict src = reinterpret_cast(src_); + /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it. + /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly + /// for inlining and removing this single instruction. void * ret = dst; tail: + /// Small sizes and tails after the loop for large sizes. + /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application. + /// This order of branches is from the disassembly of glibc's code. + /// We copy chunks of possibly uneven size with two overlapping movs. + /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3]. if (size <= 16) { if (size >= 8) { + /// Chunks of 8..16 bytes. __builtin_memcpy(dst + size - 8, src + size - 8, 8); __builtin_memcpy(dst, src, 8); } else if (size >= 4) { + /// Chunks of 4..7 bytes. __builtin_memcpy(dst + size - 4, src + size - 4, 4); __builtin_memcpy(dst, src, 4); } else if (size >= 2) { + /// Chunks of 2..3 bytes. __builtin_memcpy(dst + size - 2, src + size - 2, 2); __builtin_memcpy(dst, src, 2); } else if (size >= 1) { + /// A single byte. *dst = *src; } + /// Zero bytes. } else { + /// Medium and large sizes. if (size <= 128) { + /// Medium size, not enough for full loop unrolling. + + /// We will copy the last 16 bytes. _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast(src + size - 16))); + /// Then we will copy every 16 bytes from the beginning in a loop. + /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes. + /// This is Ok, similar to the code for small sizes above. while (size > 16) { _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast(src))); @@ -49,9 +159,12 @@ static inline void * inline_memcpy(void * __restrict dst_, const void * __restri } else { + /// Large size with fully unrolled loop. + /// Align destination to 16 bytes boundary. size_t padding = (16 - (reinterpret_cast(dst) & 15)) & 15; + /// If not aligned - we will copy first 16 bytes with unaligned stores. if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); @@ -61,7 +174,9 @@ static inline void * inline_memcpy(void * __restrict dst_, const void * __restri size -= padding; } - /// Aligned unrolled copy. + /// Aligned unrolled copy. We will use all available SSE registers. + /// It's not possible to have both src and dst aligned. + /// So, we will use aligned stores and unaligned loads. __m128i c0, c1, c2, c3, c4, c5, c6, c7; while (size >= 128) @@ -88,6 +203,7 @@ static inline void * inline_memcpy(void * __restrict dst_, const void * __restri size -= 128; } + /// The latest remaining 0..127 bytes will be processed as usual. goto tail; } } From b4cae2c8589ff8ef0155ccd90d3b7ffee1446961 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 00:48:08 +0300 Subject: [PATCH 18/22] Add TODO --- base/glibc-compatibility/memcpy/memcpy.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index b069bcf9653e..c9a9b3a77729 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -87,6 +87,9 @@ * How to test the memcpy implementation for performance: * 1. Test on real production workload. * 2. For synthetic test, see utils/memcpy-bench, but make sure you will do the best to exhaust the wide range of scenarios. + * + * TODO: Add self-tuning memcpy with bayesian bandits algorithm for large sizes. + * See https://habr.com/en/company/yandex/blog/457612/ */ From 32e66baa0a9acb30b44f836321cd3216da851774 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 00:48:44 +0300 Subject: [PATCH 19/22] Add comments and documentation --- base/glibc-compatibility/memcpy/memcpy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index c9a9b3a77729..67b705d930a6 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -137,7 +137,7 @@ static inline void * inline_memcpy(void * __restrict dst_, const void * __restri /// A single byte. *dst = *src; } - /// Zero bytes. + /// No bytes remaining. } else { From a1ebd52d09a15f4e5215bf4f8706a02ef5f2719a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 00:51:13 +0300 Subject: [PATCH 20/22] Add comments and documentation --- base/glibc-compatibility/memcpy/memcpy.h | 1 + 1 file changed, 1 insertion(+) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index 67b705d930a6..1fd522fd4c38 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -40,6 +40,7 @@ * - rep movsb; * Performance will depend on the size threshold, on the CPU model, on the "erms" flag * ("Enhansed Rep MovS" - it indicates that performance of "rep movsb" is decent for large sizes) + * https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy * * Using AVX-512 can be bad due to throttling. * Using AVX can be bad if most code is using SSE due to switching penalty From bab924620ac804b51cc7c8514fc19e937ea7ffcb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 00:53:00 +0300 Subject: [PATCH 21/22] Add comments and documentation --- base/glibc-compatibility/memcpy/memcpy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index 1fd522fd4c38..60303e4f5441 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -61,7 +61,7 @@ * because L3 cache is shared (and L2 cache is partially shared). * * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios, - * so we don't pay attention to it. + * so we don't pay attention to using non-temporary stores. * * On recent Intel CPUs, the presense of "erms" makes "rep movsb" the most benefitial, * even comparing to non-temporary aligned unrolled stores even with the most wide registers. From 1606c7e3f3aafd041009b9686a224e3a063b50c7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Mar 2021 12:24:17 +0300 Subject: [PATCH 22/22] Fix typo --- base/glibc-compatibility/memcpy/memcpy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index 60303e4f5441..f9f81bcb0fe3 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -63,7 +63,7 @@ * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios, * so we don't pay attention to using non-temporary stores. * - * On recent Intel CPUs, the presense of "erms" makes "rep movsb" the most benefitial, + * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most benefitial, * even comparing to non-temporary aligned unrolled stores even with the most wide registers. * * memcpy can be written in asm, C or C++. The latter can also use inline asm.