From 677c6a52d8cde3b8630932a93b631d2d4e68ab52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aras=20Pranckevi=C4=8Dius?=
Date: Fri, 3 Mar 2023 23:53:29 +0200
Subject: [PATCH] NEON optimizations for ZIP reading (reconstruct and
 interleave) (#1348)

Pretty much straight copies of the SSE2/SSE4 SIMD code, just adapted to
NEON.

Reading 310 Mpix worth of ZIP-compressed EXR files into memory on an
Apple M1 Max (Clang 14, RelWithDebInfo build config) goes 7.50s -> 5.52s.
The "reconstruct" part goes 1.55s -> 0.35s, "interleave" 0.80s -> 0.04s.

Signed-off-by: Aras Pranckevicius
---
 src/lib/OpenEXR/ImfZip.cpp         | 93 ++++++++++++++++++++++++++++++
 src/lib/OpenEXRCore/internal_zip.c | 77 +++++++++++++++++++++++++
 2 files changed, 170 insertions(+)

diff --git a/src/lib/OpenEXR/ImfZip.cpp b/src/lib/OpenEXR/ImfZip.cpp
index 70673f7b85..0e2b031d81 100644
--- a/src/lib/OpenEXR/ImfZip.cpp
+++ b/src/lib/OpenEXR/ImfZip.cpp
@@ -160,6 +160,56 @@ reconstruct_sse41 (char* buf, size_t outSize)
 
 #endif
 
+#ifdef IMF_HAVE_NEON
+
+void
+reconstruct_neon (char* buf, size_t outSize)
+{
+    static const size_t bytesPerChunk = sizeof (uint8x16_t);
+    const size_t vOutSize = outSize / bytesPerChunk;
+
+    const uint8x16_t c           = vdupq_n_u8 (-128);
+    const uint8x16_t shuffleMask = vdupq_n_u8 (15);
+
+    // The first element doesn't have its high bit flipped during compression,
+    // so it must not be flipped here. To make the SIMD loop nice and
+    // uniform, we pre-flip the bit so that the loop will unflip it again.
+    buf[0] += -128;
+
+    unsigned char* vBuf  = reinterpret_cast<unsigned char*> (buf);
+    uint8x16_t     vZero = vdupq_n_u8 (0);
+    uint8x16_t     vPrev = vdupq_n_u8 (0);
+    for (size_t i = 0; i < vOutSize; ++i)
+    {
+        uint8x16_t d = vaddq_u8 (vld1q_u8 (vBuf), c);
+
+        // Compute the prefix sum of elements.
+        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 1));
+        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 2));
+        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 4));
+        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 8));
+        d = vaddq_u8 (d, vPrev);
+
+        vst1q_u8 (vBuf, d);
+        vBuf += sizeof (uint8x16_t);
+
+        // Broadcast the high byte in our result to all lanes of the prev
+        // value for the next iteration.
+        vPrev = vqtbl1q_u8 (d, shuffleMask);
+    }
+
+    unsigned char prev = vgetq_lane_u8 (vPrev, 15);
+    for (size_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
+    {
+        unsigned char d = prev + buf[i] - 128;
+        buf[i] = d;
+        prev   = d;
+    }
+}
+
+#endif
+
+
 void
 reconstruct_scalar (char* buf, size_t outSize)
 {
@@ -212,6 +262,44 @@ interleave_sse2 (const char* source, size_t outSize, char* out)
 
 #endif
 
+#ifdef IMF_HAVE_NEON
+
+void
+interleave_neon (const char* source, size_t outSize, char* out)
+{
+    static const size_t bytesPerChunk = 2 * sizeof (uint8x16_t);
+
+    const size_t vOutSize = outSize / bytesPerChunk;
+
+    const unsigned char* v1 = reinterpret_cast<const unsigned char*> (source);
+    const unsigned char* v2 =
+        reinterpret_cast<const unsigned char*> (source + (outSize + 1) / 2);
+    unsigned char* vOut = reinterpret_cast<unsigned char*> (out);
+
+    for (size_t i = 0; i < vOutSize; ++i)
+    {
+        uint8x16_t a = vld1q_u8 (v1); v1 += sizeof (uint8x16_t);
+        uint8x16_t b = vld1q_u8 (v2); v2 += sizeof (uint8x16_t);
+
+        uint8x16_t lo = vzip1q_u8 (a, b);
+        uint8x16_t hi = vzip2q_u8 (a, b);
+
+        vst1q_u8 (vOut, lo); vOut += sizeof (uint8x16_t);
+        vst1q_u8 (vOut, hi); vOut += sizeof (uint8x16_t);
+    }
+
+    const char* t1   = reinterpret_cast<const char*> (v1);
+    const char* t2   = reinterpret_cast<const char*> (v2);
+    char*       sOut = reinterpret_cast<char*> (vOut);
+
+    for (size_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
+    {
+        *(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++);
+    }
+}
+
+#endif
+
 void
 interleave_scalar (const char* source, size_t outSize, char* out)
 {
@@ -291,6 +379,11 @@ Zip::initializeFuncs ()
         interleave = interleave_sse2;
     }
 #endif
+
+#ifdef IMF_HAVE_NEON
+    reconstruct = reconstruct_neon;
+    interleave  = interleave_neon;
+#endif
 }
 
 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
diff --git a/src/lib/OpenEXRCore/internal_zip.c b/src/lib/OpenEXRCore/internal_zip.c
index 2ea3b36483..e829e6d7ce 100644
--- a/src/lib/OpenEXRCore/internal_zip.c
+++ b/src/lib/OpenEXRCore/internal_zip.c
@@ -24,6 +24,11 @@
 #    define IMF_HAVE_SSE4_1 1
 #    include <smmintrin.h>
 #endif
+#if defined(__ARM_NEON)
+#    define IMF_HAVE_NEON 1
+#    include <arm_neon.h>
+#endif
+
 
 /**************************************/
 
@@ -73,6 +78,54 @@ reconstruct (uint8_t* buf, uint64_t outSize)
         prev = d;
     }
 }
+#elif defined(IMF_HAVE_NEON)
+static void
+reconstruct (uint8_t* buf, uint64_t outSize)
+{
+    static const uint64_t bytesPerChunk = sizeof (uint8x16_t);
+    const uint64_t   vOutSize    = outSize / bytesPerChunk;
+    const uint8x16_t c           = vdupq_n_u8 (-128);
+    const uint8x16_t shuffleMask = vdupq_n_u8 (15);
+    const uint8x16_t zero        = vdupq_n_u8 (0);
+    uint8_t*         vBuf;
+    uint8x16_t       vPrev;
+    uint8_t          prev;
+
+    /*
+     * The first element doesn't have its high bit flipped during compression,
+     * so it must not be flipped here. To make the SIMD loop nice and
+     * uniform, we pre-flip the bit so that the loop will unflip it again.
+     */
+    buf[0] += -128;
+    vBuf  = buf;
+    vPrev = vdupq_n_u8 (0);
+
+    for (uint64_t i = 0; i < vOutSize; ++i)
+    {
+        uint8x16_t d = vaddq_u8 (vld1q_u8 (vBuf), c);
+
+        /* Compute the prefix sum of elements. */
+        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 1));
+        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 2));
+        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 4));
+        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 8));
+        d = vaddq_u8 (d, vPrev);
+
+        vst1q_u8 (vBuf, d); vBuf += sizeof (uint8x16_t);
+
+        // Broadcast the high byte in our result to all lanes of the prev
+        // value for the next iteration.
+        vPrev = vqtbl1q_u8 (d, shuffleMask);
+    }
+
+    prev = vgetq_lane_u8 (vPrev, 15);
+    for (uint64_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
+    {
+        uint8_t d = prev + buf[i] - 128;
+        buf[i] = d;
+        prev   = d;
+    }
+}
 #else
 static void
 reconstruct (uint8_t* buf, uint64_t sz)
@@ -121,6 +174,30 @@ interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
         *(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++);
 }
 
+#elif defined(IMF_HAVE_NEON)
+static void
+interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
+{
+    static const uint64_t bytesPerChunk = 2 * sizeof (uint8x16_t);
+    const uint64_t vOutSize = outSize / bytesPerChunk;
+    const uint8_t* v1       = source;
+    const uint8_t* v2       = source + (outSize + 1) / 2;
+
+    for (uint64_t i = 0; i < vOutSize; ++i)
+    {
+        uint8x16_t a  = vld1q_u8 (v1); v1 += sizeof (uint8x16_t);
+        uint8x16_t b  = vld1q_u8 (v2); v2 += sizeof (uint8x16_t);
+        uint8x16_t lo = vzip1q_u8 (a, b);
+        uint8x16_t hi = vzip2q_u8 (a, b);
+
+        vst1q_u8 (out, lo); out += sizeof (uint8x16_t);
+        vst1q_u8 (out, hi); out += sizeof (uint8x16_t);
+    }
+
+    for (uint64_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
+        *(out++) = (i % 2 == 0) ? *(v1++) : *(v2++);
+}
+
 #else
 static void
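For reference, here is a minimal, portable scalar model of the two transforms
the NEON code above accelerates. This sketch is not part of the patch, and the
function names delta_decode and interleave_halves are made up for the example.
delta_decode mirrors the scalar reconstruct(): every byte after the first is
stored as a delta from the previous decoded byte, offset by 128, which is why
the SIMD versions can decode 16 bytes at a time with a prefix sum.
interleave_halves mirrors the scalar interleave(): the first half of the source
holds the even-indexed output bytes, the second half the odd-indexed ones.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Same recurrence as the scalar tail loops in the patch:
 * buf[i] = buf[i-1] + buf[i] - 128 (mod 256), with buf[0] kept as-is. */
static void
delta_decode (uint8_t* buf, size_t n)
{
    for (size_t i = 1; i < n; ++i)
        buf[i] = (uint8_t) (buf[i - 1] + buf[i] - 128);
}

/* Zip the two planar halves back into even/odd byte positions, matching the
 * v1/v2 split at source + (outSize + 1) / 2 in the patch. */
static void
interleave_halves (uint8_t* out, const uint8_t* src, size_t n)
{
    const uint8_t* lo = src;               /* even output positions */
    const uint8_t* hi = src + (n + 1) / 2; /* odd output positions  */
    for (size_t i = 0; i < n; ++i)
        out[i] = (i % 2 == 0) ? *(lo++) : *(hi++);
}

int
main (void)
{
    /* Deltas of {10, 13, 11, 16}: first byte as-is, the rest offset by 128. */
    uint8_t d[4] = {10, 128 + 3, 128 - 2, 128 + 5};
    delta_decode (d, 4);
    assert (d[0] == 10 && d[1] == 13 && d[2] == 11 && d[3] == 16);

    /* Evens then odds: "ace" + "bd" interleaves back to "abcde". */
    uint8_t planar[5] = {'a', 'c', 'e', 'b', 'd'};
    uint8_t mixed[5];
    interleave_halves (mixed, planar, 5);
    assert (memcmp (mixed, "abcde", 5) == 0);
    return 0;
}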