From 677c6a52d8cde3b8630932a93b631d2d4e68ab52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aras=20Pranckevi=C4=8Dius?=
Date: Fri, 3 Mar 2023 23:53:29 +0200
Subject: [PATCH] NEON optimizations for ZIP reading (reconstruct and
 interleave) (#1348)

Pretty much straight copies of the SSE2/SSE4 SIMD code, just adapted to
NEON.

Reading 310 Mpix worth of ZIP-compressed EXR files into memory on an
Apple M1 Max (Clang 14, RelWithDebInfo build config) goes 7.50s -> 5.52s.
The "reconstruct" part goes 1.55s -> 0.35s, "interleave" 0.80s -> 0.04s.

Signed-off-by: Aras Pranckevicius
---
 src/lib/OpenEXR/ImfZip.cpp         | 93 ++++++++++++++++++++++++++++++
 src/lib/OpenEXRCore/internal_zip.c | 77 +++++++++++++++++++++++++
 2 files changed, 170 insertions(+)

diff --git a/src/lib/OpenEXR/ImfZip.cpp b/src/lib/OpenEXR/ImfZip.cpp
index 70673f7b85..0e2b031d81 100644
--- a/src/lib/OpenEXR/ImfZip.cpp
+++ b/src/lib/OpenEXR/ImfZip.cpp
@@ -160,6 +160,56 @@ reconstruct_sse41 (char* buf, size_t outSize)
 
 #endif
 
+#ifdef IMF_HAVE_NEON
+
+void
+reconstruct_neon (char* buf, size_t outSize)
+{
+    static const size_t bytesPerChunk = sizeof (uint8x16_t);
+    const size_t vOutSize = outSize / bytesPerChunk;
+
+    const uint8x16_t c           = vdupq_n_u8 (-128);
+    const uint8x16_t shuffleMask = vdupq_n_u8 (15);
+
+    // The first element doesn't have its high bit flipped during compression,
+    // so it must not be flipped here. To make the SIMD loop nice and
+    // uniform, we pre-flip the bit so that the loop will unflip it again.
+    buf[0] += -128;
+
+    unsigned char* vBuf  = reinterpret_cast<unsigned char*> (buf);
+    uint8x16_t     vZero = vdupq_n_u8 (0);
+    uint8x16_t     vPrev = vdupq_n_u8 (0);
+    for (size_t i = 0; i < vOutSize; ++i)
+    {
+        uint8x16_t d = vaddq_u8 (vld1q_u8 (vBuf), c);
+
+        // Compute the prefix sum of elements.
+        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 1));
+        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 2));
+        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 4));
+        d = vaddq_u8 (d, vextq_u8 (vZero, d, 16 - 8));
+        d = vaddq_u8 (d, vPrev);
+
+        vst1q_u8 (vBuf, d);
+        vBuf += sizeof (uint8x16_t);
+
+        // Broadcast the high byte in our result to all lanes of the prev
+        // value for the next iteration.
+        vPrev = vqtbl1q_u8 (d, shuffleMask);
+    }
+
+    unsigned char prev = vgetq_lane_u8 (vPrev, 15);
+    for (size_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
+    {
+        unsigned char d = prev + buf[i] - 128;
+        buf[i] = d;
+        prev   = d;
+    }
+}
+
+#endif
+
+
 void
 reconstruct_scalar (char* buf, size_t outSize)
 {
@@ -212,6 +262,44 @@ interleave_sse2 (const char* source, size_t outSize, char* out)
 
 #endif
 
+#ifdef IMF_HAVE_NEON
+
+void
+interleave_neon (const char* source, size_t outSize, char* out)
+{
+    static const size_t bytesPerChunk = 2 * sizeof (uint8x16_t);
+
+    const size_t vOutSize = outSize / bytesPerChunk;
+
+    const unsigned char* v1 = reinterpret_cast<const unsigned char*> (source);
+    const unsigned char* v2 =
+        reinterpret_cast<const unsigned char*> (source + (outSize + 1) / 2);
+    unsigned char* vOut = reinterpret_cast<unsigned char*> (out);
+
+    for (size_t i = 0; i < vOutSize; ++i)
+    {
+        uint8x16_t a = vld1q_u8 (v1); v1 += sizeof (uint8x16_t);
+        uint8x16_t b = vld1q_u8 (v2); v2 += sizeof (uint8x16_t);
+
+        uint8x16_t lo = vzip1q_u8 (a, b);
+        uint8x16_t hi = vzip2q_u8 (a, b);
+
+        vst1q_u8 (vOut, lo); vOut += sizeof (uint8x16_t);
+        vst1q_u8 (vOut, hi); vOut += sizeof (uint8x16_t);
+    }
+
+    const char* t1   = reinterpret_cast<const char*> (v1);
+    const char* t2   = reinterpret_cast<const char*> (v2);
+    char*       sOut = reinterpret_cast<char*> (vOut);
+
+    for (size_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
+    {
+        *(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++);
+    }
+}
+
+#endif
+
 void
 interleave_scalar (const char* source, size_t outSize, char* out)
 {
@@ -291,6 +379,11 @@ Zip::initializeFuncs ()
         interleave = interleave_sse2;
     }
 #endif
+
+#ifdef IMF_HAVE_NEON
+    reconstruct = reconstruct_neon;
+    interleave  = interleave_neon;
+#endif
 }
 
 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
diff --git a/src/lib/OpenEXRCore/internal_zip.c b/src/lib/OpenEXRCore/internal_zip.c
index 2ea3b36483..e829e6d7ce 100644
--- a/src/lib/OpenEXRCore/internal_zip.c
+++ b/src/lib/OpenEXRCore/internal_zip.c
@@ -24,6 +24,11 @@
 #    define IMF_HAVE_SSE4_1 1
 #    include <smmintrin.h>
 #endif
+#if defined(__ARM_NEON)
+#    define IMF_HAVE_NEON 1
+#    include <arm_neon.h>
+#endif
+
 
 /**************************************/
 
@@ -73,6 +78,54 @@ reconstruct (uint8_t* buf, uint64_t outSize)
         prev = d;
     }
 }
+#elif defined(IMF_HAVE_NEON)
+static void
+reconstruct (uint8_t* buf, uint64_t outSize)
+{
+    static const uint64_t bytesPerChunk = sizeof (uint8x16_t);
+    const uint64_t   vOutSize    = outSize / bytesPerChunk;
+    const uint8x16_t c           = vdupq_n_u8 (-128);
+    const uint8x16_t shuffleMask = vdupq_n_u8 (15);
+    const uint8x16_t zero        = vdupq_n_u8 (0);
+    uint8_t*         vBuf;
+    uint8x16_t       vPrev;
+    uint8_t          prev;
+
+    /*
+     * The first element doesn't have its high bit flipped during compression,
+     * so it must not be flipped here. To make the SIMD loop nice and
+     * uniform, we pre-flip the bit so that the loop will unflip it again.
+     */
+    buf[0] += -128;
+    vBuf  = buf;
+    vPrev = vdupq_n_u8 (0);
+
+    for (uint64_t i = 0; i < vOutSize; ++i)
+    {
+        uint8x16_t d = vaddq_u8 (vld1q_u8 (vBuf), c);
+
+        /* Compute the prefix sum of elements. */
+        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 1));
+        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 2));
+        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 4));
+        d = vaddq_u8 (d, vextq_u8 (zero, d, 16 - 8));
+        d = vaddq_u8 (d, vPrev);
+
+        vst1q_u8 (vBuf, d); vBuf += sizeof (uint8x16_t);
+
+        // Broadcast the high byte in our result to all lanes of the prev
+        // value for the next iteration.
+        vPrev = vqtbl1q_u8 (d, shuffleMask);
+    }
+
+    prev = vgetq_lane_u8 (vPrev, 15);
+    for (uint64_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
+    {
+        uint8_t d = prev + buf[i] - 128;
+        buf[i] = d;
+        prev   = d;
+    }
+}
 #else
 static void
 reconstruct (uint8_t* buf, uint64_t sz)
@@ -121,6 +174,30 @@ interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
         *(sOut++) = (i % 2 == 0) ? *(t1++) : *(t2++);
 }
 
+#elif defined(IMF_HAVE_NEON)
+static void
+interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
+{
+    static const uint64_t bytesPerChunk = 2 * sizeof (uint8x16_t);
+    const uint64_t vOutSize = outSize / bytesPerChunk;
+    const uint8_t* v1       = source;
+    const uint8_t* v2       = source + (outSize + 1) / 2;
+
+    for (uint64_t i = 0; i < vOutSize; ++i)
+    {
+        uint8x16_t a  = vld1q_u8 (v1); v1 += sizeof (uint8x16_t);
+        uint8x16_t b  = vld1q_u8 (v2); v2 += sizeof (uint8x16_t);
+        uint8x16_t lo = vzip1q_u8 (a, b);
+        uint8x16_t hi = vzip2q_u8 (a, b);
+
+        vst1q_u8 (out, lo); out += sizeof (uint8x16_t);
+        vst1q_u8 (out, hi); out += sizeof (uint8x16_t);
+    }
+
+    for (uint64_t i = vOutSize * bytesPerChunk; i < outSize; ++i)
+        *(out++) = (i % 2 == 0) ? *(v1++) : *(v2++);
+}
+
 #else
 static void
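For reference, here is a minimal, portable scalar model of the two transforms
the NEON code above accelerates. This sketch is not part of the patch, and the
function names delta_decode and interleave_halves are made up for the example.
delta_decode mirrors the scalar reconstruct(): every byte after the first is
stored as a delta from the previous decoded byte, offset by 128, which is why
the SIMD versions can decode 16 bytes at a time with a prefix sum.
interleave_halves mirrors the scalar interleave(): the first half of the source
holds the even-indexed output bytes, the second half the odd-indexed ones.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Same recurrence as the scalar tail loops in the patch:
 * buf[i] = buf[i-1] + buf[i] - 128 (mod 256), with buf[0] kept as-is. */
static void
delta_decode (uint8_t* buf, size_t n)
{
    for (size_t i = 1; i < n; ++i)
        buf[i] = (uint8_t) (buf[i - 1] + buf[i] - 128);
}

/* Zip the two planar halves back into even/odd byte positions, matching the
 * v1/v2 split at source + (outSize + 1) / 2 in the patch. */
static void
interleave_halves (uint8_t* out, const uint8_t* src, size_t n)
{
    const uint8_t* lo = src;               /* even output positions */
    const uint8_t* hi = src + (n + 1) / 2; /* odd output positions  */
    for (size_t i = 0; i < n; ++i)
        out[i] = (i % 2 == 0) ? *(lo++) : *(hi++);
}

int
main (void)
{
    /* Deltas of {10, 13, 11, 16}: first byte as-is, the rest offset by 128. */
    uint8_t d[4] = {10, 128 + 3, 128 - 2, 128 + 5};
    delta_decode (d, 4);
    assert (d[0] == 10 && d[1] == 13 && d[2] == 11 && d[3] == 16);

    /* Evens then odds: "ace" + "bd" interleaves back to "abcde". */
    uint8_t planar[5] = {'a', 'c', 'e', 'b', 'd'};
    uint8_t mixed[5];
    interleave_halves (mixed, planar, 5);
    assert (memcmp (mixed, "abcde", 5) == 0);
    return 0;
}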